From a5f2c45528032ed20c33e0f8cd2c163a800a0017 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 17 Sep 2021 11:43:04 -0700 Subject: [PATCH] Huffman ASM --- build/meson/lib/meson.build | 1 + build/single_file_libs/zstd-in.c | 2 + build/single_file_libs/zstddeclib-in.c | 2 + contrib/linux-kernel/Makefile | 1 + contrib/linux-kernel/decompress_sources.h | 5 + contrib/linux-kernel/test/macro-test.sh | 1 + lib/common/compiler.h | 4 +- lib/common/error_private.h | 79 +++ lib/common/huf.h | 7 +- lib/common/zstd_internal.h | 120 ++-- lib/compress/huf_compress.c | 1 + lib/decompress/huf_decompress.c | 755 ++++++++++++++++++---- lib/decompress/huf_decompress_amd64.S | 561 ++++++++++++++++ lib/zstd.h | 2 +- tests/fuzz/huf_decompress.c | 2 +- 15 files changed, 1348 insertions(+), 195 deletions(-) create mode 100644 lib/decompress/huf_decompress_amd64.S diff --git a/build/meson/lib/meson.build b/build/meson/lib/meson.build index 5da53872..1f4f8c25 100644 --- a/build/meson/lib/meson.build +++ b/build/meson/lib/meson.build @@ -37,6 +37,7 @@ libzstd_sources = [join_paths(zstd_rootdir, 'lib/common/entropy_common.c'), join_paths(zstd_rootdir, 'lib/compress/zstd_opt.c'), join_paths(zstd_rootdir, 'lib/compress/zstd_ldm.c'), join_paths(zstd_rootdir, 'lib/decompress/huf_decompress.c'), + join_paths(zstd_rootdir, 'lib/decompress/huf_decompress_amd64.S'), join_paths(zstd_rootdir, 'lib/decompress/zstd_decompress.c'), join_paths(zstd_rootdir, 'lib/decompress/zstd_decompress_block.c'), join_paths(zstd_rootdir, 'lib/decompress/zstd_ddict.c'), diff --git a/build/single_file_libs/zstd-in.c b/build/single_file_libs/zstd-in.c index 1b27953a..f73a2e72 100644 --- a/build/single_file_libs/zstd-in.c +++ b/build/single_file_libs/zstd-in.c @@ -43,6 +43,8 @@ #define ZSTD_MULTITHREAD #endif #define ZSTD_TRACE 0 +/* TODO: Can't amalgamate ASM function */ +#define HUF_DISABLE_ASM 1 /* Include zstd_deps.h first with all the options we need enabled. */ #define ZSTD_DEPS_NEED_MALLOC diff --git a/build/single_file_libs/zstddeclib-in.c b/build/single_file_libs/zstddeclib-in.c index 019d9c26..a5fd958e 100644 --- a/build/single_file_libs/zstddeclib-in.c +++ b/build/single_file_libs/zstddeclib-in.c @@ -39,6 +39,8 @@ #define ZSTD_LEGACY_SUPPORT 0 #define ZSTD_STRIP_ERROR_STRINGS #define ZSTD_TRACE 0 +/* TODO: Can't amalgamate ASM function */ +#define HUF_DISABLE_ASM 1 /* Include zstd_deps.h first with all the options we need enabled. */ #define ZSTD_DEPS_NEED_MALLOC diff --git a/contrib/linux-kernel/Makefile b/contrib/linux-kernel/Makefile index c391df7c..f85179fc 100644 --- a/contrib/linux-kernel/Makefile +++ b/contrib/linux-kernel/Makefile @@ -35,6 +35,7 @@ libzstd: -DXXH_STATIC_LINKING_ONLY \ -DMEM_FORCE_MEMORY_ACCESS=0 \ -D__GNUC__ \ + -D__linux__=1 \ -DSTATIC_BMI2=0 \ -DZSTD_ADDRESS_SANITIZER=0 \ -DZSTD_MEMORY_SANITIZER=0 \ diff --git a/contrib/linux-kernel/decompress_sources.h b/contrib/linux-kernel/decompress_sources.h index f35bef03..aef953c6 100644 --- a/contrib/linux-kernel/decompress_sources.h +++ b/contrib/linux-kernel/decompress_sources.h @@ -21,6 +21,11 @@ #include "common/error_private.c" #include "common/fse_decompress.c" #include "common/zstd_common.c" +/* + * Disable the ASM Huffman implementation because we need to + * include all the sources. + */ +#define HUF_DISABLE_ASM 1 #include "decompress/huf_decompress.c" #include "decompress/zstd_ddict.c" #include "decompress/zstd_decompress.c" diff --git a/contrib/linux-kernel/test/macro-test.sh b/contrib/linux-kernel/test/macro-test.sh index c688ac03..bde6cbb5 100755 --- a/contrib/linux-kernel/test/macro-test.sh +++ b/contrib/linux-kernel/test/macro-test.sh @@ -42,3 +42,4 @@ test_not_present "ZSTD_DLL_IMPORT" test_not_present "__ICCARM__" test_not_present "_MSC_VER" test_not_present "_WIN32" +test_not_present "__linux__" diff --git a/lib/common/compiler.h b/lib/common/compiler.h index 012ff022..9d7d968c 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -108,7 +108,7 @@ #if ((defined(__clang__) && __has_attribute(__target__)) \ || (defined(__GNUC__) \ && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ - && (defined(__x86_64__) || defined(_M_X86)) \ + && (defined(__x86_64__) || defined(_M_X64)) \ && !defined(__BMI2__) # define DYNAMIC_BMI2 1 #else @@ -212,7 +212,7 @@ # elif defined(ZSTD_ARCH_ARM_NEON) # include # endif -#endif +#endif /* compat. with non-clang compilers */ #ifndef __has_builtin diff --git a/lib/common/error_private.h b/lib/common/error_private.h index 6d8b9f77..007d8106 100644 --- a/lib/common/error_private.h +++ b/lib/common/error_private.h @@ -22,6 +22,8 @@ extern "C" { * Dependencies ******************************************/ #include "../zstd_errors.h" /* enum list */ +#include "compiler.h" +#include "debug.h" #include "zstd_deps.h" /* size_t */ @@ -73,6 +75,83 @@ ERR_STATIC const char* ERR_getErrorName(size_t code) return ERR_getErrorString(ERR_getErrorCode(code)); } +/** + * Ignore: this is an internal helper. + * + * This is a helper function to help force C99-correctness during compilation. + * Under strict compilation modes, variadic macro arguments can't be empty. + * However, variadic function arguments can be. Using a function therefore lets + * us statically check that at least one (string) argument was passed, + * independent of the compilation flags. + */ +static INLINE_KEYWORD UNUSED_ATTR +void _force_has_format_string(const char *format, ...) { + (void)format; +} + +/** + * Ignore: this is an internal helper. + * + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +#define _FORCE_HAS_FORMAT_STRING(...) \ + if (0) { \ + _force_has_format_string(__VA_ARGS__); \ + } + +#define ERR_QUOTE(str) #str + +/** + * Return the specified error if the condition evaluates to true. + * + * In debug modes, prints additional information. + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +#define RETURN_ERROR_IF(cond, err, ...) \ + if (cond) { \ + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } + +/** + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +#define RETURN_ERROR(err, ...) \ + do { \ + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } while(0); + +/** + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +#define FORWARD_IF_ERROR(err, ...) \ + do { \ + size_t const err_code = (err); \ + if (ERR_isError(err_code)) { \ + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ + __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return err_code; \ + } \ + } while(0); + #if defined (__cplusplus) } #endif diff --git a/lib/common/huf.h b/lib/common/huf.h index a3ae06e4..85518481 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -116,11 +116,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, /* *** Constants *** */ -#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ #define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */ #define HUF_SYMBOLVALUE_MAX 255 -#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ +#define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ #if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) # error "HUF_TABLELOG_MAX is too large !" #endif @@ -353,6 +353,9 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds #ifndef HUF_FORCE_DECOMPRESS_X2 size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); #endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); +#endif #endif /* HUF_STATIC_LINKING_ONLY */ diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index 3134a5ab..aaf7d466 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -58,81 +58,6 @@ extern "C" { #define MIN(a,b) ((a)<(b) ? (a) : (b)) #define MAX(a,b) ((a)>(b) ? (a) : (b)) -/** - * Ignore: this is an internal helper. - * - * This is a helper function to help force C99-correctness during compilation. - * Under strict compilation modes, variadic macro arguments can't be empty. - * However, variadic function arguments can be. Using a function therefore lets - * us statically check that at least one (string) argument was passed, - * independent of the compilation flags. - */ -static INLINE_KEYWORD UNUSED_ATTR -void _force_has_format_string(const char *format, ...) { - (void)format; -} - -/** - * Ignore: this is an internal helper. - * - * We want to force this function invocation to be syntactically correct, but - * we don't want to force runtime evaluation of its arguments. - */ -#define _FORCE_HAS_FORMAT_STRING(...) \ - if (0) { \ - _force_has_format_string(__VA_ARGS__); \ - } - -/** - * Return the specified error if the condition evaluates to true. - * - * In debug modes, prints additional information. - * In order to do that (particularly, printing the conditional that failed), - * this can't just wrap RETURN_ERROR(). - */ -#define RETURN_ERROR_IF(cond, err, ...) \ - if (cond) { \ - RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } - -/** - * Unconditionally return the specified error. - * - * In debug modes, prints additional information. - */ -#define RETURN_ERROR(err, ...) \ - do { \ - RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } while(0); - -/** - * If the provided expression evaluates to an error code, returns that error code. - * - * In debug modes, prints additional information. - */ -#define FORWARD_IF_ERROR(err, ...) \ - do { \ - size_t const err_code = (err); \ - if (ERR_isError(err_code)) { \ - RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return err_code; \ - } \ - } while(0); - /*-************************************* * Common constants @@ -453,6 +378,51 @@ MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus } } +/** + * Computes CTZ on a U64. + * This will be slow on 32-bit mode, and on unsupported compilers. + * If you need this function to be fast (because it is hot) expand + * support. + */ +MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +{ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) +# if STATIC_BMI2 + return _tzcnt_u64(val); +# else + unsigned long r = 0; + return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0; +# endif +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return __builtin_ctzll((U64)val); +# else + static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, + 4, 25, 14, 28, 9, 34, 20, 56, + 5, 17, 26, 54, 15, 41, 29, 43, + 10, 31, 38, 35, 21, 45, 49, 57, + 63, 6, 12, 18, 24, 27, 33, 55, + 16, 53, 40, 42, 30, 37, 44, 48, + 62, 11, 23, 32, 52, 39, 36, 47, + 61, 22, 51, 46, 60, 50, 59, 58 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + return _BitScanForward( &r, (U32)val ) ? (unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, + 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, + 26, 12, 18, 6, 11, 5, 10, 9 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } +} + /* ZSTD_invalidateRepCodes() : * ensures next compression will not use repcodes from previous block. diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index 915778eb..ba296da8 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -809,6 +809,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id { size_t const nbBits = HUF_getNbBits(elt); size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; + (void)dirtyBits; /* Middle bits are 0. */ assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); /* We didn't overwrite any bits in the bit container. */ diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index f41a02bb..1dfa4b80 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -22,6 +22,13 @@ #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/error_private.h" +#include "../common/zstd_internal.h" + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 /* ************************************************************** * Macros @@ -36,6 +43,40 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif +/* Only use assembly on Linux / MacOS. + * Disable when MSAN is enabled. + */ +#if defined(__linux__) || defined(__linux) || defined(__APPLE__) +# if ZSTD_MEMORY_SANITIZER +# define HUF_ASM_SUPPORTED 0 +# else +# define HUF_ASM_SUPPORTED 1 +#endif +#else +# define HUF_ASM_SUPPORTED 0 +#endif + +/* HUF_DISABLE_ASM: Disables all ASM implementations. */ +#if !defined(HUF_DISABLE_ASM) && \ + HUF_ASM_SUPPORTED && \ + defined(__x86_64__) && (DYNAMIC_BMI2 || defined(__BMI2__)) +# define HUF_ENABLE_ASM_X86_64_BMI2 1 +#else +# define HUF_ENABLE_ASM_X86_64_BMI2 0 +#endif + +#if HUF_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +# define HUF_ASM_X86_64_BMI2_ATTRS TARGET_ATTRIBUTE("bmi2") +#else +# define HUF_ASM_X86_64_BMI2_ATTRS +#endif + +#ifdef __cplusplus +# define HUF_EXTERN_C extern "C" +#else +# define HUF_EXTERN_C +#endif +#define HUF_ASM_DECL HUF_EXTERN_C /* ************************************************************** * Error Management @@ -107,13 +148,146 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) return dtd; } +#if HUF_ENABLE_ASM_X86_64_BMI2 + +static size_t HUF_initDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; + size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); + return value << bitsConsumed; +} +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressAsmArgs; + +/** + * Initializes args for the asm decoding loop. + * @returns 0 on success + * 1 if the fallback implementation should be used. + * Or an error code on failure. + */ +static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const ilimit = (const BYTE*)src + 6 + 8; + + BYTE* const oend = (BYTE*)dst + dstSize; + + /* We're assuming x86-64 BMI2 - assure that this is the case. */ + assert(MEM_isLittleEndian() && !MEM_32bits()); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. + * If table log is not correct at this point, fallback to the old decoder. + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) + return 1; + + /* Read the jump table. */ + { + const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = srcSize - (length1 + length2 + length3 + 6); + args->iend[0] = istart + 6; /* jumpTable */ + args->iend[1] = args->iend[0] + length1; + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + + /* HUF_initDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) + return 1; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ + args->ip[0] = args->iend[1] - sizeof(U64); + args->ip[1] = args->iend[2] - sizeof(U64); + args->ip[2] = args->iend[3] - sizeof(U64); + args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] = (BYTE*)dst; + args->op[1] = args->op[0] + (dstSize+3)/4; + args->op[2] = args->op[1] + (dstSize+3)/4; + args->op[3] = args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) + return 1; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] = HUF_initDStream(args->ip[0]); + args->bits[1] = HUF_initDStream(args->ip[1]); + args->bits[2] = HUF_initDStream(args->ip[2]); + args->bits[3] = HUF_initDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is + * guaranteed to be valid (>= istart). + */ + args->ilimit = ilimit; + + args->oend = oend; + args->dt = dt; + + return 0; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. + */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ + bit->bitContainer = MEM_readLE64(args->ip[stream]); + bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; +} +#endif + #ifndef HUF_FORCE_DECOMPRESS_X2 /*-***************************/ /* single-symbol decoding */ /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ /** * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at @@ -122,14 +296,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { U64 D4; if (MEM_isLittleEndian()) { - D4 = symbol + (nbBits << 8); - } else { D4 = (symbol << 8) + nbBits; + } else { + D4 = symbol + (nbBits << 8); } D4 *= 0x0001000100010001ULL; return D4; } +/** + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale = targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by scale. */ + for (s = 0; s < nbSymbols; ++s) { + huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); + } + /* Update rankVal to reflect the new weights. + * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. + */ + for (s = targetTableLog; s > scale; --s) { + rankVal[s] = rankVal[s - scale]; + } + for (s = scale; s > 0; --s) { + rankVal[s] = 0; + } + } + return targetTableLog; +} + typedef struct { U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; @@ -162,8 +366,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); if (HUF_isError(iSize)) return iSize; + /* Table header */ { DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog + 1; + U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); + tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ dtd.tableType = 0; dtd.tableLog = (BYTE)tableLog; @@ -445,6 +653,77 @@ HUF_decompress4X1_usingDTable_internal_body( } } +#if DYNAMIC_BMI2 +static TARGET_ATTRIBUTE("bmi2") +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} + +#if HUF_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args); + +static HUF_ASM_X86_64_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bit streams one by one. */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + /* Decompress and validate that we've produced exactly the expected length. */ + args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* HUF_ENABLE_ASM_X86_64_BMI2 */ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const void *cSrc, @@ -452,8 +731,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const HUF_DTable *DTable); HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -HUF_DGEN(HUF_decompress4X1_usingDTable_internal) +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if HUF_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if HUF_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} size_t HUF_decompress1X1_usingDTable( @@ -523,106 +822,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, /* *************************/ typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef struct { BYTE symbol; } sortedSymbol_t; typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; +/** + * Constructs a HUF_DEltX2 in a U32. + */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); + if (MEM_isLittleEndian()) { + seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} + +/** + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + HUF_DEltX2 DElt; + U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} + +/** + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) +{ + U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} + +/** + * Fills the DTable rank with all the symbols from [begin, end) that are each + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. + * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level == 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. + */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); + const sortedSymbol_t* ptr; + assert(level >= 1 && level <= 2); + switch (length) { + case 1: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + *DTableRank++ = DElt; + } + break; + case 2: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + DTableRank[0] = DElt; + DTableRank[1] = DElt; + DTableRank += 2; + } + break; + case 4: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank += 4; + } + break; + case 8: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank += 8; + } + break; + default: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + HUF_DEltX2* const DTableRankEnd = DTableRank + length; + for (; DTableRank != DTableRankEnd; DTableRank += 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; + } +} /* HUF_fillDTableX2Level2() : * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ -static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, - const U32* rankValOrigin, const int minWeight, - const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, - U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize) +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, + const U32* rankVal, const int minWeight, const int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const* rankStart, + U32 nbBitsBaseline, U16 baseSeq) { - HUF_DEltX2 DElt; - U32* rankVal = wksp; - - assert(wkspSize >= HUF_TABLELOG_MAX + 1); - (void)wkspSize; - /* get pre-calculated rankVal */ - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); - - /* fill skipped values */ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined weight + * is too large. + */ if (minWeight>1) { - U32 i, skipSize = rankVal[minWeight]; - MEM_writeLE16(&(DElt.sequence), baseSeq); - DElt.nbBits = (BYTE)(consumed); - DElt.length = 1; - for (i = 0; i < skipSize; i++) - DTable[i] = DElt; + U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); + U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); + int const skipSize = rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize == 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <= 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i = 0; i < skipSize; i += 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } } - /* fill DTable */ - { U32 s; for (s=0; s= 1 */ - - rankVal[weight] += length; - } } + /* Fill each of the second level symbols by weight. */ + { + int w; + for (w = minWeight; w < maxWeight1; ++w) { + int const begin = rankStart[w]; + int const end = rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + U32 const totalBits = nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } } - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, const U32 sortedListSize, + const sortedSymbol_t* sortedList, const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, - const U32 nbBitsBaseline, U32* wksp, size_t wkspSize) + const U32 nbBitsBaseline) { - U32* rankVal = wksp; + U32* const rankVal = rankValOrigin[0]; const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ const U32 minBits = nbBitsBaseline - maxWeight; - U32 s; + int w; + int const wEnd = (int)maxWeight + 1; - assert(wkspSize >= HUF_TABLELOG_MAX + 1); - wksp += HUF_TABLELOG_MAX + 1; - wkspSize -= HUF_TABLELOG_MAX + 1; + /* Fill DTable in order of weight. */ + for (w = 1; w < wEnd; ++w) { + int const begin = (int)rankStart[w]; + int const end = (int)rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); - - /* fill DTable */ - for (s=0; s= minBits) { /* enough room for a second symbol */ - U32 sortedRank; + if (targetLog-nbBits >= minBits) { + /* Enough room for a second symbol. */ + int start = rankVal[w]; + U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); int minWeight = nbBits + scaleLog; + int s; if (minWeight < 1) minWeight = 1; - sortedRank = rankStart[minWeight]; - HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, - rankValOrigin[nbBits], minWeight, - sortedList+sortedRank, sortedListSize-sortedRank, - nbBitsBaseline, symbol, wksp, wkspSize); + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. + */ + for (s = begin; s != end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start += length; + } } else { - HUF_DEltX2 DElt; - MEM_writeLE16(&(DElt.sequence), symbol); - DElt.nbBits = (BYTE)(nbBits); - DElt.length = 1; - { U32 const end = start + length; - U32 u; - for (u = start; u < end; u++) DTable[u] = DElt; - } } - rankVal[weight] += length; + /* Only a single symbol. */ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } } } typedef struct { rankValCol_t rankVal[HUF_TABLELOG_MAX]; U32 rankStats[HUF_TABLELOG_MAX + 1]; - U32 rankStart0[HUF_TABLELOG_MAX + 2]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; @@ -632,9 +1051,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { - U32 tableLog, maxW, sizeOfSort, nbSymbols; + return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + U32 tableLog, maxW, nbSymbols; DTableDesc dtd = HUF_getDTableDesc(DTable); - U32 const maxTableLog = dtd.maxTableLog; + U32 maxTableLog = dtd.maxTableLog; size_t iSize; void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; @@ -652,11 +1078,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ - iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), /* bmi2 */ 0); + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); if (HUF_isError(iSize)) return iSize; /* check result */ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; /* find maxWeight */ for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ @@ -669,7 +1096,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, rankStart[w] = curr; } rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ - sizeOfSort = nextRankStart; + rankStart[maxW+1] = nextRankStart; } /* sort symbols by weight */ @@ -678,7 +1105,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, U32 const w = wksp->weightList[s]; U32 const r = rankStart[w]++; wksp->sortedSymbol[r].symbol = (BYTE)s; - wksp->sortedSymbol[r].weight = (BYTE)w; } rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ } @@ -703,10 +1129,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, } } } } HUF_fillDTableX2(dt, maxTableLog, - wksp->sortedSymbol, sizeOfSort, + wksp->sortedSymbol, wksp->rankStart0, wksp->rankVal, maxW, - tableLog+1, - wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32)); + tableLog+1); dtd.tableLog = (BYTE)maxTableLog; dtd.tableType = 1; @@ -719,7 +1144,7 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - ZSTD_memcpy(op, dt+val, 2); + ZSTD_memcpy(op, &dt[val].sequence, 2); BIT_skipBits(DStream, dt[val].nbBits); return dt[val].length; } @@ -728,15 +1153,17 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - ZSTD_memcpy(op, dt+val, 1); - if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); - else { + ZSTD_memcpy(op, &dt[val].sequence, 1); + if (dt[val].length==1) { + BIT_skipBits(DStream, dt[val].nbBits); + } else { if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { BIT_skipBits(DStream, dt[val].nbBits); if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); - } } + } + } return 1; } @@ -759,11 +1186,23 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, /* up to 8 symbols at a time */ if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if (dtLog <= 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } } } @@ -808,7 +1247,6 @@ HUF_decompress1X2_usingDTable_internal_body( /* decoded size */ return dstSize; } - FORCE_INLINE_TEMPLATE size_t HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, @@ -927,8 +1365,97 @@ HUF_decompress4X2_usingDTable_internal_body( } } +#if DYNAMIC_BMI2 +static TARGET_ATTRIBUTE("bmi2") +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +static +size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} + +#if HUF_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args); + +static HUF_ASM_X86_64_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bitStreams one by one */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* HUF_ENABLE_ASM_X86_64_BMI2 */ + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if HUF_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if HUF_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) -HUF_DGEN(HUF_decompress4X2_usingDTable_internal) size_t HUF_decompress1X2_usingDTable( void* dst, size_t dstSize, @@ -1037,25 +1564,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = { /* single, double, quad */ - {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */ - {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */ - {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ - {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ - {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ - {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ - {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ - {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ - {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ - {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ - {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ - {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ - {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ - {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */ - {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */ - {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */ + {{0,0}, {1,1}}, /* Q==0 : impossible */ + {{0,0}, {1,1}}, /* Q==1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ + {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ }; #endif @@ -1082,7 +1609,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) U32 const D256 = (U32)(dstSize >> 8); U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); - DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ return DTime1 < DTime0; } #endif diff --git a/lib/decompress/huf_decompress_amd64.S b/lib/decompress/huf_decompress_amd64.S new file mode 100644 index 00000000..77a2d856 --- /dev/null +++ b/lib/decompress/huf_decompress_amd64.S @@ -0,0 +1,561 @@ +# Calling convention: +# +# %rdi contains the first argument: HUF_DecompressAsmArgs*. +# %rbp is'nt maintained (no frame pointer). +# %rsp contains the stack pointer that grows down. +# No red-zone is assumed, only addresses >= %rsp are used. +# All register contents are preserved. +# +# TODO: Support Windows calling convention. + +#if !defined(HUF_DISABLE_ASM) && defined(__x86_64__) +.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.text + +# Sets up register mappings for clarity. +# op[], bits[], dtable & ip[0] each get their own register. +# ip[1,2,3] & olimit alias var[]. +# %rax is a scratch register. + +#define op0 rsi +#define op1 rbx +#define op2 rcx +#define op3 rdi + +#define ip0 r8 +#define ip1 r9 +#define ip2 r10 +#define ip3 r11 + +#define bits0 rbp +#define bits1 rdx +#define bits2 r12 +#define bits3 r13 +#define dtable r14 +#define olimit r15 + +# var[] aliases ip[1,2,3] & olimit +# ip[1,2,3] are saved every iteration. +# olimit is only used in compute_olimit. +#define var0 r15 +#define var1 r9 +#define var2 r10 +#define var3 r11 + +# 32-bit var registers +#define vard0 r15d +#define vard1 r9d +#define vard2 r10d +#define vard3 r11d + +# Helper macro: args if idx != 4. +#define IF_NOT_4_0(...) __VA_ARGS__ +#define IF_NOT_4_1(...) __VA_ARGS__ +#define IF_NOT_4_2(...) __VA_ARGS__ +#define IF_NOT_4_3(...) __VA_ARGS__ +#define IF_NOT_4_4(...) +#define IF_NOT_4_(idx, ...) IF_NOT_4_##idx(__VA_ARGS__) +#define IF_NOT_4(idx, ...) IF_NOT_4_(idx, __VA_ARGS__) + +# Calls X(N) for each stream 0, 1, 2, 3. +#define FOR_EACH_STREAM(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +# Calls X(N, idx) for each stream 0, 1, 2, 3. +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) + +# Define both _HUF_* & HUF_* symbols because MacOS +# C symbols are prefixed with '_' & Linux symbols aren't. +_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: + # Save all registers - even if they are callee saved for simplicity. + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + # Read HUF_DecompressAsmArgs* args from %rax + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax # argument + push 104(%rax) # ilimit + push 112(%rax) # oend + push %olimit # olimit space + + subq $24, %rsp + +.L_4X1_compute_olimit: + # Computes how many iterations we can do savely + # %r15, %rax may be clobbered + # rbx, rdx must be saved + # op3 & ip0 mustn't be clobbered + movq %rbx, 0(%rsp) + movq %rdx, 8(%rsp) + + movq 32(%rsp), %rax # rax = oend + subq %op3, %rax # rax = oend - op3 + + # r15 = (oend - op3) / 5 + movabsq $-3689348814741910323, %rdx + mulq %rdx + movq %rdx, %r15 + shrq $2, %r15 + + movq %ip0, %rax # rax = ip0 + movq 40(%rsp), %rdx # rdx = ilimit + subq %rdx, %rax # rax = ip0 - ilimit + movq %rax, %rbx # rbx = ip0 - ilimit + + # rdx = (ip0 - ilimit) / 7 + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %rbx + shrq %rbx + addq %rbx, %rdx + shrq $2, %rdx + + # r15 = min(%rdx, %r15) + cmpq %rdx, %r15 + cmova %rdx, %r15 + + # r15 = r15 * 5 + leaq (%r15, %r15, 4), %r15 + + # olimit = op3 + r15 + addq %op3, %olimit + + movq 8(%rsp), %rdx + movq 0(%rsp), %rbx + + # If (op3 + 20 > olimit) + movq %op3, %rax # rax = op3 + addq $20, %rax # rax = op3 + 20 + cmpq %rax, %olimit # op3 + 20 > olimit + jb .L_4X1_exit + + # If (ip1 < ip0) go to exit + cmpq %ip0, %ip1 + jb .L_4X1_exit + + # If (ip2 < ip1) go to exit + cmpq %ip1, %ip2 + jb .L_4X1_exit + + # If (ip3 < ip2) go to exit + cmpq %ip2, %ip3 + jb .L_4X1_exit + +# Reads top 11 bits from bits[n] +# Loads dt[bits[n]] into var[n] +#define GET_NEXT_DELT(n) \ + movq $53, %var##n; \ + shrxq %var##n, %bits##n, %var##n; \ + movzwl (%dtable,%var##n,2),%vard##n + +# var[n] must contain the DTable entry computed with GET_NEXT_DELT +# Moves var[n] to %rax +# bits[n] <<= var[n] & 63 +# op[n][idx] = %rax >> 8 +# %ah is a way to access bits [8, 16) of %rax +#define DECODE_FROM_DELT(n, idx) \ + movq %var##n, %rax; \ + shlxq %var##n, %bits##n, %bits##n; \ + movb %ah, idx(%op##n) + +# Assumes GET_NEXT_DELT has been called. +# Calls DECODE_FROM_DELT then GET_NEXT_DELT if n < 4 +#define DECODE(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + IF_NOT_4(idx, GET_NEXT_DELT(n)) + +# // ctz & nbBytes is stored in bits[n] +# // nbBits is stored in %rax +# ctz = CTZ[bits[n]] +# nbBits = ctz & 7 +# nbBytes = ctz >> 3 +# op[n] += 5 +# ip[n] -= nbBytes +# // Note: x86-64 is little-endian ==> no bswap +# bits[n] = MEM_readST(ip[n]) | 1 +# bits[n] <<= nbBits +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + andq $7, %rax; \ + shrq $3, %bits##n; \ + leaq 5(%op##n), %op##n; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlx %rax, %bits##n, %bits##n; + + # Store clobbered variables on the stack + movq %olimit, 24(%rsp) + movq %ip1, 0(%rsp) + movq %ip2, 8(%rsp) + movq %ip3, 16(%rsp) + + # Call GET_NEXT_DELT for each stream + FOR_EACH_STREAM(GET_NEXT_DELT) + + .p2align 6 + +.L_4X1_loop_body: +# LLVM-MCA-BEGIN decode-4X1 + # Decode 5 symbols in each of the 4 streams (20 total) + # Must have called GET_NEXT_DELT for each stream + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + + # Load ip[1,2,3] from stack (var[] aliases them) + # ip[] is needed for RELOAD_BITS + # Each will be stored back to the stack after RELOAD + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + # Reload each stream & fetch the next table entry + # to prepare for the next iteration + RELOAD_BITS(0) + GET_NEXT_DELT(0) + + RELOAD_BITS(1) + movq %ip1, 0(%rsp) + GET_NEXT_DELT(1) + + RELOAD_BITS(2) + movq %ip2, 8(%rsp) + GET_NEXT_DELT(2) + + RELOAD_BITS(3) + movq %ip3, 16(%rsp) + GET_NEXT_DELT(3) + + # If op3 < olimit: continue the loop + cmp %op3, 24(%rsp) + ja .L_4X1_loop_body + + # Reload ip[1,2,3] from stack + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + # Re-compute olimit + jmp .L_4X1_compute_olimit + +#undef GET_NEXT_DELT +#undef DECODE_FROM_DELT +#undef DECODE +#undef RELOAD_BITS +# LLVM-MCA-END +.L_4X1_exit: + addq $24, %rsp + + # Restore stack (oend & olimit) + pop %rax # olimit + pop %rax # oend + pop %rax # ilimit + pop %rax # arg + + # Save ip / op / bits + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + # Restore registers + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: + # Save all registers - even if they are callee saved for simplicity. + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax # argument + push %rax # olimit + push 104(%rax) # ilimit + + movq 112(%rax), %rax + push %rax # oend3 + + movq %op3, %rax + push %rax # oend2 + + movq %op2, %rax + push %rax # oend1 + + movq %op1, %rax + push %rax # oend0 + + # Scratch space + subq $8, %rsp + +.L_4X2_compute_olimit: + # Computes how many iterations we can do savely + # %r15, %rax may be clobbered + # rdx must be saved + # op[1,2,3,4] & ip0 mustn't be clobbered + movq %rdx, 0(%rsp) + + # We can consume up to 7 input bytes each iteration. + movq %ip0, %rax # rax = ip0 + movq 40(%rsp), %rdx # rdx = ilimit + subq %rdx, %rax # rax = ip0 - ilimit + movq %rax, %r15 # r15 = ip0 - ilimit + + # rdx = rax / 7 + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %r15 + shrq %r15 + addq %r15, %rdx + shrq $2, %rdx + + # r15 = (ip0 - ilimit) / 7 + movq %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 8(%rsp), %rax # rax = oend0 + subq %op0, %rax # rax = oend0 - op0 + mulq %rdx + shrq $3, %rdx # rdx = rax / 10 + + # r15 = min(%rdx, %r15) + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 16(%rsp), %rax # rax = oend1 + subq %op1, %rax # rax = oend1 - op1 + mulq %rdx + shrq $3, %rdx # rdx = rax / 10 + + # r15 = min(%rdx, %r15) + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 24(%rsp), %rax # rax = oend2 + subq %op2, %rax # rax = oend2 - op2 + mulq %rdx + shrq $3, %rdx # rdx = rax / 10 + + # r15 = min(%rdx, %r15) + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 32(%rsp), %rax # rax = oend3 + subq %op3, %rax # rax = oend3 - op3 + mulq %rdx + shrq $3, %rdx # rdx = rax / 10 + + # r15 = min(%rdx, %r15) + cmpq %rdx, %r15 + cmova %rdx, %r15 + + # olimit = op3 + 5 * r15 + movq %r15, %rax + leaq (%op3, %rax, 4), %olimit + addq %rax, %olimit + + movq 0(%rsp), %rdx + + # If (op3 + 10 > olimit) + movq %op3, %rax # rax = op3 + addq $10, %rax # rax = op3 + 10 + cmpq %rax, %olimit # op3 + 10 > olimit + jb .L_4X2_exit + + # If (ip1 < ip0) go to exit + cmpq %ip0, %ip1 + jb .L_4X2_exit + + # If (ip2 < ip1) go to exit + cmpq %ip1, %ip2 + jb .L_4X2_exit + + # If (ip3 < ip2) go to exit + cmpq %ip2, %ip3 + jb .L_4X2_exit + +#define DECODE(n, idx) \ + movq %bits##n, %rax; \ + shrq $53, %rax; \ + movzwl 0(%dtable,%rax,4),%r8d; \ + movzbl 2(%dtable,%rax,4),%r15d; \ + movzbl 3(%dtable,%rax,4),%eax; \ + movw %r8w, (%op##n); \ + shlxq %r15, %bits##n, %bits##n; \ + addq %rax, %op##n + +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + shrq $3, %bits##n; \ + andq $7, %rax; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlxq %rax, %bits##n, %bits##n; + + + movq %olimit, 48(%rsp) + + .p2align 6 + +.L_4X2_loop_body: +# LLVM-MCA-BEGIN decode-4X2 + + # We clobber r8, so store it on the stack + movq %r8, 0(%rsp) + + # Decode 5 symbols from each of the 4 streams (20 symbols total). + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + + # Reload r8 + movq 0(%rsp), %r8 + + FOR_EACH_STREAM(RELOAD_BITS) + + cmp %op3, 48(%rsp) + ja .L_4X2_loop_body + jmp .L_4X2_compute_olimit + +#undef DECODE +#undef RELOAD_BITS +# LLVM-MCA-END +.L_4X2_exit: + addq $8, %rsp + # Restore stack (oend & olimit) + pop %rax # oend0 + pop %rax # oend1 + pop %rax # oend2 + pop %rax # oend3 + pop %rax # ilimit + pop %rax # olimit + pop %rax # arg + + # Save ip / op / bits + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + # Restore registers + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret +#endif diff --git a/lib/zstd.h b/lib/zstd.h index ebe2e04b..90b59475 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -72,7 +72,7 @@ extern "C" { /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 #define ZSTD_VERSION_MINOR 5 -#define ZSTD_VERSION_RELEASE 0 +#define ZSTD_VERSION_RELEASE 1 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : diff --git a/tests/fuzz/huf_decompress.c b/tests/fuzz/huf_decompress.c index 7ede9ec5..fea09fc9 100644 --- a/tests/fuzz/huf_decompress.c +++ b/tests/fuzz/huf_decompress.c @@ -46,7 +46,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size) if (ZSTD_isError(err)) goto _out; } else { - size_t const err = HUF_readDTableX2_wksp(dt, src, size, wksp, wkspSize); + size_t const err = HUF_readDTableX2_wksp_bmi2(dt, src, size, wksp, wkspSize, bmi2); if (ZSTD_isError(err)) goto _out; }