Huffman ASM
This commit is contained in:
parent
51b123d5f7
commit
a5f2c45528
@ -37,6 +37,7 @@ libzstd_sources = [join_paths(zstd_rootdir, 'lib/common/entropy_common.c'),
|
||||
join_paths(zstd_rootdir, 'lib/compress/zstd_opt.c'),
|
||||
join_paths(zstd_rootdir, 'lib/compress/zstd_ldm.c'),
|
||||
join_paths(zstd_rootdir, 'lib/decompress/huf_decompress.c'),
|
||||
join_paths(zstd_rootdir, 'lib/decompress/huf_decompress_amd64.S'),
|
||||
join_paths(zstd_rootdir, 'lib/decompress/zstd_decompress.c'),
|
||||
join_paths(zstd_rootdir, 'lib/decompress/zstd_decompress_block.c'),
|
||||
join_paths(zstd_rootdir, 'lib/decompress/zstd_ddict.c'),
|
||||
|
@ -43,6 +43,8 @@
|
||||
#define ZSTD_MULTITHREAD
|
||||
#endif
|
||||
#define ZSTD_TRACE 0
|
||||
/* TODO: Can't amalgamate ASM function */
|
||||
#define HUF_DISABLE_ASM 1
|
||||
|
||||
/* Include zstd_deps.h first with all the options we need enabled. */
|
||||
#define ZSTD_DEPS_NEED_MALLOC
|
||||
|
@ -39,6 +39,8 @@
|
||||
#define ZSTD_LEGACY_SUPPORT 0
|
||||
#define ZSTD_STRIP_ERROR_STRINGS
|
||||
#define ZSTD_TRACE 0
|
||||
/* TODO: Can't amalgamate ASM function */
|
||||
#define HUF_DISABLE_ASM 1
|
||||
|
||||
/* Include zstd_deps.h first with all the options we need enabled. */
|
||||
#define ZSTD_DEPS_NEED_MALLOC
|
||||
|
@ -35,6 +35,7 @@ libzstd:
|
||||
-DXXH_STATIC_LINKING_ONLY \
|
||||
-DMEM_FORCE_MEMORY_ACCESS=0 \
|
||||
-D__GNUC__ \
|
||||
-D__linux__=1 \
|
||||
-DSTATIC_BMI2=0 \
|
||||
-DZSTD_ADDRESS_SANITIZER=0 \
|
||||
-DZSTD_MEMORY_SANITIZER=0 \
|
||||
|
@ -21,6 +21,11 @@
|
||||
#include "common/error_private.c"
|
||||
#include "common/fse_decompress.c"
|
||||
#include "common/zstd_common.c"
|
||||
/*
|
||||
* Disable the ASM Huffman implementation because we need to
|
||||
* include all the sources.
|
||||
*/
|
||||
#define HUF_DISABLE_ASM 1
|
||||
#include "decompress/huf_decompress.c"
|
||||
#include "decompress/zstd_ddict.c"
|
||||
#include "decompress/zstd_decompress.c"
|
||||
|
@ -42,3 +42,4 @@ test_not_present "ZSTD_DLL_IMPORT"
|
||||
test_not_present "__ICCARM__"
|
||||
test_not_present "_MSC_VER"
|
||||
test_not_present "_WIN32"
|
||||
test_not_present "__linux__"
|
||||
|
@ -108,7 +108,7 @@
|
||||
#if ((defined(__clang__) && __has_attribute(__target__)) \
|
||||
|| (defined(__GNUC__) \
|
||||
&& (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
|
||||
&& (defined(__x86_64__) || defined(_M_X86)) \
|
||||
&& (defined(__x86_64__) || defined(_M_X64)) \
|
||||
&& !defined(__BMI2__)
|
||||
# define DYNAMIC_BMI2 1
|
||||
#else
|
||||
|
@ -22,6 +22,8 @@ extern "C" {
|
||||
* Dependencies
|
||||
******************************************/
|
||||
#include "../zstd_errors.h" /* enum list */
|
||||
#include "compiler.h"
|
||||
#include "debug.h"
|
||||
#include "zstd_deps.h" /* size_t */
|
||||
|
||||
|
||||
@ -73,6 +75,83 @@ ERR_STATIC const char* ERR_getErrorName(size_t code)
|
||||
return ERR_getErrorString(ERR_getErrorCode(code));
|
||||
}
|
||||
|
||||
/**
|
||||
* Ignore: this is an internal helper.
|
||||
*
|
||||
* This is a helper function to help force C99-correctness during compilation.
|
||||
* Under strict compilation modes, variadic macro arguments can't be empty.
|
||||
* However, variadic function arguments can be. Using a function therefore lets
|
||||
* us statically check that at least one (string) argument was passed,
|
||||
* independent of the compilation flags.
|
||||
*/
|
||||
static INLINE_KEYWORD UNUSED_ATTR
|
||||
void _force_has_format_string(const char *format, ...) {
|
||||
(void)format;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ignore: this is an internal helper.
|
||||
*
|
||||
* We want to force this function invocation to be syntactically correct, but
|
||||
* we don't want to force runtime evaluation of its arguments.
|
||||
*/
|
||||
#define _FORCE_HAS_FORMAT_STRING(...) \
|
||||
if (0) { \
|
||||
_force_has_format_string(__VA_ARGS__); \
|
||||
}
|
||||
|
||||
#define ERR_QUOTE(str) #str
|
||||
|
||||
/**
|
||||
* Return the specified error if the condition evaluates to true.
|
||||
*
|
||||
* In debug modes, prints additional information.
|
||||
* In order to do that (particularly, printing the conditional that failed),
|
||||
* this can't just wrap RETURN_ERROR().
|
||||
*/
|
||||
#define RETURN_ERROR_IF(cond, err, ...) \
|
||||
if (cond) { \
|
||||
RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
|
||||
__FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
|
||||
_FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
|
||||
RAWLOG(3, ": " __VA_ARGS__); \
|
||||
RAWLOG(3, "\n"); \
|
||||
return ERROR(err); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Unconditionally return the specified error.
|
||||
*
|
||||
* In debug modes, prints additional information.
|
||||
*/
|
||||
#define RETURN_ERROR(err, ...) \
|
||||
do { \
|
||||
RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
|
||||
__FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \
|
||||
_FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
|
||||
RAWLOG(3, ": " __VA_ARGS__); \
|
||||
RAWLOG(3, "\n"); \
|
||||
return ERROR(err); \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* If the provided expression evaluates to an error code, returns that error code.
|
||||
*
|
||||
* In debug modes, prints additional information.
|
||||
*/
|
||||
#define FORWARD_IF_ERROR(err, ...) \
|
||||
do { \
|
||||
size_t const err_code = (err); \
|
||||
if (ERR_isError(err_code)) { \
|
||||
RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
|
||||
__FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
|
||||
_FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
|
||||
RAWLOG(3, ": " __VA_ARGS__); \
|
||||
RAWLOG(3, "\n"); \
|
||||
return err_code; \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
#if defined (__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -116,11 +116,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
|
||||
|
||||
|
||||
/* *** Constants *** */
|
||||
#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
|
||||
#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
|
||||
#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */
|
||||
#define HUF_SYMBOLVALUE_MAX 255
|
||||
|
||||
#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
|
||||
#define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
|
||||
#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
|
||||
# error "HUF_TABLELOG_MAX is too large !"
|
||||
#endif
|
||||
@ -353,6 +353,9 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
|
||||
#ifndef HUF_FORCE_DECOMPRESS_X2
|
||||
size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
|
||||
#endif
|
||||
#ifndef HUF_FORCE_DECOMPRESS_X1
|
||||
size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
|
||||
#endif
|
||||
|
||||
#endif /* HUF_STATIC_LINKING_ONLY */
|
||||
|
||||
|
@ -58,81 +58,6 @@ extern "C" {
|
||||
#define MIN(a,b) ((a)<(b) ? (a) : (b))
|
||||
#define MAX(a,b) ((a)>(b) ? (a) : (b))
|
||||
|
||||
/**
|
||||
* Ignore: this is an internal helper.
|
||||
*
|
||||
* This is a helper function to help force C99-correctness during compilation.
|
||||
* Under strict compilation modes, variadic macro arguments can't be empty.
|
||||
* However, variadic function arguments can be. Using a function therefore lets
|
||||
* us statically check that at least one (string) argument was passed,
|
||||
* independent of the compilation flags.
|
||||
*/
|
||||
static INLINE_KEYWORD UNUSED_ATTR
|
||||
void _force_has_format_string(const char *format, ...) {
|
||||
(void)format;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ignore: this is an internal helper.
|
||||
*
|
||||
* We want to force this function invocation to be syntactically correct, but
|
||||
* we don't want to force runtime evaluation of its arguments.
|
||||
*/
|
||||
#define _FORCE_HAS_FORMAT_STRING(...) \
|
||||
if (0) { \
|
||||
_force_has_format_string(__VA_ARGS__); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the specified error if the condition evaluates to true.
|
||||
*
|
||||
* In debug modes, prints additional information.
|
||||
* In order to do that (particularly, printing the conditional that failed),
|
||||
* this can't just wrap RETURN_ERROR().
|
||||
*/
|
||||
#define RETURN_ERROR_IF(cond, err, ...) \
|
||||
if (cond) { \
|
||||
RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
|
||||
__FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \
|
||||
_FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
|
||||
RAWLOG(3, ": " __VA_ARGS__); \
|
||||
RAWLOG(3, "\n"); \
|
||||
return ERROR(err); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Unconditionally return the specified error.
|
||||
*
|
||||
* In debug modes, prints additional information.
|
||||
*/
|
||||
#define RETURN_ERROR(err, ...) \
|
||||
do { \
|
||||
RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
|
||||
__FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \
|
||||
_FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
|
||||
RAWLOG(3, ": " __VA_ARGS__); \
|
||||
RAWLOG(3, "\n"); \
|
||||
return ERROR(err); \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* If the provided expression evaluates to an error code, returns that error code.
|
||||
*
|
||||
* In debug modes, prints additional information.
|
||||
*/
|
||||
#define FORWARD_IF_ERROR(err, ...) \
|
||||
do { \
|
||||
size_t const err_code = (err); \
|
||||
if (ERR_isError(err_code)) { \
|
||||
RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
|
||||
__FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \
|
||||
_FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
|
||||
RAWLOG(3, ": " __VA_ARGS__); \
|
||||
RAWLOG(3, "\n"); \
|
||||
return err_code; \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
|
||||
/*-*************************************
|
||||
* Common constants
|
||||
@ -453,6 +378,51 @@ MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes CTZ on a U64.
|
||||
* This will be slow on 32-bit mode, and on unsupported compilers.
|
||||
* If you need this function to be fast (because it is hot) expand
|
||||
* support.
|
||||
*/
|
||||
MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val)
|
||||
{
|
||||
if (MEM_64bits()) {
|
||||
# if defined(_MSC_VER) && defined(_WIN64)
|
||||
# if STATIC_BMI2
|
||||
return _tzcnt_u64(val);
|
||||
# else
|
||||
unsigned long r = 0;
|
||||
return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
|
||||
# endif
|
||||
# elif defined(__GNUC__) && (__GNUC__ >= 4)
|
||||
return __builtin_ctzll((U64)val);
|
||||
# else
|
||||
static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19,
|
||||
4, 25, 14, 28, 9, 34, 20, 56,
|
||||
5, 17, 26, 54, 15, 41, 29, 43,
|
||||
10, 31, 38, 35, 21, 45, 49, 57,
|
||||
63, 6, 12, 18, 24, 27, 33, 55,
|
||||
16, 53, 40, 42, 30, 37, 44, 48,
|
||||
62, 11, 23, 32, 52, 39, 36, 47,
|
||||
61, 22, 51, 46, 60, 50, 59, 58 };
|
||||
return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
|
||||
# endif
|
||||
} else { /* 32 bits */
|
||||
# if defined(_MSC_VER)
|
||||
unsigned long r=0;
|
||||
return _BitScanForward( &r, (U32)val ) ? (unsigned)(r >> 3) : 0;
|
||||
# elif defined(__GNUC__) && (__GNUC__ >= 3)
|
||||
return (__builtin_ctz((U32)val) >> 3);
|
||||
# else
|
||||
static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3,
|
||||
30, 22, 20, 15, 25, 17, 4, 8,
|
||||
31, 27, 13, 23, 21, 19, 16, 7,
|
||||
26, 12, 18, 6, 11, 5, 10, 9 };
|
||||
return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
|
||||
# endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* ZSTD_invalidateRepCodes() :
|
||||
* ensures next compression will not use repcodes from previous block.
|
||||
|
@ -809,6 +809,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
|
||||
{
|
||||
size_t const nbBits = HUF_getNbBits(elt);
|
||||
size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
|
||||
(void)dirtyBits;
|
||||
/* Middle bits are 0. */
|
||||
assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
|
||||
/* We didn't overwrite any bits in the bit container. */
|
||||
|
@ -22,6 +22,13 @@
|
||||
#define HUF_STATIC_LINKING_ONLY
|
||||
#include "../common/huf.h"
|
||||
#include "../common/error_private.h"
|
||||
#include "../common/zstd_internal.h"
|
||||
|
||||
/* **************************************************************
|
||||
* Constants
|
||||
****************************************************************/
|
||||
|
||||
#define HUF_DECODER_FAST_TABLELOG 11
|
||||
|
||||
/* **************************************************************
|
||||
* Macros
|
||||
@ -36,6 +43,40 @@
|
||||
#error "Cannot force the use of the X1 and X2 decoders at the same time!"
|
||||
#endif
|
||||
|
||||
/* Only use assembly on Linux / MacOS.
|
||||
* Disable when MSAN is enabled.
|
||||
*/
|
||||
#if defined(__linux__) || defined(__linux) || defined(__APPLE__)
|
||||
# if ZSTD_MEMORY_SANITIZER
|
||||
# define HUF_ASM_SUPPORTED 0
|
||||
# else
|
||||
# define HUF_ASM_SUPPORTED 1
|
||||
#endif
|
||||
#else
|
||||
# define HUF_ASM_SUPPORTED 0
|
||||
#endif
|
||||
|
||||
/* HUF_DISABLE_ASM: Disables all ASM implementations. */
|
||||
#if !defined(HUF_DISABLE_ASM) && \
|
||||
HUF_ASM_SUPPORTED && \
|
||||
defined(__x86_64__) && (DYNAMIC_BMI2 || defined(__BMI2__))
|
||||
# define HUF_ENABLE_ASM_X86_64_BMI2 1
|
||||
#else
|
||||
# define HUF_ENABLE_ASM_X86_64_BMI2 0
|
||||
#endif
|
||||
|
||||
#if HUF_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
|
||||
# define HUF_ASM_X86_64_BMI2_ATTRS TARGET_ATTRIBUTE("bmi2")
|
||||
#else
|
||||
# define HUF_ASM_X86_64_BMI2_ATTRS
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
# define HUF_EXTERN_C extern "C"
|
||||
#else
|
||||
# define HUF_EXTERN_C
|
||||
#endif
|
||||
#define HUF_ASM_DECL HUF_EXTERN_C
|
||||
|
||||
/* **************************************************************
|
||||
* Error Management
|
||||
@ -107,13 +148,146 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
|
||||
return dtd;
|
||||
}
|
||||
|
||||
#if HUF_ENABLE_ASM_X86_64_BMI2
|
||||
|
||||
static size_t HUF_initDStream(BYTE const* ip) {
|
||||
BYTE const lastByte = ip[7];
|
||||
size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
|
||||
size_t const value = MEM_readLEST(ip) | 1;
|
||||
assert(bitsConsumed <= 8);
|
||||
return value << bitsConsumed;
|
||||
}
|
||||
typedef struct {
|
||||
BYTE const* ip[4];
|
||||
BYTE* op[4];
|
||||
U64 bits[4];
|
||||
void const* dt;
|
||||
BYTE const* ilimit;
|
||||
BYTE* oend;
|
||||
BYTE const* iend[4];
|
||||
} HUF_DecompressAsmArgs;
|
||||
|
||||
/**
|
||||
* Initializes args for the asm decoding loop.
|
||||
* @returns 0 on success
|
||||
* 1 if the fallback implementation should be used.
|
||||
* Or an error code on failure.
|
||||
*/
|
||||
static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
|
||||
{
|
||||
void const* dt = DTable + 1;
|
||||
U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
|
||||
|
||||
const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
|
||||
|
||||
BYTE* const oend = (BYTE*)dst + dstSize;
|
||||
|
||||
/* We're assuming x86-64 BMI2 - assure that this is the case. */
|
||||
assert(MEM_isLittleEndian() && !MEM_32bits());
|
||||
|
||||
/* strict minimum : jump table + 1 byte per stream */
|
||||
if (srcSize < 10)
|
||||
return ERROR(corruption_detected);
|
||||
|
||||
/* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
|
||||
* If table log is not correct at this point, fallback to the old decoder.
|
||||
* On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
|
||||
*/
|
||||
if (dtLog != HUF_DECODER_FAST_TABLELOG)
|
||||
return 1;
|
||||
|
||||
/* Read the jump table. */
|
||||
{
|
||||
const BYTE* const istart = (const BYTE*)src;
|
||||
size_t const length1 = MEM_readLE16(istart);
|
||||
size_t const length2 = MEM_readLE16(istart+2);
|
||||
size_t const length3 = MEM_readLE16(istart+4);
|
||||
size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
|
||||
args->iend[0] = istart + 6; /* jumpTable */
|
||||
args->iend[1] = args->iend[0] + length1;
|
||||
args->iend[2] = args->iend[1] + length2;
|
||||
args->iend[3] = args->iend[2] + length3;
|
||||
|
||||
/* HUF_initDStream() requires this, and this small of an input
|
||||
* won't benefit from the ASM loop anyways.
|
||||
* length1 must be >= 16 so that ip[0] >= ilimit before the loop
|
||||
* starts.
|
||||
*/
|
||||
if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
|
||||
return 1;
|
||||
if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
|
||||
}
|
||||
/* ip[] contains the position that is currently loaded into bits[]. */
|
||||
args->ip[0] = args->iend[1] - sizeof(U64);
|
||||
args->ip[1] = args->iend[2] - sizeof(U64);
|
||||
args->ip[2] = args->iend[3] - sizeof(U64);
|
||||
args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
|
||||
|
||||
/* op[] contains the output pointers. */
|
||||
args->op[0] = (BYTE*)dst;
|
||||
args->op[1] = args->op[0] + (dstSize+3)/4;
|
||||
args->op[2] = args->op[1] + (dstSize+3)/4;
|
||||
args->op[3] = args->op[2] + (dstSize+3)/4;
|
||||
|
||||
/* No point to call the ASM loop for tiny outputs. */
|
||||
if (args->op[3] >= oend)
|
||||
return 1;
|
||||
|
||||
/* bits[] is the bit container.
|
||||
* It is read from the MSB down to the LSB.
|
||||
* It is shifted left as it is read, and zeros are
|
||||
* shifted in. After the lowest valid bit a 1 is
|
||||
* set, so that CountTrailingZeros(bits[]) can be used
|
||||
* to count how many bits we've consumed.
|
||||
*/
|
||||
args->bits[0] = HUF_initDStream(args->ip[0]);
|
||||
args->bits[1] = HUF_initDStream(args->ip[1]);
|
||||
args->bits[2] = HUF_initDStream(args->ip[2]);
|
||||
args->bits[3] = HUF_initDStream(args->ip[3]);
|
||||
|
||||
/* If ip[] >= ilimit, it is guaranteed to be safe to
|
||||
* reload bits[]. It may be beyond its section, but is
|
||||
* guaranteed to be valid (>= istart).
|
||||
*/
|
||||
args->ilimit = ilimit;
|
||||
|
||||
args->oend = oend;
|
||||
args->dt = dt;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
|
||||
{
|
||||
/* Validate that we haven't overwritten. */
|
||||
if (args->op[stream] > segmentEnd)
|
||||
return ERROR(corruption_detected);
|
||||
/* Validate that we haven't read beyond iend[].
|
||||
* Note that ip[] may be < iend[] because the MSB is
|
||||
* the next bit to read, and we may have consumed 100%
|
||||
* of the stream, so down to iend[i] - 8 is valid.
|
||||
*/
|
||||
if (args->ip[stream] < args->iend[stream] - 8)
|
||||
return ERROR(corruption_detected);
|
||||
|
||||
/* Construct the BIT_DStream_t. */
|
||||
bit->bitContainer = MEM_readLE64(args->ip[stream]);
|
||||
bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
|
||||
bit->start = (const char*)args->iend[0];
|
||||
bit->limitPtr = bit->start + sizeof(size_t);
|
||||
bit->ptr = (const char*)args->ip[stream];
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HUF_FORCE_DECOMPRESS_X2
|
||||
|
||||
/*-***************************/
|
||||
/* single-symbol decoding */
|
||||
/*-***************************/
|
||||
typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
|
||||
typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
|
||||
|
||||
/**
|
||||
* Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
|
||||
@ -122,14 +296,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi
|
||||
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
|
||||
U64 D4;
|
||||
if (MEM_isLittleEndian()) {
|
||||
D4 = symbol + (nbBits << 8);
|
||||
} else {
|
||||
D4 = (symbol << 8) + nbBits;
|
||||
} else {
|
||||
D4 = symbol + (nbBits << 8);
|
||||
}
|
||||
D4 *= 0x0001000100010001ULL;
|
||||
return D4;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increase the tableLog to targetTableLog and rescales the stats.
|
||||
* If tableLog > targetTableLog this is a no-op.
|
||||
* @returns New tableLog
|
||||
*/
|
||||
static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
|
||||
{
|
||||
if (tableLog > targetTableLog)
|
||||
return tableLog;
|
||||
if (tableLog < targetTableLog) {
|
||||
U32 const scale = targetTableLog - tableLog;
|
||||
U32 s;
|
||||
/* Increase the weight for all non-zero probability symbols by scale. */
|
||||
for (s = 0; s < nbSymbols; ++s) {
|
||||
huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
|
||||
}
|
||||
/* Update rankVal to reflect the new weights.
|
||||
* All weights except 0 get moved to weight + scale.
|
||||
* Weights [1, scale] are empty.
|
||||
*/
|
||||
for (s = targetTableLog; s > scale; --s) {
|
||||
rankVal[s] = rankVal[s - scale];
|
||||
}
|
||||
for (s = scale; s > 0; --s) {
|
||||
rankVal[s] = 0;
|
||||
}
|
||||
}
|
||||
return targetTableLog;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
|
||||
U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
|
||||
@ -162,8 +366,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
|
||||
iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
|
||||
if (HUF_isError(iSize)) return iSize;
|
||||
|
||||
|
||||
/* Table header */
|
||||
{ DTableDesc dtd = HUF_getDTableDesc(DTable);
|
||||
U32 const maxTableLog = dtd.maxTableLog + 1;
|
||||
U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
|
||||
tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
|
||||
if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
|
||||
dtd.tableType = 0;
|
||||
dtd.tableLog = (BYTE)tableLog;
|
||||
@ -445,6 +653,77 @@ HUF_decompress4X1_usingDTable_internal_body(
|
||||
}
|
||||
}
|
||||
|
||||
#if DYNAMIC_BMI2
|
||||
static TARGET_ATTRIBUTE("bmi2")
|
||||
size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
|
||||
size_t cSrcSize, HUF_DTable const* DTable) {
|
||||
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
}
|
||||
#endif
|
||||
|
||||
static
|
||||
size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
|
||||
size_t cSrcSize, HUF_DTable const* DTable) {
|
||||
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
}
|
||||
|
||||
#if HUF_ENABLE_ASM_X86_64_BMI2
|
||||
|
||||
HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args);
|
||||
|
||||
static HUF_ASM_X86_64_BMI2_ATTRS
|
||||
size_t
|
||||
HUF_decompress4X1_usingDTable_internal_bmi2_asm(
|
||||
void* dst, size_t dstSize,
|
||||
const void* cSrc, size_t cSrcSize,
|
||||
const HUF_DTable* DTable)
|
||||
{
|
||||
void const* dt = DTable + 1;
|
||||
const BYTE* const iend = (const BYTE*)cSrc + 6;
|
||||
BYTE* const oend = (BYTE*)dst + dstSize;
|
||||
HUF_DecompressAsmArgs args;
|
||||
{
|
||||
size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
FORWARD_IF_ERROR(ret, "Failed to init asm args");
|
||||
if (ret != 0)
|
||||
return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
}
|
||||
|
||||
assert(args.ip[0] >= args.ilimit);
|
||||
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
|
||||
|
||||
/* Our loop guarantees that ip[] >= ilimit and that we haven't
|
||||
* overwritten any op[].
|
||||
*/
|
||||
assert(args.ip[0] >= iend);
|
||||
assert(args.ip[1] >= iend);
|
||||
assert(args.ip[2] >= iend);
|
||||
assert(args.ip[3] >= iend);
|
||||
assert(args.op[3] <= oend);
|
||||
(void)iend;
|
||||
|
||||
/* finish bit streams one by one. */
|
||||
{
|
||||
size_t const segmentSize = (dstSize+3) / 4;
|
||||
BYTE* segmentEnd = (BYTE*)dst;
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BIT_DStream_t bit;
|
||||
if (segmentSize <= (size_t)(oend - segmentEnd))
|
||||
segmentEnd += segmentSize;
|
||||
else
|
||||
segmentEnd = oend;
|
||||
FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
|
||||
/* Decompress and validate that we've produced exactly the expected length. */
|
||||
args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
|
||||
if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
|
||||
}
|
||||
}
|
||||
|
||||
/* decoded size */
|
||||
return dstSize;
|
||||
}
|
||||
#endif /* HUF_ENABLE_ASM_X86_64_BMI2 */
|
||||
|
||||
typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
|
||||
const void *cSrc,
|
||||
@ -452,8 +731,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
|
||||
const HUF_DTable *DTable);
|
||||
|
||||
HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
|
||||
HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
|
||||
|
||||
static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
|
||||
size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
|
||||
{
|
||||
#if DYNAMIC_BMI2
|
||||
if (bmi2) {
|
||||
# if HUF_ENABLE_ASM_X86_64_BMI2
|
||||
return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
# else
|
||||
return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
# endif
|
||||
}
|
||||
#else
|
||||
(void)bmi2;
|
||||
#endif
|
||||
|
||||
#if HUF_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
|
||||
return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
#else
|
||||
return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
size_t HUF_decompress1X1_usingDTable(
|
||||
@ -523,106 +822,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
|
||||
/* *************************/
|
||||
|
||||
typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
|
||||
typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
|
||||
typedef struct { BYTE symbol; } sortedSymbol_t;
|
||||
typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
|
||||
typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
|
||||
|
||||
/**
|
||||
* Constructs a HUF_DEltX2 in a U32.
|
||||
*/
|
||||
static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
|
||||
{
|
||||
U32 seq;
|
||||
DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
|
||||
DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
|
||||
DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
|
||||
DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
|
||||
if (MEM_isLittleEndian()) {
|
||||
seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
|
||||
return seq + (nbBits << 16) + ((U32)level << 24);
|
||||
} else {
|
||||
seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
|
||||
return (seq << 16) + (nbBits << 8) + (U32)level;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a HUF_DEltX2.
|
||||
*/
|
||||
static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
|
||||
{
|
||||
HUF_DEltX2 DElt;
|
||||
U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
|
||||
DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
|
||||
ZSTD_memcpy(&DElt, &val, sizeof(val));
|
||||
return DElt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs 2 HUF_DEltX2s and packs them into a U64.
|
||||
*/
|
||||
static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
|
||||
{
|
||||
U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
|
||||
return (U64)DElt + ((U64)DElt << 32);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the DTable rank with all the symbols from [begin, end) that are each
|
||||
* nbBits long.
|
||||
*
|
||||
* @param DTableRank The start of the rank in the DTable.
|
||||
* @param begin The first symbol to fill (inclusive).
|
||||
* @param end The last symbol to fill (exclusive).
|
||||
* @param nbBits Each symbol is nbBits long.
|
||||
* @param tableLog The table log.
|
||||
* @param baseSeq If level == 1 { 0 } else { the first level symbol }
|
||||
* @param level The level in the table. Must be 1 or 2.
|
||||
*/
|
||||
static void HUF_fillDTableX2ForWeight(
|
||||
HUF_DEltX2* DTableRank,
|
||||
sortedSymbol_t const* begin, sortedSymbol_t const* end,
|
||||
U32 nbBits, U32 tableLog,
|
||||
U16 baseSeq, int const level)
|
||||
{
|
||||
U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
|
||||
const sortedSymbol_t* ptr;
|
||||
assert(level >= 1 && level <= 2);
|
||||
switch (length) {
|
||||
case 1:
|
||||
for (ptr = begin; ptr != end; ++ptr) {
|
||||
HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
|
||||
*DTableRank++ = DElt;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (ptr = begin; ptr != end; ++ptr) {
|
||||
HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
|
||||
DTableRank[0] = DElt;
|
||||
DTableRank[1] = DElt;
|
||||
DTableRank += 2;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (ptr = begin; ptr != end; ++ptr) {
|
||||
U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
|
||||
ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
|
||||
DTableRank += 4;
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
for (ptr = begin; ptr != end; ++ptr) {
|
||||
U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
|
||||
ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
|
||||
DTableRank += 8;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
for (ptr = begin; ptr != end; ++ptr) {
|
||||
U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
|
||||
HUF_DEltX2* const DTableRankEnd = DTableRank + length;
|
||||
for (; DTableRank != DTableRankEnd; DTableRank += 8) {
|
||||
ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* HUF_fillDTableX2Level2() :
|
||||
* `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
|
||||
static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
|
||||
const U32* rankValOrigin, const int minWeight,
|
||||
const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
|
||||
U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize)
|
||||
static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
|
||||
const U32* rankVal, const int minWeight, const int maxWeight1,
|
||||
const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
|
||||
U32 nbBitsBaseline, U16 baseSeq)
|
||||
{
|
||||
HUF_DEltX2 DElt;
|
||||
U32* rankVal = wksp;
|
||||
|
||||
assert(wkspSize >= HUF_TABLELOG_MAX + 1);
|
||||
(void)wkspSize;
|
||||
/* get pre-calculated rankVal */
|
||||
ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1));
|
||||
|
||||
/* fill skipped values */
|
||||
/* Fill skipped values (all positions up to rankVal[minWeight]).
|
||||
* These are positions only get a single symbol because the combined weight
|
||||
* is too large.
|
||||
*/
|
||||
if (minWeight>1) {
|
||||
U32 i, skipSize = rankVal[minWeight];
|
||||
MEM_writeLE16(&(DElt.sequence), baseSeq);
|
||||
DElt.nbBits = (BYTE)(consumed);
|
||||
DElt.length = 1;
|
||||
for (i = 0; i < skipSize; i++)
|
||||
DTable[i] = DElt;
|
||||
U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
|
||||
U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
|
||||
int const skipSize = rankVal[minWeight];
|
||||
assert(length > 1);
|
||||
assert((U32)skipSize < length);
|
||||
switch (length) {
|
||||
case 2:
|
||||
assert(skipSize == 1);
|
||||
ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
|
||||
break;
|
||||
case 4:
|
||||
assert(skipSize <= 4);
|
||||
ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
|
||||
break;
|
||||
default:
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < skipSize; i += 8) {
|
||||
ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
|
||||
ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* fill DTable */
|
||||
{ U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
|
||||
const U32 symbol = sortedSymbols[s].symbol;
|
||||
const U32 weight = sortedSymbols[s].weight;
|
||||
const U32 nbBits = nbBitsBaseline - weight;
|
||||
const U32 length = 1 << (sizeLog-nbBits);
|
||||
const U32 start = rankVal[weight];
|
||||
U32 i = start;
|
||||
const U32 end = start + length;
|
||||
|
||||
MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
|
||||
DElt.nbBits = (BYTE)(nbBits + consumed);
|
||||
DElt.length = 2;
|
||||
do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
|
||||
|
||||
rankVal[weight] += length;
|
||||
} }
|
||||
/* Fill each of the second level symbols by weight. */
|
||||
{
|
||||
int w;
|
||||
for (w = minWeight; w < maxWeight1; ++w) {
|
||||
int const begin = rankStart[w];
|
||||
int const end = rankStart[w+1];
|
||||
U32 const nbBits = nbBitsBaseline - w;
|
||||
U32 const totalBits = nbBits + consumedBits;
|
||||
HUF_fillDTableX2ForWeight(
|
||||
DTable + rankVal[w],
|
||||
sortedSymbols + begin, sortedSymbols + end,
|
||||
totalBits, targetLog,
|
||||
baseSeq, /* level */ 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
|
||||
const sortedSymbol_t* sortedList, const U32 sortedListSize,
|
||||
const sortedSymbol_t* sortedList,
|
||||
const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
|
||||
const U32 nbBitsBaseline, U32* wksp, size_t wkspSize)
|
||||
const U32 nbBitsBaseline)
|
||||
{
|
||||
U32* rankVal = wksp;
|
||||
U32* const rankVal = rankValOrigin[0];
|
||||
const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
|
||||
const U32 minBits = nbBitsBaseline - maxWeight;
|
||||
U32 s;
|
||||
int w;
|
||||
int const wEnd = (int)maxWeight + 1;
|
||||
|
||||
assert(wkspSize >= HUF_TABLELOG_MAX + 1);
|
||||
wksp += HUF_TABLELOG_MAX + 1;
|
||||
wkspSize -= HUF_TABLELOG_MAX + 1;
|
||||
/* Fill DTable in order of weight. */
|
||||
for (w = 1; w < wEnd; ++w) {
|
||||
int const begin = (int)rankStart[w];
|
||||
int const end = (int)rankStart[w+1];
|
||||
U32 const nbBits = nbBitsBaseline - w;
|
||||
|
||||
ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1));
|
||||
|
||||
/* fill DTable */
|
||||
for (s=0; s<sortedListSize; s++) {
|
||||
const U16 symbol = sortedList[s].symbol;
|
||||
const U32 weight = sortedList[s].weight;
|
||||
const U32 nbBits = nbBitsBaseline - weight;
|
||||
const U32 start = rankVal[weight];
|
||||
const U32 length = 1 << (targetLog-nbBits);
|
||||
|
||||
if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
|
||||
U32 sortedRank;
|
||||
if (targetLog-nbBits >= minBits) {
|
||||
/* Enough room for a second symbol. */
|
||||
int start = rankVal[w];
|
||||
U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
|
||||
int minWeight = nbBits + scaleLog;
|
||||
int s;
|
||||
if (minWeight < 1) minWeight = 1;
|
||||
sortedRank = rankStart[minWeight];
|
||||
HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
|
||||
rankValOrigin[nbBits], minWeight,
|
||||
sortedList+sortedRank, sortedListSize-sortedRank,
|
||||
nbBitsBaseline, symbol, wksp, wkspSize);
|
||||
/* Fill the DTable for every symbol of weight w.
|
||||
* These symbols get at least 1 second symbol.
|
||||
*/
|
||||
for (s = begin; s != end; ++s) {
|
||||
HUF_fillDTableX2Level2(
|
||||
DTable + start, targetLog, nbBits,
|
||||
rankValOrigin[nbBits], minWeight, wEnd,
|
||||
sortedList, rankStart,
|
||||
nbBitsBaseline, sortedList[s].symbol);
|
||||
start += length;
|
||||
}
|
||||
} else {
|
||||
HUF_DEltX2 DElt;
|
||||
MEM_writeLE16(&(DElt.sequence), symbol);
|
||||
DElt.nbBits = (BYTE)(nbBits);
|
||||
DElt.length = 1;
|
||||
{ U32 const end = start + length;
|
||||
U32 u;
|
||||
for (u = start; u < end; u++) DTable[u] = DElt;
|
||||
} }
|
||||
rankVal[weight] += length;
|
||||
/* Only a single symbol. */
|
||||
HUF_fillDTableX2ForWeight(
|
||||
DTable + rankVal[w],
|
||||
sortedList + begin, sortedList + end,
|
||||
nbBits, targetLog,
|
||||
/* baseSeq */ 0, /* level */ 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
rankValCol_t rankVal[HUF_TABLELOG_MAX];
|
||||
U32 rankStats[HUF_TABLELOG_MAX + 1];
|
||||
U32 rankStart0[HUF_TABLELOG_MAX + 2];
|
||||
U32 rankStart0[HUF_TABLELOG_MAX + 3];
|
||||
sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
|
||||
BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
|
||||
U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
|
||||
@ -632,9 +1051,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
||||
const void* src, size_t srcSize,
|
||||
void* workSpace, size_t wkspSize)
|
||||
{
|
||||
U32 tableLog, maxW, sizeOfSort, nbSymbols;
|
||||
return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
|
||||
}
|
||||
|
||||
size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
|
||||
const void* src, size_t srcSize,
|
||||
void* workSpace, size_t wkspSize, int bmi2)
|
||||
{
|
||||
U32 tableLog, maxW, nbSymbols;
|
||||
DTableDesc dtd = HUF_getDTableDesc(DTable);
|
||||
U32 const maxTableLog = dtd.maxTableLog;
|
||||
U32 maxTableLog = dtd.maxTableLog;
|
||||
size_t iSize;
|
||||
void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
|
||||
HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
|
||||
@ -652,11 +1078,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
||||
if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
|
||||
/* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
|
||||
|
||||
iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), /* bmi2 */ 0);
|
||||
iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
|
||||
if (HUF_isError(iSize)) return iSize;
|
||||
|
||||
/* check result */
|
||||
if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
|
||||
if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
|
||||
|
||||
/* find maxWeight */
|
||||
for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
|
||||
@ -669,7 +1096,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
||||
rankStart[w] = curr;
|
||||
}
|
||||
rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
|
||||
sizeOfSort = nextRankStart;
|
||||
rankStart[maxW+1] = nextRankStart;
|
||||
}
|
||||
|
||||
/* sort symbols by weight */
|
||||
@ -678,7 +1105,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
||||
U32 const w = wksp->weightList[s];
|
||||
U32 const r = rankStart[w]++;
|
||||
wksp->sortedSymbol[r].symbol = (BYTE)s;
|
||||
wksp->sortedSymbol[r].weight = (BYTE)w;
|
||||
}
|
||||
rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
|
||||
}
|
||||
@ -703,10 +1129,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
||||
} } } }
|
||||
|
||||
HUF_fillDTableX2(dt, maxTableLog,
|
||||
wksp->sortedSymbol, sizeOfSort,
|
||||
wksp->sortedSymbol,
|
||||
wksp->rankStart0, wksp->rankVal, maxW,
|
||||
tableLog+1,
|
||||
wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32));
|
||||
tableLog+1);
|
||||
|
||||
dtd.tableLog = (BYTE)maxTableLog;
|
||||
dtd.tableType = 1;
|
||||
@ -719,7 +1144,7 @@ FORCE_INLINE_TEMPLATE U32
|
||||
HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
|
||||
{
|
||||
size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
|
||||
ZSTD_memcpy(op, dt+val, 2);
|
||||
ZSTD_memcpy(op, &dt[val].sequence, 2);
|
||||
BIT_skipBits(DStream, dt[val].nbBits);
|
||||
return dt[val].length;
|
||||
}
|
||||
@ -728,15 +1153,17 @@ FORCE_INLINE_TEMPLATE U32
|
||||
HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
|
||||
{
|
||||
size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
|
||||
ZSTD_memcpy(op, dt+val, 1);
|
||||
if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
|
||||
else {
|
||||
ZSTD_memcpy(op, &dt[val].sequence, 1);
|
||||
if (dt[val].length==1) {
|
||||
BIT_skipBits(DStream, dt[val].nbBits);
|
||||
} else {
|
||||
if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
|
||||
BIT_skipBits(DStream, dt[val].nbBits);
|
||||
if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
|
||||
/* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
|
||||
DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
|
||||
} }
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -759,11 +1186,23 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
|
||||
|
||||
/* up to 8 symbols at a time */
|
||||
if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
|
||||
while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
|
||||
HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
||||
if (dtLog <= 11 && MEM_64bits()) {
|
||||
/* up to 10 symbols at a time */
|
||||
while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
|
||||
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
||||
}
|
||||
} else {
|
||||
/* up to 8 symbols at a time */
|
||||
while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
|
||||
HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
|
||||
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -808,7 +1247,6 @@ HUF_decompress1X2_usingDTable_internal_body(
|
||||
/* decoded size */
|
||||
return dstSize;
|
||||
}
|
||||
|
||||
FORCE_INLINE_TEMPLATE size_t
|
||||
HUF_decompress4X2_usingDTable_internal_body(
|
||||
void* dst, size_t dstSize,
|
||||
@ -927,8 +1365,97 @@ HUF_decompress4X2_usingDTable_internal_body(
|
||||
}
|
||||
}
|
||||
|
||||
#if DYNAMIC_BMI2
|
||||
static TARGET_ATTRIBUTE("bmi2")
|
||||
size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
|
||||
size_t cSrcSize, HUF_DTable const* DTable) {
|
||||
return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
}
|
||||
#endif
|
||||
|
||||
static
|
||||
size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
|
||||
size_t cSrcSize, HUF_DTable const* DTable) {
|
||||
return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
}
|
||||
|
||||
#if HUF_ENABLE_ASM_X86_64_BMI2
|
||||
|
||||
HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args);
|
||||
|
||||
static HUF_ASM_X86_64_BMI2_ATTRS size_t
|
||||
HUF_decompress4X2_usingDTable_internal_bmi2_asm(
|
||||
void* dst, size_t dstSize,
|
||||
const void* cSrc, size_t cSrcSize,
|
||||
const HUF_DTable* DTable) {
|
||||
void const* dt = DTable + 1;
|
||||
const BYTE* const iend = (const BYTE*)cSrc + 6;
|
||||
BYTE* const oend = (BYTE*)dst + dstSize;
|
||||
HUF_DecompressAsmArgs args;
|
||||
{
|
||||
size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
FORWARD_IF_ERROR(ret, "Failed to init asm args");
|
||||
if (ret != 0)
|
||||
return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
}
|
||||
|
||||
assert(args.ip[0] >= args.ilimit);
|
||||
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
|
||||
|
||||
/* note : op4 already verified within main loop */
|
||||
assert(args.ip[0] >= iend);
|
||||
assert(args.ip[1] >= iend);
|
||||
assert(args.ip[2] >= iend);
|
||||
assert(args.ip[3] >= iend);
|
||||
assert(args.op[3] <= oend);
|
||||
(void)iend;
|
||||
|
||||
/* finish bitStreams one by one */
|
||||
{
|
||||
size_t const segmentSize = (dstSize+3) / 4;
|
||||
BYTE* segmentEnd = (BYTE*)dst;
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BIT_DStream_t bit;
|
||||
if (segmentSize <= (size_t)(oend - segmentEnd))
|
||||
segmentEnd += segmentSize;
|
||||
else
|
||||
segmentEnd = oend;
|
||||
FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
|
||||
args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
|
||||
if (args.op[i] != segmentEnd)
|
||||
return ERROR(corruption_detected);
|
||||
}
|
||||
}
|
||||
|
||||
/* decoded size */
|
||||
return dstSize;
|
||||
}
|
||||
#endif /* HUF_ENABLE_ASM_X86_64_BMI2 */
|
||||
|
||||
static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
|
||||
size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
|
||||
{
|
||||
#if DYNAMIC_BMI2
|
||||
if (bmi2) {
|
||||
# if HUF_ENABLE_ASM_X86_64_BMI2
|
||||
return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
# else
|
||||
return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
# endif
|
||||
}
|
||||
#else
|
||||
(void)bmi2;
|
||||
#endif
|
||||
|
||||
#if HUF_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
|
||||
return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
#else
|
||||
return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
|
||||
#endif
|
||||
}
|
||||
|
||||
HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
|
||||
HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
|
||||
|
||||
size_t HUF_decompress1X2_usingDTable(
|
||||
void* dst, size_t dstSize,
|
||||
@ -1037,25 +1564,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
|
||||
|
||||
#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
|
||||
typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
|
||||
static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
|
||||
static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
|
||||
{
|
||||
/* single, double, quad */
|
||||
{{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
|
||||
{{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
|
||||
{{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
|
||||
{{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
|
||||
{{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
|
||||
{{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
|
||||
{{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
|
||||
{{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
|
||||
{{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
|
||||
{{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
|
||||
{{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
|
||||
{{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
|
||||
{{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
|
||||
{{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
|
||||
{{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
|
||||
{{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
|
||||
{{0,0}, {1,1}}, /* Q==0 : impossible */
|
||||
{{0,0}, {1,1}}, /* Q==1 : impossible */
|
||||
{{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
|
||||
{{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
|
||||
{{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
|
||||
{{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
|
||||
{{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
|
||||
{{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
|
||||
{{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
|
||||
{{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
|
||||
{{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
|
||||
{{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
|
||||
{{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
|
||||
{{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
|
||||
{{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
|
||||
{{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -1082,7 +1609,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
|
||||
U32 const D256 = (U32)(dstSize >> 8);
|
||||
U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
|
||||
U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
|
||||
DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
|
||||
DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
|
||||
return DTime1 < DTime0;
|
||||
}
|
||||
#endif
|
||||
|
561
lib/decompress/huf_decompress_amd64.S
Normal file
561
lib/decompress/huf_decompress_amd64.S
Normal file
@ -0,0 +1,561 @@
|
||||
# Calling convention:
|
||||
#
|
||||
# %rdi contains the first argument: HUF_DecompressAsmArgs*.
|
||||
# %rbp is'nt maintained (no frame pointer).
|
||||
# %rsp contains the stack pointer that grows down.
|
||||
# No red-zone is assumed, only addresses >= %rsp are used.
|
||||
# All register contents are preserved.
|
||||
#
|
||||
# TODO: Support Windows calling convention.
|
||||
|
||||
#if !defined(HUF_DISABLE_ASM) && defined(__x86_64__)
|
||||
.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
|
||||
.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
|
||||
.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
|
||||
.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
|
||||
.text
|
||||
|
||||
# Sets up register mappings for clarity.
|
||||
# op[], bits[], dtable & ip[0] each get their own register.
|
||||
# ip[1,2,3] & olimit alias var[].
|
||||
# %rax is a scratch register.
|
||||
|
||||
#define op0 rsi
|
||||
#define op1 rbx
|
||||
#define op2 rcx
|
||||
#define op3 rdi
|
||||
|
||||
#define ip0 r8
|
||||
#define ip1 r9
|
||||
#define ip2 r10
|
||||
#define ip3 r11
|
||||
|
||||
#define bits0 rbp
|
||||
#define bits1 rdx
|
||||
#define bits2 r12
|
||||
#define bits3 r13
|
||||
#define dtable r14
|
||||
#define olimit r15
|
||||
|
||||
# var[] aliases ip[1,2,3] & olimit
|
||||
# ip[1,2,3] are saved every iteration.
|
||||
# olimit is only used in compute_olimit.
|
||||
#define var0 r15
|
||||
#define var1 r9
|
||||
#define var2 r10
|
||||
#define var3 r11
|
||||
|
||||
# 32-bit var registers
|
||||
#define vard0 r15d
|
||||
#define vard1 r9d
|
||||
#define vard2 r10d
|
||||
#define vard3 r11d
|
||||
|
||||
# Helper macro: args if idx != 4.
|
||||
#define IF_NOT_4_0(...) __VA_ARGS__
|
||||
#define IF_NOT_4_1(...) __VA_ARGS__
|
||||
#define IF_NOT_4_2(...) __VA_ARGS__
|
||||
#define IF_NOT_4_3(...) __VA_ARGS__
|
||||
#define IF_NOT_4_4(...)
|
||||
#define IF_NOT_4_(idx, ...) IF_NOT_4_##idx(__VA_ARGS__)
|
||||
#define IF_NOT_4(idx, ...) IF_NOT_4_(idx, __VA_ARGS__)
|
||||
|
||||
# Calls X(N) for each stream 0, 1, 2, 3.
|
||||
#define FOR_EACH_STREAM(X) \
|
||||
X(0); \
|
||||
X(1); \
|
||||
X(2); \
|
||||
X(3)
|
||||
|
||||
# Calls X(N, idx) for each stream 0, 1, 2, 3.
|
||||
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
|
||||
X(0, idx); \
|
||||
X(1, idx); \
|
||||
X(2, idx); \
|
||||
X(3, idx)
|
||||
|
||||
# Define both _HUF_* & HUF_* symbols because MacOS
|
||||
# C symbols are prefixed with '_' & Linux symbols aren't.
|
||||
_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
|
||||
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
|
||||
# Save all registers - even if they are callee saved for simplicity.
|
||||
push %rax
|
||||
push %rbx
|
||||
push %rcx
|
||||
push %rdx
|
||||
push %rbp
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %r8
|
||||
push %r9
|
||||
push %r10
|
||||
push %r11
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
# Read HUF_DecompressAsmArgs* args from %rax
|
||||
movq %rdi, %rax
|
||||
movq 0(%rax), %ip0
|
||||
movq 8(%rax), %ip1
|
||||
movq 16(%rax), %ip2
|
||||
movq 24(%rax), %ip3
|
||||
movq 32(%rax), %op0
|
||||
movq 40(%rax), %op1
|
||||
movq 48(%rax), %op2
|
||||
movq 56(%rax), %op3
|
||||
movq 64(%rax), %bits0
|
||||
movq 72(%rax), %bits1
|
||||
movq 80(%rax), %bits2
|
||||
movq 88(%rax), %bits3
|
||||
movq 96(%rax), %dtable
|
||||
push %rax # argument
|
||||
push 104(%rax) # ilimit
|
||||
push 112(%rax) # oend
|
||||
push %olimit # olimit space
|
||||
|
||||
subq $24, %rsp
|
||||
|
||||
.L_4X1_compute_olimit:
|
||||
# Computes how many iterations we can do savely
|
||||
# %r15, %rax may be clobbered
|
||||
# rbx, rdx must be saved
|
||||
# op3 & ip0 mustn't be clobbered
|
||||
movq %rbx, 0(%rsp)
|
||||
movq %rdx, 8(%rsp)
|
||||
|
||||
movq 32(%rsp), %rax # rax = oend
|
||||
subq %op3, %rax # rax = oend - op3
|
||||
|
||||
# r15 = (oend - op3) / 5
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
mulq %rdx
|
||||
movq %rdx, %r15
|
||||
shrq $2, %r15
|
||||
|
||||
movq %ip0, %rax # rax = ip0
|
||||
movq 40(%rsp), %rdx # rdx = ilimit
|
||||
subq %rdx, %rax # rax = ip0 - ilimit
|
||||
movq %rax, %rbx # rbx = ip0 - ilimit
|
||||
|
||||
# rdx = (ip0 - ilimit) / 7
|
||||
movabsq $2635249153387078803, %rdx
|
||||
mulq %rdx
|
||||
subq %rdx, %rbx
|
||||
shrq %rbx
|
||||
addq %rbx, %rdx
|
||||
shrq $2, %rdx
|
||||
|
||||
# r15 = min(%rdx, %r15)
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
# r15 = r15 * 5
|
||||
leaq (%r15, %r15, 4), %r15
|
||||
|
||||
# olimit = op3 + r15
|
||||
addq %op3, %olimit
|
||||
|
||||
movq 8(%rsp), %rdx
|
||||
movq 0(%rsp), %rbx
|
||||
|
||||
# If (op3 + 20 > olimit)
|
||||
movq %op3, %rax # rax = op3
|
||||
addq $20, %rax # rax = op3 + 20
|
||||
cmpq %rax, %olimit # op3 + 20 > olimit
|
||||
jb .L_4X1_exit
|
||||
|
||||
# If (ip1 < ip0) go to exit
|
||||
cmpq %ip0, %ip1
|
||||
jb .L_4X1_exit
|
||||
|
||||
# If (ip2 < ip1) go to exit
|
||||
cmpq %ip1, %ip2
|
||||
jb .L_4X1_exit
|
||||
|
||||
# If (ip3 < ip2) go to exit
|
||||
cmpq %ip2, %ip3
|
||||
jb .L_4X1_exit
|
||||
|
||||
# Reads top 11 bits from bits[n]
|
||||
# Loads dt[bits[n]] into var[n]
|
||||
#define GET_NEXT_DELT(n) \
|
||||
movq $53, %var##n; \
|
||||
shrxq %var##n, %bits##n, %var##n; \
|
||||
movzwl (%dtable,%var##n,2),%vard##n
|
||||
|
||||
# var[n] must contain the DTable entry computed with GET_NEXT_DELT
|
||||
# Moves var[n] to %rax
|
||||
# bits[n] <<= var[n] & 63
|
||||
# op[n][idx] = %rax >> 8
|
||||
# %ah is a way to access bits [8, 16) of %rax
|
||||
#define DECODE_FROM_DELT(n, idx) \
|
||||
movq %var##n, %rax; \
|
||||
shlxq %var##n, %bits##n, %bits##n; \
|
||||
movb %ah, idx(%op##n)
|
||||
|
||||
# Assumes GET_NEXT_DELT has been called.
|
||||
# Calls DECODE_FROM_DELT then GET_NEXT_DELT if n < 4
|
||||
#define DECODE(n, idx) \
|
||||
DECODE_FROM_DELT(n, idx); \
|
||||
IF_NOT_4(idx, GET_NEXT_DELT(n))
|
||||
|
||||
# // ctz & nbBytes is stored in bits[n]
|
||||
# // nbBits is stored in %rax
|
||||
# ctz = CTZ[bits[n]]
|
||||
# nbBits = ctz & 7
|
||||
# nbBytes = ctz >> 3
|
||||
# op[n] += 5
|
||||
# ip[n] -= nbBytes
|
||||
# // Note: x86-64 is little-endian ==> no bswap
|
||||
# bits[n] = MEM_readST(ip[n]) | 1
|
||||
# bits[n] <<= nbBits
|
||||
#define RELOAD_BITS(n) \
|
||||
bsfq %bits##n, %bits##n; \
|
||||
movq %bits##n, %rax; \
|
||||
andq $7, %rax; \
|
||||
shrq $3, %bits##n; \
|
||||
leaq 5(%op##n), %op##n; \
|
||||
subq %bits##n, %ip##n; \
|
||||
movq (%ip##n), %bits##n; \
|
||||
orq $1, %bits##n; \
|
||||
shlx %rax, %bits##n, %bits##n;
|
||||
|
||||
# Store clobbered variables on the stack
|
||||
movq %olimit, 24(%rsp)
|
||||
movq %ip1, 0(%rsp)
|
||||
movq %ip2, 8(%rsp)
|
||||
movq %ip3, 16(%rsp)
|
||||
|
||||
# Call GET_NEXT_DELT for each stream
|
||||
FOR_EACH_STREAM(GET_NEXT_DELT)
|
||||
|
||||
.p2align 6
|
||||
|
||||
.L_4X1_loop_body:
|
||||
# LLVM-MCA-BEGIN decode-4X1
|
||||
# Decode 5 symbols in each of the 4 streams (20 total)
|
||||
# Must have called GET_NEXT_DELT for each stream
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
|
||||
|
||||
# Load ip[1,2,3] from stack (var[] aliases them)
|
||||
# ip[] is needed for RELOAD_BITS
|
||||
# Each will be stored back to the stack after RELOAD
|
||||
movq 0(%rsp), %ip1
|
||||
movq 8(%rsp), %ip2
|
||||
movq 16(%rsp), %ip3
|
||||
|
||||
# Reload each stream & fetch the next table entry
|
||||
# to prepare for the next iteration
|
||||
RELOAD_BITS(0)
|
||||
GET_NEXT_DELT(0)
|
||||
|
||||
RELOAD_BITS(1)
|
||||
movq %ip1, 0(%rsp)
|
||||
GET_NEXT_DELT(1)
|
||||
|
||||
RELOAD_BITS(2)
|
||||
movq %ip2, 8(%rsp)
|
||||
GET_NEXT_DELT(2)
|
||||
|
||||
RELOAD_BITS(3)
|
||||
movq %ip3, 16(%rsp)
|
||||
GET_NEXT_DELT(3)
|
||||
|
||||
# If op3 < olimit: continue the loop
|
||||
cmp %op3, 24(%rsp)
|
||||
ja .L_4X1_loop_body
|
||||
|
||||
# Reload ip[1,2,3] from stack
|
||||
movq 0(%rsp), %ip1
|
||||
movq 8(%rsp), %ip2
|
||||
movq 16(%rsp), %ip3
|
||||
|
||||
# Re-compute olimit
|
||||
jmp .L_4X1_compute_olimit
|
||||
|
||||
#undef GET_NEXT_DELT
|
||||
#undef DECODE_FROM_DELT
|
||||
#undef DECODE
|
||||
#undef RELOAD_BITS
|
||||
# LLVM-MCA-END
|
||||
.L_4X1_exit:
|
||||
addq $24, %rsp
|
||||
|
||||
# Restore stack (oend & olimit)
|
||||
pop %rax # olimit
|
||||
pop %rax # oend
|
||||
pop %rax # ilimit
|
||||
pop %rax # arg
|
||||
|
||||
# Save ip / op / bits
|
||||
movq %ip0, 0(%rax)
|
||||
movq %ip1, 8(%rax)
|
||||
movq %ip2, 16(%rax)
|
||||
movq %ip3, 24(%rax)
|
||||
movq %op0, 32(%rax)
|
||||
movq %op1, 40(%rax)
|
||||
movq %op2, 48(%rax)
|
||||
movq %op3, 56(%rax)
|
||||
movq %bits0, 64(%rax)
|
||||
movq %bits1, 72(%rax)
|
||||
movq %bits2, 80(%rax)
|
||||
movq %bits3, 88(%rax)
|
||||
|
||||
# Restore registers
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %r11
|
||||
pop %r10
|
||||
pop %r9
|
||||
pop %r8
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
pop %rbp
|
||||
pop %rdx
|
||||
pop %rcx
|
||||
pop %rbx
|
||||
pop %rax
|
||||
ret
|
||||
|
||||
_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
|
||||
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
|
||||
# Save all registers - even if they are callee saved for simplicity.
|
||||
push %rax
|
||||
push %rbx
|
||||
push %rcx
|
||||
push %rdx
|
||||
push %rbp
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %r8
|
||||
push %r9
|
||||
push %r10
|
||||
push %r11
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
movq %rdi, %rax
|
||||
movq 0(%rax), %ip0
|
||||
movq 8(%rax), %ip1
|
||||
movq 16(%rax), %ip2
|
||||
movq 24(%rax), %ip3
|
||||
movq 32(%rax), %op0
|
||||
movq 40(%rax), %op1
|
||||
movq 48(%rax), %op2
|
||||
movq 56(%rax), %op3
|
||||
movq 64(%rax), %bits0
|
||||
movq 72(%rax), %bits1
|
||||
movq 80(%rax), %bits2
|
||||
movq 88(%rax), %bits3
|
||||
movq 96(%rax), %dtable
|
||||
push %rax # argument
|
||||
push %rax # olimit
|
||||
push 104(%rax) # ilimit
|
||||
|
||||
movq 112(%rax), %rax
|
||||
push %rax # oend3
|
||||
|
||||
movq %op3, %rax
|
||||
push %rax # oend2
|
||||
|
||||
movq %op2, %rax
|
||||
push %rax # oend1
|
||||
|
||||
movq %op1, %rax
|
||||
push %rax # oend0
|
||||
|
||||
# Scratch space
|
||||
subq $8, %rsp
|
||||
|
||||
.L_4X2_compute_olimit:
|
||||
# Computes how many iterations we can do savely
|
||||
# %r15, %rax may be clobbered
|
||||
# rdx must be saved
|
||||
# op[1,2,3,4] & ip0 mustn't be clobbered
|
||||
movq %rdx, 0(%rsp)
|
||||
|
||||
# We can consume up to 7 input bytes each iteration.
|
||||
movq %ip0, %rax # rax = ip0
|
||||
movq 40(%rsp), %rdx # rdx = ilimit
|
||||
subq %rdx, %rax # rax = ip0 - ilimit
|
||||
movq %rax, %r15 # r15 = ip0 - ilimit
|
||||
|
||||
# rdx = rax / 7
|
||||
movabsq $2635249153387078803, %rdx
|
||||
mulq %rdx
|
||||
subq %rdx, %r15
|
||||
shrq %r15
|
||||
addq %r15, %rdx
|
||||
shrq $2, %rdx
|
||||
|
||||
# r15 = (ip0 - ilimit) / 7
|
||||
movq %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 8(%rsp), %rax # rax = oend0
|
||||
subq %op0, %rax # rax = oend0 - op0
|
||||
mulq %rdx
|
||||
shrq $3, %rdx # rdx = rax / 10
|
||||
|
||||
# r15 = min(%rdx, %r15)
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 16(%rsp), %rax # rax = oend1
|
||||
subq %op1, %rax # rax = oend1 - op1
|
||||
mulq %rdx
|
||||
shrq $3, %rdx # rdx = rax / 10
|
||||
|
||||
# r15 = min(%rdx, %r15)
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 24(%rsp), %rax # rax = oend2
|
||||
subq %op2, %rax # rax = oend2 - op2
|
||||
mulq %rdx
|
||||
shrq $3, %rdx # rdx = rax / 10
|
||||
|
||||
# r15 = min(%rdx, %r15)
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 32(%rsp), %rax # rax = oend3
|
||||
subq %op3, %rax # rax = oend3 - op3
|
||||
mulq %rdx
|
||||
shrq $3, %rdx # rdx = rax / 10
|
||||
|
||||
# r15 = min(%rdx, %r15)
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
# olimit = op3 + 5 * r15
|
||||
movq %r15, %rax
|
||||
leaq (%op3, %rax, 4), %olimit
|
||||
addq %rax, %olimit
|
||||
|
||||
movq 0(%rsp), %rdx
|
||||
|
||||
# If (op3 + 10 > olimit)
|
||||
movq %op3, %rax # rax = op3
|
||||
addq $10, %rax # rax = op3 + 10
|
||||
cmpq %rax, %olimit # op3 + 10 > olimit
|
||||
jb .L_4X2_exit
|
||||
|
||||
# If (ip1 < ip0) go to exit
|
||||
cmpq %ip0, %ip1
|
||||
jb .L_4X2_exit
|
||||
|
||||
# If (ip2 < ip1) go to exit
|
||||
cmpq %ip1, %ip2
|
||||
jb .L_4X2_exit
|
||||
|
||||
# If (ip3 < ip2) go to exit
|
||||
cmpq %ip2, %ip3
|
||||
jb .L_4X2_exit
|
||||
|
||||
#define DECODE(n, idx) \
|
||||
movq %bits##n, %rax; \
|
||||
shrq $53, %rax; \
|
||||
movzwl 0(%dtable,%rax,4),%r8d; \
|
||||
movzbl 2(%dtable,%rax,4),%r15d; \
|
||||
movzbl 3(%dtable,%rax,4),%eax; \
|
||||
movw %r8w, (%op##n); \
|
||||
shlxq %r15, %bits##n, %bits##n; \
|
||||
addq %rax, %op##n
|
||||
|
||||
#define RELOAD_BITS(n) \
|
||||
bsfq %bits##n, %bits##n; \
|
||||
movq %bits##n, %rax; \
|
||||
shrq $3, %bits##n; \
|
||||
andq $7, %rax; \
|
||||
subq %bits##n, %ip##n; \
|
||||
movq (%ip##n), %bits##n; \
|
||||
orq $1, %bits##n; \
|
||||
shlxq %rax, %bits##n, %bits##n;
|
||||
|
||||
|
||||
movq %olimit, 48(%rsp)
|
||||
|
||||
.p2align 6
|
||||
|
||||
.L_4X2_loop_body:
|
||||
# LLVM-MCA-BEGIN decode-4X2
|
||||
|
||||
# We clobber r8, so store it on the stack
|
||||
movq %r8, 0(%rsp)
|
||||
|
||||
# Decode 5 symbols from each of the 4 streams (20 symbols total).
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
|
||||
|
||||
# Reload r8
|
||||
movq 0(%rsp), %r8
|
||||
|
||||
FOR_EACH_STREAM(RELOAD_BITS)
|
||||
|
||||
cmp %op3, 48(%rsp)
|
||||
ja .L_4X2_loop_body
|
||||
jmp .L_4X2_compute_olimit
|
||||
|
||||
#undef DECODE
|
||||
#undef RELOAD_BITS
|
||||
# LLVM-MCA-END
|
||||
.L_4X2_exit:
|
||||
addq $8, %rsp
|
||||
# Restore stack (oend & olimit)
|
||||
pop %rax # oend0
|
||||
pop %rax # oend1
|
||||
pop %rax # oend2
|
||||
pop %rax # oend3
|
||||
pop %rax # ilimit
|
||||
pop %rax # olimit
|
||||
pop %rax # arg
|
||||
|
||||
# Save ip / op / bits
|
||||
movq %ip0, 0(%rax)
|
||||
movq %ip1, 8(%rax)
|
||||
movq %ip2, 16(%rax)
|
||||
movq %ip3, 24(%rax)
|
||||
movq %op0, 32(%rax)
|
||||
movq %op1, 40(%rax)
|
||||
movq %op2, 48(%rax)
|
||||
movq %op3, 56(%rax)
|
||||
movq %bits0, 64(%rax)
|
||||
movq %bits1, 72(%rax)
|
||||
movq %bits2, 80(%rax)
|
||||
movq %bits3, 88(%rax)
|
||||
|
||||
# Restore registers
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %r11
|
||||
pop %r10
|
||||
pop %r9
|
||||
pop %r8
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
pop %rbp
|
||||
pop %rdx
|
||||
pop %rcx
|
||||
pop %rbx
|
||||
pop %rax
|
||||
ret
|
||||
#endif
|
@ -72,7 +72,7 @@ extern "C" {
|
||||
/*------ Version ------*/
|
||||
#define ZSTD_VERSION_MAJOR 1
|
||||
#define ZSTD_VERSION_MINOR 5
|
||||
#define ZSTD_VERSION_RELEASE 0
|
||||
#define ZSTD_VERSION_RELEASE 1
|
||||
#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
|
||||
|
||||
/*! ZSTD_versionNumber() :
|
||||
|
@ -46,7 +46,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
|
||||
if (ZSTD_isError(err))
|
||||
goto _out;
|
||||
} else {
|
||||
size_t const err = HUF_readDTableX2_wksp(dt, src, size, wksp, wkspSize);
|
||||
size_t const err = HUF_readDTableX2_wksp_bmi2(dt, src, size, wksp, wkspSize, bmi2);
|
||||
if (ZSTD_isError(err))
|
||||
goto _out;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user