summaryrefslogtreecommitdiff
path: root/lib/zstd/decompress
diff options
context:
space:
mode:
authorNick Terrell <terrelln@meta.com>2025-03-08 12:09:33 -0800
committerNick Terrell <terrelln@meta.com>2025-03-13 13:25:58 -0700
commit65d1f5507ed2c78c64fce40e44e5574a9419eb09 (patch)
tree4a1b819db2ea7f2a322e9bcc7582d946d0a4ea29 /lib/zstd/decompress
parent7eb172143d5508b4da468ed59ee857c6e5e01da6 (diff)
downloadlinux-65d1f5507ed2c78c64fce40e44e5574a9419eb09.tar.gz
linux-65d1f5507ed2c78c64fce40e44e5574a9419eb09.tar.bz2
linux-65d1f5507ed2c78c64fce40e44e5574a9419eb09.zip
zstd: Import upstream v1.5.7
In addition to keeping the kernel's copy of zstd up to date, this update was requested by Intel to expose upstream's APIs that allow QAT to accelerate the LZ match finding stage of Zstd. This patch is imported from the upstream tag v1.5.7-kernel [0], which is signed with upstream's signing key EF8FE99528B52FFD [1]. It was imported from upstream using this command: export ZSTD=/path/to/repo/zstd/ export LINUX=/path/to/repo/linux/ cd "$ZSTD/contrib/linux-kernel" git checkout v1.5.7-kernel make import LINUX="$LINUX" This patch has been tested on x86-64, and has been boot tested with a zstd compressed kernel & initramfs on i386 and aarch64. I benchmarked the patch on x86-64 with gcc-14.2.1 on an Intel i9-9900K by measruing the performance of compressed filesystem reads and writes. Component, Level, Size delta, C. time delta, D. time delta Btrfs , 1, +0.00%, -6.1%, +1.4% Btrfs , 3, +0.00%, -9.8%, +3.0% Btrfs , 5, +0.00%, +1.7%, +1.4% Btrfs , 7, +0.00%, -1.9%, +2.7% Btrfs , 9, +0.00%, -3.4%, +3.7% Btrfs , 15, +0.00%, -0.3%, +3.6% SquashFS , 1, +0.00%, N/A, +1.9% The major changes that impact the kernel use cases for each version are: v1.5.7: https://github.com/facebook/zstd/releases/tag/v1.5.7 * Add zstd_compress_sequences_and_literals() for use by Intel's QAT driver to implement Zstd compression acceleration in the kernel. * Fix an underflow bug in 32-bit builds that can cause data corruption when processing more than 4GB of data with a single `ZSTD_CCtx` object, when an input crosses the 4GB boundry. I don't believe this impacts any current kernel use cases, because the `ZSTD_CCtx` is typically reconstructed between compressions. * Levels 1-4 see 5-10% compression speed improvements for inputs smaller than 128KB. v1.5.6: https://github.com/facebook/zstd/releases/tag/v1.5.6 * Improved compression ratio for the highest compression levels. I don't expect these see much use however, due to their slow speeds. v1.5.5: https://github.com/facebook/zstd/releases/tag/v1.5.5 * Fix a rare corruption bug that can trigger on levels 13 and above. * Improve compression speed of levels 5-11 on incompressible data. v1.5.4: https://github.com/facebook/zstd/releases/tag/v1.5.4 * Improve copmression speed of levels 5-11 on ARM. * Improve dictionary compression speed. Signed-off-by: Nick Terrell <terrelln@fb.com>
Diffstat (limited to 'lib/zstd/decompress')
-rw-r--r--lib/zstd/decompress/huf_decompress.c887
-rw-r--r--lib/zstd/decompress/zstd_ddict.c9
-rw-r--r--lib/zstd/decompress/zstd_ddict.h3
-rw-r--r--lib/zstd/decompress/zstd_decompress.c375
-rw-r--r--lib/zstd/decompress/zstd_decompress_block.c724
-rw-r--r--lib/zstd/decompress/zstd_decompress_block.h10
-rw-r--r--lib/zstd/decompress/zstd_decompress_internal.h19
7 files changed, 1266 insertions, 761 deletions
diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c
index 60958afebc41..ac8b87f48f84 100644
--- a/lib/zstd/decompress/huf_decompress.c
+++ b/lib/zstd/decompress/huf_decompress.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/* ******************************************************************
* huff0 huffman decoder,
* part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,10 +20,10 @@
#include "../common/compiler.h"
#include "../common/bitstream.h" /* BIT_* */
#include "../common/fse.h" /* to compress headers */
-#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/error_private.h"
#include "../common/zstd_internal.h"
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
/* **************************************************************
* Constants
@@ -34,6 +35,12 @@
* Macros
****************************************************************/
+#ifdef HUF_DISABLE_FAST_DECODE
+# define HUF_ENABLE_FAST_DECODE 0
+#else
+# define HUF_ENABLE_FAST_DECODE 1
+#endif
+
/* These two optional macros force the use one way or another of the two
* Huffman decompression implementations. You can't force in both directions
* at the same time.
@@ -43,27 +50,25 @@
#error "Cannot force the use of the X1 and X2 decoders at the same time!"
#endif
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
+ * supported at runtime, so we can add the BMI2 target attribute.
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
+ */
+#if DYNAMIC_BMI2
+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
#else
-# define HUF_ASM_X86_64_BMI2_ATTRS
+# define HUF_FAST_BMI2_ATTRS
#endif
#define HUF_EXTERN_C
#define HUF_ASM_DECL HUF_EXTERN_C
-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
+#if DYNAMIC_BMI2
# define HUF_NEED_BMI2_FUNCTION 1
#else
# define HUF_NEED_BMI2_FUNCTION 0
#endif
-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
-# define HUF_NEED_DEFAULT_FUNCTION 1
-#else
-# define HUF_NEED_DEFAULT_FUNCTION 0
-#endif
-
/* **************************************************************
* Error Management
****************************************************************/
@@ -80,6 +85,11 @@
/* **************************************************************
* BMI2 Variant Wrappers
****************************************************************/
+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
+ const void *cSrc,
+ size_t cSrcSize,
+ const HUF_DTable *DTable);
+
#if DYNAMIC_BMI2
#define HUF_DGEN(fn) \
@@ -101,9 +111,9 @@
} \
\
static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
{ \
- if (bmi2) { \
+ if (flags & HUF_flags_bmi2) { \
return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
} \
return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -113,9 +123,9 @@
#define HUF_DGEN(fn) \
static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
{ \
- (void)bmi2; \
+ (void)flags; \
return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
}
@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
return dtd;
}
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-
-static size_t HUF_initDStream(BYTE const* ip) {
+static size_t HUF_initFastDStream(BYTE const* ip) {
BYTE const lastByte = ip[7];
- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
size_t const value = MEM_readLEST(ip) | 1;
assert(bitsConsumed <= 8);
+ assert(sizeof(size_t) == 8);
return value << bitsConsumed;
}
+
+
+/*
+ * The input/output arguments to the Huffman fast decoding loop:
+ *
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
+ * dt [in] - The decoding table.
+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
+ * down to this pointer. It may be below iend[0].
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
+ * as long as it is above ilowest, but that indicates corruption.
+ */
typedef struct {
BYTE const* ip[4];
BYTE* op[4];
U64 bits[4];
void const* dt;
- BYTE const* ilimit;
+ BYTE const* ilowest;
BYTE* oend;
BYTE const* iend[4];
-} HUF_DecompressAsmArgs;
+} HUF_DecompressFastArgs;
+
+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
/*
- * Initializes args for the asm decoding loop.
- * @returns 0 on success
- * 1 if the fallback implementation should be used.
+ * Initializes args for the fast decoding loop.
+ * @returns 1 on success
+ * 0 if the fallback implementation should be used.
* Or an error code on failure.
*/
-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
{
void const* dt = DTable + 1;
U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
- const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
+ const BYTE* const istart = (const BYTE*)src;
- BYTE* const oend = (BYTE*)dst + dstSize;
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
- /* The following condition is false on x32 platform,
- * but HUF_asm is not compatible with this ABI */
- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
+ /* The fast decoding loop assumes 64-bit little-endian.
+ * This condition is false on x32.
+ */
+ if (!MEM_isLittleEndian() || MEM_32bits())
+ return 0;
+
+ /* Avoid nullptr addition */
+ if (dstSize == 0)
+ return 0;
+ assert(dst != NULL);
/* strict minimum : jump table + 1 byte per stream */
if (srcSize < 10)
@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
* On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
*/
if (dtLog != HUF_DECODER_FAST_TABLELOG)
- return 1;
+ return 0;
/* Read the jump table. */
{
- const BYTE* const istart = (const BYTE*)src;
size_t const length1 = MEM_readLE16(istart);
size_t const length2 = MEM_readLE16(istart+2);
size_t const length3 = MEM_readLE16(istart+4);
@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
args->iend[2] = args->iend[1] + length2;
args->iend[3] = args->iend[2] + length3;
- /* HUF_initDStream() requires this, and this small of an input
+ /* HUF_initFastDStream() requires this, and this small of an input
* won't benefit from the ASM loop anyways.
- * length1 must be >= 16 so that ip[0] >= ilimit before the loop
- * starts.
*/
- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
- return 1;
+ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
+ return 0;
if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
}
/* ip[] contains the position that is currently loaded into bits[]. */
@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
/* No point to call the ASM loop for tiny outputs. */
if (args->op[3] >= oend)
- return 1;
+ return 0;
/* bits[] is the bit container.
* It is read from the MSB down to the LSB.
@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
* set, so that CountTrailingZeros(bits[]) can be used
* to count how many bits we've consumed.
*/
- args->bits[0] = HUF_initDStream(args->ip[0]);
- args->bits[1] = HUF_initDStream(args->ip[1]);
- args->bits[2] = HUF_initDStream(args->ip[2]);
- args->bits[3] = HUF_initDStream(args->ip[3]);
-
- /* If ip[] >= ilimit, it is guaranteed to be safe to
- * reload bits[]. It may be beyond its section, but is
- * guaranteed to be valid (>= istart).
- */
- args->ilimit = ilimit;
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
+
+ /* The decoders must be sure to never read beyond ilowest.
+ * This is lower than iend[0], but allowing decoders to read
+ * down to ilowest can allow an extra iteration or two in the
+ * fast loop.
+ */
+ args->ilowest = istart;
args->oend = oend;
args->dt = dt;
- return 0;
+ return 1;
}
-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
{
/* Validate that we haven't overwritten. */
if (args->op[stream] > segmentEnd)
@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
return ERROR(corruption_detected);
/* Construct the BIT_DStream_t. */
- bit->bitContainer = MEM_readLE64(args->ip[stream]);
- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
- bit->start = (const char*)args->iend[0];
+ assert(sizeof(size_t) == 8);
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
+ bit->start = (const char*)args->ilowest;
bit->limitPtr = bit->start + sizeof(size_t);
bit->ptr = (const char*)args->ip[stream];
return 0;
}
-#endif
+
+/* Calls X(N) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM(X) \
+ do { \
+ X(0); \
+ X(1); \
+ X(2); \
+ X(3); \
+ } while (0)
+
+/* Calls X(N, var) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
+ do { \
+ X(0, (var)); \
+ X(1, (var)); \
+ X(2, (var)); \
+ X(3, (var)); \
+ } while (0)
#ifndef HUF_FORCE_DECOMPRESS_X2
@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
U64 D4;
if (MEM_isLittleEndian()) {
- D4 = (symbol << 8) + nbBits;
+ D4 = (U64)((symbol << 8) + nbBits);
} else {
- D4 = symbol + (nbBits << 8);
+ D4 = (U64)(symbol + (nbBits << 8));
}
+ assert(D4 < (1U << 16));
D4 *= 0x0001000100010001ULL;
return D4;
}
@@ -329,13 +379,7 @@ typedef struct {
BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
} HUF_ReadDTableX1_Workspace;
-
-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
-{
- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
{
U32 tableLog = 0;
U32 nbSymbols = 0;
@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
/* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
if (HUF_isError(iSize)) return iSize;
@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
* rankStart[0] is not filled because there are no entries in the table for
* weight 0.
*/
- {
- int n;
- int nextRankStart = 0;
+ { int n;
+ U32 nextRankStart = 0;
int const unroll = 4;
int const nLimit = (int)nbSymbols - unroll + 1;
for (n=0; n<(int)tableLog+1; n++) {
@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
* We can switch based on the length to a different inner loop which is
* optimized for that particular case.
*/
- {
- U32 w;
- int symbol=wksp->rankVal[0];
- int rankStart=0;
+ { U32 w;
+ int symbol = wksp->rankVal[0];
+ int rankStart = 0;
for (w=1; w<tableLog+1; ++w) {
int const symbolCount = wksp->rankVal[w];
int const length = (1 << w) >> 1;
@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
}
#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
+ do { \
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+ } while (0)
-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
- if (MEM_64bits()) \
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
+ do { \
+ if (MEM_64bits()) \
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+ } while (0)
HINT_INLINE size_t
HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
while (p < pEnd)
HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
- return pEnd-pStart;
+ return (size_t)(pEnd-pStart);
}
FORCE_INLINE_TEMPLATE size_t
@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body(
const HUF_DTable* DTable)
{
BYTE* op = (BYTE*)dst;
- BYTE* const oend = op + dstSize;
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
const void* dtPtr = DTable + 1;
const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
BIT_DStream_t bitD;
@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body(
return dstSize;
}
+/* HUF_decompress4X1_usingDTable_internal_body():
+ * Conditions :
+ * @dstSize >= 6
+ */
FORCE_INLINE_TEMPLATE size_t
HUF_decompress4X1_usingDTable_internal_body(
void* dst, size_t dstSize,
@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body(
{
/* Check */
if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
{ const BYTE* const istart = (const BYTE*) cSrc;
BYTE* const ostart = (BYTE*) dst;
@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body(
if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
+ assert(dstSize >= 6); /* validated above */
CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
}
#endif
-#if HUF_NEED_DEFAULT_FUNCTION
static
size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
size_t cSrcSize, HUF_DTable const* DTable) {
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}
-#endif
#if ZSTD_ENABLE_ASM_X86_64_BMI2
-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
+
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+ U64 bits[4];
+ BYTE const* ip[4];
+ BYTE* op[4];
+ U16 const* const dtable = (U16 const*)args->dt;
+ BYTE* const oend = args->oend;
+ BYTE const* const ilowest = args->ilowest;
+
+ /* Copy the arguments to local variables */
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+ assert(MEM_isLittleEndian());
+ assert(!MEM_32bits());
+
+ for (;;) {
+ BYTE* olimit;
+ int stream;
+
+ /* Assert loop preconditions */
+#ifndef NDEBUG
+ for (stream = 0; stream < 4; ++stream) {
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
+ assert(ip[stream] >= ilowest);
+ }
+#endif
+ /* Compute olimit */
+ {
+ /* Each iteration produces 5 output symbols per stream */
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
+ * per stream.
+ */
+ size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
+ /* We can safely run iters iterations before running bounds checks */
+ size_t const iters = MIN(oiters, iiters);
+ size_t const symbols = iters * 5;
+
+ /* We can simply check that op[3] < olimit, instead of checking all
+ * of our bounds, since we can't hit the other bounds until we've run
+ * iters iterations, which only happens when op[3] == olimit.
+ */
+ olimit = op[3] + symbols;
+
+ /* Exit fast decoding loop once we reach the end. */
+ if (op[3] == olimit)
+ break;
+
+ /* Exit the decoding loop if any input pointer has crossed the
+ * previous one. This indicates corruption, and a precondition
+ * to our loop is that ip[i] >= ip[0].
+ */
+ for (stream = 1; stream < 4; ++stream) {
+ if (ip[stream] < ip[stream - 1])
+ goto _out;
+ }
+ }
+
+#ifndef NDEBUG
+ for (stream = 1; stream < 4; ++stream) {
+ assert(ip[stream] >= ip[stream - 1]);
+ }
+#endif
+
+#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
+ do { \
+ int const index = (int)(bits[(_stream)] >> 53); \
+ int const entry = (int)dtable[index]; \
+ bits[(_stream)] <<= (entry & 0x3F); \
+ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
+ } while (0)
+
+#define HUF_4X1_RELOAD_STREAM(_stream) \
+ do { \
+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+ int const nbBits = ctz & 7; \
+ int const nbBytes = ctz >> 3; \
+ op[(_stream)] += 5; \
+ ip[(_stream)] -= nbBytes; \
+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
+ bits[(_stream)] <<= nbBits; \
+ } while (0)
+
+ /* Manually unroll the loop because compilers don't consistently
+ * unroll the inner loops, which destroys performance.
+ */
+ do {
+ /* Decode 5 symbols in each of the 4 streams */
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
+
+ /* Reload each of the 4 the bitstreams */
+ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
+ } while (op[3] < olimit);
+
+#undef HUF_4X1_DECODE_SYMBOL
+#undef HUF_4X1_RELOAD_STREAM
+ }
-static HUF_ASM_X86_64_BMI2_ATTRS
+_out:
+
+ /* Save the final values of each of the state variables back to args. */
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+/*
+ * @returns @p dstSize on success (>= 6)
+ * 0 if the fallback implementation should be used
+ * An error if an error occurred
+ */
+static HUF_FAST_BMI2_ATTRS
size_t
-HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+HUF_decompress4X1_usingDTable_internal_fast(
void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
+ const HUF_DTable* DTable,
+ HUF_DecompressFastLoopFn loopFn)
{
void const* dt = DTable + 1;
- const BYTE* const iend = (const BYTE*)cSrc + 6;
- BYTE* const oend = (BYTE*)dst + dstSize;
- HUF_DecompressAsmArgs args;
- {
- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
- FORWARD_IF_ERROR(ret, "Failed to init asm args");
- if (ret != 0)
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+ BYTE const* const ilowest = (BYTE const*)cSrc;
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
+ HUF_DecompressFastArgs args;
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
+ if (ret == 0)
+ return 0;
}
- assert(args.ip[0] >= args.ilimit);
- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
+ assert(args.ip[0] >= args.ilowest);
+ loopFn(&args);
- /* Our loop guarantees that ip[] >= ilimit and that we haven't
+ /* Our loop guarantees that ip[] >= ilowest and that we haven't
* overwritten any op[].
*/
- assert(args.ip[0] >= iend);
- assert(args.ip[1] >= iend);
- assert(args.ip[2] >= iend);
- assert(args.ip[3] >= iend);
+ assert(args.ip[0] >= ilowest);
+ assert(args.ip[0] >= ilowest);
+ assert(args.ip[1] >= ilowest);
+ assert(args.ip[2] >= ilowest);
+ assert(args.ip[3] >= ilowest);
assert(args.op[3] <= oend);
- (void)iend;
+
+ assert(ilowest == args.ilowest);
+ assert(ilowest + 6 == args.iend[0]);
+ (void)ilowest;
/* finish bit streams one by one. */
- {
- size_t const segmentSize = (dstSize+3) / 4;
+ { size_t const segmentSize = (dstSize+3) / 4;
BYTE* segmentEnd = (BYTE*)dst;
int i;
for (i = 0; i < 4; ++i) {
@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
}
/* decoded size */
+ assert(dstSize != 0);
return dstSize;
}
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
-
-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
- const void *cSrc,
- size_t cSrcSize,
- const HUF_DTable *DTable);
HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
{
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
+
#if DYNAMIC_BMI2
- if (bmi2) {
+ if (flags & HUF_flags_bmi2) {
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
# if ZSTD_ENABLE_ASM_X86_64_BMI2
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+ if (!(flags & HUF_flags_disableAsm)) {
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+ }
# endif
+ } else {
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
-#else
- (void)bmi2;
#endif
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+ if (!(flags & HUF_flags_disableAsm)) {
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+ }
#endif
-}
-
-
-size_t HUF_decompress1X1_usingDTable(
- void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc dtd = HUF_getDTableDesc(DTable);
- if (dtd.tableType != 0) return ERROR(GENERIC);
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize)
-{
- const BYTE* ip = (const BYTE*) cSrc;
-
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
- if (HUF_isError(hSize)) return hSize;
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
- ip += hSize; cSrcSize -= hSize;
-
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X1_usingDTable(
- void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc dtd = HUF_getDTableDesc(DTable);
- if (dtd.tableType != 0) return ERROR(GENERIC);
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+ if (ret != 0)
+ return ret;
+ }
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize, int bmi2)
+ void* workSpace, size_t wkspSize, int flags)
{
const BYTE* ip = (const BYTE*) cSrc;
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
if (HUF_isError(hSize)) return hSize;
if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
ip += hSize; cSrcSize -= hSize;
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
-}
-
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize)
-{
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
}
-
#endif /* HUF_FORCE_DECOMPRESS_X2 */
@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
const sortedSymbol_t* sortedList,
- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight,
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
const U32 nbBitsBaseline)
{
U32* const rankVal = rankValOrigin[0];
@@ -1040,14 +1175,7 @@ typedef struct {
size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
const void* src, size_t srcSize,
- void* workSpace, size_t wkspSize)
-{
- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
- const void* src, size_t srcSize,
- void* workSpace, size_t wkspSize, int bmi2)
+ void* workSpace, size_t wkspSize, int flags)
{
U32 tableLog, maxW, nbSymbols;
DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
/* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
if (HUF_isError(iSize)) return iSize;
/* check result */
@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
}