diff options
| author | Enzo Matsumiya <ematsumiya@suse.de> | 2025-12-18 13:43:09 -0300 |
|---|---|---|
| committer | Enzo Matsumiya <ematsumiya@suse.de> | 2026-01-23 17:47:19 -0300 |
| commit | e5b3302e01f780b07e48ae9a6229e245d8e7e091 (patch) | |
| tree | b9890b812ff369679c88dd1b817c015af4793224 | |
| parent | b1027bc4f590db7a7a42e226674cb36987700c62 (diff) | |
| download | linux-e5b3302e01f780b07e48ae9a6229e245d8e7e091.tar.gz linux-e5b3302e01f780b07e48ae9a6229e245d8e7e091.tar.bz2 linux-e5b3302e01f780b07e48ae9a6229e245d8e7e091.zip | |
smb: client: compress: lz77_compress() optimizations
WIP
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
| -rw-r--r-- | fs/smb/client/compress/lz77.c | 193 |
1 files changed, 132 insertions, 61 deletions
diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c index 4a1d62c07921..2188ff2b7322 100644 --- a/fs/smb/client/compress/lz77.c +++ b/fs/smb/client/compress/lz77.c @@ -15,13 +15,33 @@ /* * Compression parameters. + * + * LZ77_MATCH_MIN_LEN: Minimum match length (can be 3 - 8, default sizeof(u32)). + * Smaller: many small matches, better perf, worse compression ratio. + * Bigger: fewer longer matches, worse perf, better compression ratio. + * LZ77_MATCH_MAX_DIST: Offset from current position back to match position (can be 256 - + * 8K). + * Smaller: closer matches, better perf, worse compression ratio. + * Bigger: better matches, worse perf, better compression ratio. + * LZ77_HASH_LOG: ilog2 of the hash table size (recommended to be 13 - 16, default 15 (hash size + * 32k)). + * LZ77_HASH_SIZE: Number of hash table entries (1 << LZ77_HASH_LOG). + * Smaller: less memory, faster access, more hash collisions. + * Bigger: more memory, slower access, less hash collisions. + * LZ77_RSTEP_SIZE: Number of bytes to read from input buffer to check initial match + * (default LZ77_MATCH_MIN_LEN). + * LZ77_MSTEP_SIZE: Number of bytes used to extend the comparison of a found match (default + * sizeof(u64)). + * + * MATCH_MIN_LEN and RSTEP_SIZE are conveniently set to the same value so we can read, hash, and + * compare a single value, without having to mess with offsets. 
 */ -#define LZ77_MATCH_MIN_LEN 4 -#define LZ77_MATCH_MIN_DIST 1 -#define LZ77_MATCH_MAX_DIST SZ_1K +#define LZ77_MATCH_MIN_LEN sizeof(u32) +#define LZ77_MATCH_MAX_DIST SZ_4K #define LZ77_HASH_LOG 15 #define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG) -#define LZ77_STEP_SIZE sizeof(u64) +#define LZ77_RSTEP_SIZE sizeof(u32) +#define LZ77_MSTEP_SIZE sizeof(u64) static __always_inline u8 lz77_read8(const u8 *ptr) { @@ -67,8 +87,8 @@ static __always_inline u32 lz77_match_len(const void *wnd, const void *cur, cons const u64 diff = lz77_read64(cur) ^ lz77_read64(wnd); if (!diff) { - cur += LZ77_STEP_SIZE; - wnd += LZ77_STEP_SIZE; + cur += LZ77_MSTEP_SIZE; + wnd += LZ77_MSTEP_SIZE; continue; } @@ -77,8 +97,9 @@ static __always_inline u32 lz77_match_len(const void *wnd, const void *cur, cons cur += count_trailing_zeros(diff) >> 3; return (cur - start); - } while (likely(cur + LZ77_STEP_SIZE < end)); + } while (likely(cur + LZ77_MSTEP_SIZE < end)); + /* fallback to byte-by-byte comparison for last <8 bytes */ while (cur < end && lz77_read8(cur) == lz77_read8(wnd)) { cur++; wnd++; @@ -141,6 +162,40 @@ static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u return dst + 4; } +static __always_inline void *lz77_write_literals(const void *start, const void *end, void *dst, + long *f, u32 *fc, void **fp) +{ + u32 c, remaining; + + /* lz77_compress() ensures that start < end when entering here */ + remaining = end - start; + do { + c = umin(remaining, 32 - *fc); + + memcpy(dst, start, c); + + dst += c; + start += c; + remaining -= c; + + *f <<= c; + *fc += c; + if (*fc == 32) { + lz77_write32(*fp, *f); + *fc = 0; + *fp = dst; + dst += 4; + } + } while (remaining); + + return dst; +} + +static __always_inline u32 lz77_hash(u32 v) +{ + return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG); +} + /* * Compute compressed (dst) buffer length based on uncompressed (src) length. 
* @@ -163,14 +218,20 @@ u32 lz77_calc_dlen(u32 slen) noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen) { - const void *srcp, *end; + const void *srcp, *limit, *end, *anchor; void *dstp, *nib, *flag_pos; u32 flag_count = 0; long flag = 0; - u64 *htable; + u32 *htable; srcp = src; - end = src + slen; + anchor = srcp; + end = srcp + slen; + /* + * We read 4 bytes from srcp[0] and srcp[1] at each loop iteration, and if a match is found, + * lz77_match_len() will do 8 byte reads (on a do/while loop). + */ + limit = end - LZ77_RSTEP_SIZE - 1 - LZ77_MSTEP_SIZE; dstp = dst; nib = NULL; flag_pos = dstp; @@ -180,45 +241,71 @@ noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen) if (!htable) return -ENOMEM; + prefetch(srcp + 8); + + /* adjust @srcp so we don't get a false positive match on first iteration */ + htable[lz77_hash(lz77_read32(srcp++))] = 0; + /* * Main loop. * - * Assuming @dlen was computed with lz77_calc_dlen(), we can run without bound checking - * @dst. + * Assumes @dlen was computed with lz77_calc_dlen(), so run without bound-checking @dst. + * + * This code was crafted in a way to best utilise fetch-decode-execute CPU flow. + * Any attempt to optimize it, or even organize it, can lead to huge performance loss. + * + * Hot path: + * - input reads (@srcp and @srcp + 1), streamed and possibly prefetched -- very fast + * Note: it's much cheaper to preprocess @srcp + 1 and ignore it when @srcp has a match, + * than to conditionally process it (i.e. only if @srcp doesn't have a match) + * - compute hashes + * - get offsets from @htable + * - update @htable offsets -- this is important as htable[h0/h1] is still in cache, so + * doing it only later, e.g. 
if no match was found, would require fetching it from memory + * again) + * - compute distance _before_ checking if values match -- this is also important because a + * too far window is much more common than a mismatch, and also comparing values requires + * reading from window buffer (more expensive) + * - don't zero-check w0/w1 -- (important) it adds a non-negligible delay in processing, + * such that it's faster to just read and compare @src + 0 + * + * Whenever a match is found, we break out of the hotpath to begin writing to @dstp. */ do { - u32 dist, len = 0; - const void *wnd; - u64 hash; - - hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG); - wnd = src + htable[hash]; - htable[hash] = srcp - src; - dist = srcp - wnd; - - if (dist && dist < LZ77_MATCH_MAX_DIST) - len = lz77_match_len(wnd, srcp, end); - - if (len < LZ77_MATCH_MIN_LEN) { - lz77_write8(dstp, lz77_read8(srcp)); - - dstp++; - srcp++; - - flag <<= 1; - flag_count++; - if (flag_count == 32) { - lz77_write32(flag_pos, flag); - flag_count = 0; - flag_pos = dstp; - dstp += 4; - } - - continue; - } - - dstp = lz77_write_match(dstp, &nib, dist, len); + const u32 p0 = srcp - src; + const u32 p1 = p0 + 1; + const u32 v0 = lz77_read32(srcp); + const u32 v1 = lz77_read32(srcp + 1); + const u32 h0 = lz77_hash(v0); + const u32 h1 = lz77_hash(v1); + const u32 w0 = htable[h0]; + const u32 w1 = htable[h1]; + const void *wnd = src + w0; + u32 len; + + htable[h0] = p0; + htable[h1] = p1; + + if (unlikely(p0 - w0 < LZ77_MATCH_MAX_DIST && lz77_read32(wnd) == v0)) + goto _match; + + srcp++; + wnd = src + w1; + if (unlikely(p1 - w1 < LZ77_MATCH_MAX_DIST && lz77_read32(wnd) == v1)) + goto _match; + + srcp++; + continue; +_match: + prefetchw(dstp); + dstp = lz77_write_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos); + wnd += LZ77_RSTEP_SIZE; + srcp += LZ77_RSTEP_SIZE; + len = lz77_match_len(wnd, srcp, end); + dstp = lz77_write_match(dstp, &nib, srcp - wnd, len + LZ77_RSTEP_SIZE); 
srcp += len; + anchor = srcp; + prefetch(srcp); flag = (flag << 1) | 1; flag_count++; @@ -228,25 +315,9 @@ noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen) flag_pos = dstp; dstp += 4; } - } while (likely(srcp + LZ77_STEP_SIZE < end)); - - while (srcp < end) { - u32 c = umin(end - srcp, 32 - flag_count); + } while (srcp < limit); - memcpy(dstp, srcp, c); - - dstp += c; - srcp += c; - - flag <<= c; - flag_count += c; - if (flag_count == 32) { - lz77_write32(flag_pos, flag); - flag_count = 0; - flag_pos = dstp; - dstp += 4; - } - } + dstp = lz77_write_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos); flag <<= (32 - flag_count); flag |= (1UL << (32 - flag_count)) - 1; |
