| author    | Enzo Matsumiya <ematsumiya@suse.de> | 2024-08-15 12:24:36 -0300 |
|-----------|-------------------------------------|---------------------------|
| committer | Enzo Matsumiya <ematsumiya@suse.de> | 2024-08-15 13:35:06 -0300 |
| commit    | 463bdc6c942b64c7065256fdf8cc6e21021cbdb6 (patch) | |
| tree      | 5bcb47ccdbb5f040a7b930717f3bafbaa2ed886c | |
| parent    | 2e8afe16eb831c227693c37a7da59215e5686134 (diff) | |
smb: client: compress fastest (checkpoint 2024-08-15) (smb-compression-upstream)
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
| -rw-r--r-- | fs/smb/client/compress.c      |   2 |
| -rw-r--r-- | fs/smb/client/compress/lz77.c | 196 |
| -rw-r--r-- | fs/smb/client/compress/lz77.h |   3 |
3 files changed, 99 insertions, 102 deletions
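A note on the sizing change before the full diff: the compress.c hunk now rounds the source length up to a page and passes it through lz77_compress_mem_required(), which the lz77.h hunk extends to add one page of slack on top of the existing one-eighth of headroom (one 32-bit flag word per 32 literal bytes). A rough userspace sketch of that arithmetic is below; PAGE_SIZE is assumed to be 4096 here, and mem_required()/round_up_page() are illustrative stand-ins for, not copies of, the kernel helpers.

```c
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL /* assumed value; the kernel uses its own arch constant */

/* Mirrors the new lz77_compress_mem_required(): input plus 1/8th plus one page. */
static size_t mem_required(size_t len)
{
	return len + (len >> 3) + PAGE_SIZE;
}

/* Round up to a PAGE_SIZE multiple, as smb_compress() does before calling it. */
static size_t round_up_page(size_t len)
{
	return (len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	size_t slen = 65536 + 123; /* example uncompressed length */

	printf("dst buffer: %zu bytes\n", mem_required(round_up_page(slen)));
	return 0;
}
```

Presumably the extra slack is there so the all-literals worst case (data plus flag words) still fits in dst, and the new bail-out path in lz77_compress() can give up cleanly on incompressible input.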
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
index c313e9e6504c..770070c6263b 100644
--- a/fs/smb/client/compress.c
+++ b/fs/smb/client/compress.c
@@ -347,7 +347,7 @@ int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_s
         goto err_free;
     }

-    dlen = round_up(slen, PAGE_SIZE);
+    dlen = lz77_compress_mem_required(round_up(slen, PAGE_SIZE));
     dst = kvzalloc(dlen, GFP_KERNEL);
     if (!dst) {
         ret = -ENOMEM;
diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c
index 7d672200d55f..412be1684433 100644
--- a/fs/smb/client/compress/lz77.c
+++ b/fs/smb/client/compress/lz77.c
@@ -10,62 +10,60 @@
 #include <linux/slab.h>
 #include <linux/sizes.h>
 #include <linux/count_zeros.h>
+#include <asm/unaligned.h>

-#include "../compress.h"
 #include "lz77.h"

 #ifdef CONFIG_CIFS_DEBUG2
-# define LZ77_STATS_ENABLED 1
-#else
-# define LZ77_STATS_ENABLED 0
+# define LZ77_STATS_ENABLED
 #endif

 /*
  * Compression parameters.
  *
  * - finding closer and smaller matches is faster, but also compresses worse, e.g.
- *   match min len = 4, max dist = 4K compared to match min len = 8, max dist = 8K:
+ *   match min len = 3, max dist = 1K compared to match min len = 8, max dist = 8K:
  *   the former is ~10% faster, but yields a ~5% bigger output
  * - we don't really care about a maximum match length -- it's simply much, much faster to compute
- *   a very long match length (e.g. >2M) and write only its encoded bytes than to write e.g. 500K
+ *   a very long match length (e.g. 2M) and write only its encoded bytes than to write e.g. 500K
  *   of literals
  */
-#define LZ77_MATCH_MIN_LEN   4
+#define LZ77_MATCH_MIN_LEN   3
 #define LZ77_MATCH_MAX_LEN   S32_MAX
 #define LZ77_MATCH_MIN_DIST  1
-#define LZ77_MATCH_MAX_DIST  SZ_2K
+#define LZ77_MATCH_MAX_DIST  SZ_1K
 #define LZ77_HASH_LOG        15
 #define LZ77_HASH_SIZE       (1 << LZ77_HASH_LOG)

 #define lz77_read(nbits, _ptr)       get_unaligned((const u ## nbits *)(_ptr))
 #define lz77_write(nbits, _ptr, _v)  put_unaligned_le ## nbits((_v), (typeof((_v)) *)(_ptr))

-static __always_inline u16 lz77_read16(const void *ptr)
+static __always_inline u16 lz77_read16(const u8 *ptr)
 {
     return lz77_read(16, ptr);
 }

-static __always_inline u32 lz77_read32(const void *ptr)
+static __always_inline u32 lz77_read32(const u8 *ptr)
 {
     return lz77_read(32, ptr);
 }

-static __always_inline u64 lz77_read64(const void *ptr)
+static __always_inline u64 lz77_read64(const u8 *ptr)
 {
     return lz77_read(64, ptr);
 }

-static __always_inline void lz77_write8(void *ptr, u8 v)
+static __always_inline void lz77_write8(u8 *ptr, u8 v)
 {
-    *(u8 *)ptr = v;
+    put_unaligned(v, ptr);
 }

-static __always_inline void lz77_write16(void *ptr, u16 v)
+static __always_inline void lz77_write16(u8 *ptr, u16 v)
 {
     lz77_write(16, ptr, v);
 }

-static __always_inline void lz77_write32(void *ptr, u32 v)
+static __always_inline void lz77_write32(u8 *ptr, u32 v)
 {
     lz77_write(32, ptr, v);
 }
@@ -73,7 +71,7 @@ static __always_inline void lz77_write32(void *ptr, u32 v)
 static __always_inline u64 lz77_hash_bytes(const u8 *ptr)
 {
     /* A "hash function" -- simple, fast, and enough for our use-case. */
-    return ((lz77_read64(ptr) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
+    return ((lz77_read64(ptr) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
 }

 /*
@@ -85,44 +83,38 @@ static __always_inline u32 lz77_count_common_bytes(const u64 diff)
     return count_trailing_zeros(diff) >> 3;
 }

-static __always_inline u32 lz77_match_len(const u8 *match, const u8 *cur, const u8 *end)
+static __always_inline u32 lz77_match_len(const u8 *wnd, const u8 *cur, const u8 *end)
 {
     const u8 *start = cur;
-    u32 step = sizeof(u64);
-    u64 diff;
+    const u32 step = sizeof(u64);
+    u64 diff = lz77_read32(cur) ^ lz77_read32(wnd);

-    while (likely(cur < end - (step - 1))) {
-        diff = lz77_read64(cur) ^ lz77_read64(match);
+    if (diff)
+        return 0;
+
+    cur += sizeof(u32);
+    wnd += sizeof(u32);
+
+    if (cur + sizeof(u64) >= end)
+        return (cur - start);
+
+    while (likely(cur + step < end)) {
+        diff = lz77_read64(cur) ^ lz77_read64(wnd);
         if (!diff) {
             cur += step;
-            match += step;
+            wnd += step;
             continue;
         }

         cur += lz77_count_common_bytes(diff);

-        return (u32)(cur - start);
-    }
-
-    step = sizeof(u32);
-
-    if (cur < end - (step - 1) && !(lz77_read32(cur) ^ lz77_read32(match))) {
-        cur += step;
-        match += step;
+        return (cur - start);
     }

-    step = sizeof(u16);
+    while (cur < end && *cur++ == *wnd++) { }

-    if (cur < end - (step - 1) && lz77_read16(cur) == lz77_read16(match)) {
-        cur += step;
-        match += step;
-    }
-
-    if (cur < end && *cur == *match)
-        cur++;
-
-    return (u32)(cur - start);
+    return (cur - start);
 }

 static __always_inline u8 *lz77_write_match(u8 *dst, u8 **nib, u32 dist, u32 len)
@@ -200,16 +192,15 @@ static __always_inline void lz77_copy(u8 *dst, const u8 *src, size_t count)

 noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen)
 {
-    const u8 *srcp, *end, *anchor;
+    const u8 *srcp, *end;
     u8 *dstp, *nib, *flag_pos;
     u32 flag_count = 0;
     u64 *htable;
     long flag = 0;
-    int ret;
+    int i;
     unsigned long s, e;

     srcp = src;
-    anchor = srcp;
     end = src + slen;
     dstp = dst;

@@ -223,52 +214,53 @@ noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen)

     s = jiffies;

-    /* Main loop. */
-    while (likely(srcp < end)) {
-        u32 offset, dist, len;
-        u64 hash;
-
-        while (likely(srcp + LZ77_MATCH_MIN_LEN < end)) {
-            offset = srcp - src;
-            hash = lz77_hash_bytes(srcp);
-            dist = offset - htable[hash];
-            if (dist >= LZ77_MATCH_MIN_DIST && dist < LZ77_MATCH_MAX_DIST)
-                len = lz77_match_len(src + htable[hash], srcp, end);
+    for (i = 0; i < LZ77_MATCH_MIN_LEN; i++) {
+        u64 hash = lz77_hash_bytes(srcp);

-            htable[hash] = offset;
-
-            if (len >= LZ77_MATCH_MIN_LEN)
-                break;
-
-            dist = 0;
-            len = 0;
-            srcp++;
-        }
-
-        while (likely(anchor < srcp)) {
-            u32 c = umin(srcp - anchor, 32 - flag_count);
-
-            lz77_copy(dstp, anchor, c);
-
-            dstp += c;
-            anchor += c;
+        htable[hash] = i;
+        *dstp++ = *srcp++;
+        flag_count++;
+    }

-            flag <<= c;
-            flag_count += c;
+    /* Main loop. */
+    do {
+        u64 hash = lz77_hash_bytes(srcp);
+        u32 offset = srcp - src;
+        u32 dist = offset - htable[hash];
+        u32 len = 0;
+
+        /*
+         * Perf note:
+         * It's faster to compare (src + htable[hash]) when htable[hash] == 0 in
+         * lz77_match_len() than to add a check here.
+         */
+        dist = offset - htable[hash];
+        if (dist >= LZ77_MATCH_MIN_DIST && dist < LZ77_MATCH_MAX_DIST)
+            len = lz77_match_len(src + htable[hash], srcp, end);
+
+        htable[hash] = offset;
+
+        if (len < LZ77_MATCH_MIN_LEN) {
+            /*
+             * Perf note:
+             * It's faster to simply copy the literal byte than to accumulate it and
+             * write them in 32-byte chunks.
+             */
+            lz77_write8(dstp++, *srcp++);
+            flag <<= 1;
+            flag_count++;
             if (flag_count == 32) {
                 lz77_write32(flag_pos, flag);
                 flag_count = 0;
                 flag_pos = dstp;
                 dstp += 4;
             }
-        }

-        if (unlikely(srcp + LZ77_MATCH_MIN_LEN >= end))
-            goto leftovers;
+            continue;
+        }

         dstp = lz77_write_match(dstp, &nib, dist, len);
         srcp += len;
-        anchor = srcp;

         flag = (flag << 1) | 1;
         flag_count++;
@@ -278,44 +270,48 @@ noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen)
             flag_pos = dstp;
             dstp += 4;
         }
-    }
-leftovers:
-    if (srcp < end) {
-        while (likely(srcp < end)) {
-            u32 c = umin(end - srcp, 32 - flag_count);
+    } while (likely(srcp + sizeof(u64) < end));

-            lz77_copy(dstp, srcp, c);
+    //pr_err("%s: leftovers %zu, flag count %u\n", __func__, end - srcp, flag_count);
+    while (srcp < end) {
+        u32 c;

-            dstp += c;
-            srcp += c;
+        if (unlikely((dstp - dst) + (end - srcp) > slen)) {
+            *dlen = slen;
+            goto out;
+        }

-            flag <<= c;
-            flag_count += c;
-            if (flag_count == 32) {
-                lz77_write32(flag_pos, flag);
-                flag_count = 0;
-                flag_pos = dstp;
-                dstp += 4;
-            }
+        c = umin(end - srcp, 32 - flag_count);
+
+        lz77_copy(dstp, srcp, c);
+
+        dstp += c;
+        srcp += c;
+
+        flag <<= c;
+        flag_count += c;
+        if (flag_count == 32) {
+            lz77_write32(flag_pos, flag);
+            flag_count = 0;
+            flag_pos = dstp;
+            dstp += 4;
         }
     }

+    kvfree(htable);
+
     flag <<= (32 - flag_count);
     flag |= (1 << (32 - flag_count)) - 1;
     lz77_write32(flag_pos, flag);

     *dlen = dstp - (u8 *)dst;
+out:
     e = jiffies;
     pr_err("%s: (fast) took %ums to compress (slen=%u, dlen=%u)\n", __func__, jiffies_to_msecs(e - s), slen, *dlen);

     if (*dlen < slen)
-        ret = 0;
-    else
-        ret = -EMSGSIZE;
-
-    kvfree(htable);
-
+        return 0;

-    return ret;
+    return -EMSGSIZE;
 }
diff --git a/fs/smb/client/compress/lz77.h b/fs/smb/client/compress/lz77.h
index 904023f0fb4d..7d616f746990 100644
--- a/fs/smb/client/compress/lz77.h
+++ b/fs/smb/client/compress/lz77.h
@@ -9,11 +9,12 @@
 #ifndef _SMB_COMPRESS_LZ77_H
 #define _SMB_COMPRESS_LZ77_H

+#include <asm/page_types.h>
 #include <linux/kernel.h>

 static __always_inline size_t lz77_compress_mem_required(size_t len)
 {
-    return len + (len >> 3);
+    return len + (len >> 3) + PAGE_SIZE;
 }

 int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen);
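The core of the speedup is the word-at-a-time scan in lz77_match_len(): XOR two 8-byte loads, and when they differ, use count_trailing_zeros() to find how many low-order bytes still agree. Below is a rough userspace sketch of that idea only, not the kernel code: load64(), common_bytes() and match_len() are illustrative stand-ins, memcpy() stands in for get_unaligned(), __builtin_ctzll() stands in for count_trailing_zeros(), and a little-endian host is assumed so that the lowest-addressed differing byte maps to the lowest set bits of the XOR.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t load64(const uint8_t *p)
{
	uint64_t v;

	memcpy(&v, p, sizeof(v)); /* unaligned-safe load */
	return v;
}

/* Number of equal low-order bytes in two 8-byte words (diff must be non-zero). */
static uint32_t common_bytes(uint64_t diff)
{
	return (uint32_t)__builtin_ctzll(diff) >> 3;
}

static uint32_t match_len(const uint8_t *wnd, const uint8_t *cur, const uint8_t *end)
{
	const uint8_t *start = cur;

	/* Word-at-a-time scan, similar in shape to the kernel loop. */
	while (cur + sizeof(uint64_t) < end) {
		uint64_t diff = load64(cur) ^ load64(wnd);

		if (!diff) {
			cur += sizeof(uint64_t);
			wnd += sizeof(uint64_t);
			continue;
		}

		return (uint32_t)(cur - start) + common_bytes(diff);
	}

	/* Byte-at-a-time tail near the end of the buffer. */
	while (cur < end && *cur == *wnd) {
		cur++;
		wnd++;
	}

	return (uint32_t)(cur - start);
}

int main(void)
{
	const uint8_t buf[] = "abcabcabcabcabcabcabcabcXYZ";

	/* Match the window at offset 0 against the data starting at offset 3. */
	printf("match length: %u\n", match_len(buf, buf + 3, buf + sizeof(buf) - 1));
	return 0;
}
```

Built with e.g. `cc -O2 match_len.c`, this should print `match length: 21` for the repeated "abc" pattern; the trailing-zeros trick is what lets the loop advance 8 bytes per iteration instead of comparing byte by byte.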