author    Enzo Matsumiya <ematsumiya@suse.de>  2024-08-15 12:24:36 -0300
committer Enzo Matsumiya <ematsumiya@suse.de>  2024-08-15 13:35:06 -0300
commit    463bdc6c942b64c7065256fdf8cc6e21021cbdb6
tree      5bcb47ccdbb5f040a7b930717f3bafbaa2ed886c
parent    2e8afe16eb831c227693c37a7da59215e5686134
smb: client: compress fastest (checkpoint 2024-08-15)  (smb-compression-upstream)
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
-rw-r--r--  fs/smb/client/compress.c       |   2
-rw-r--r--  fs/smb/client/compress/lz77.c  | 196
-rw-r--r--  fs/smb/client/compress/lz77.h  |   3
3 files changed, 99 insertions(+), 102 deletions(-)
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
index c313e9e6504c..770070c6263b 100644
--- a/fs/smb/client/compress.c
+++ b/fs/smb/client/compress.c
@@ -347,7 +347,7 @@ int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_s
goto err_free;
}
- dlen = round_up(slen, PAGE_SIZE);
+ dlen = lz77_compress_mem_required(round_up(slen, PAGE_SIZE));
dst = kvzalloc(dlen, GFP_KERNEL);
if (!dst) {
ret = -ENOMEM;
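
The larger allocation here matches the lz77_compress_mem_required() change further down: the compressor can emit slightly more than it consumes, since every group of 32 symbols is preceded by a 32-bit flag word. A rough worst-case sketch of that sizing, assuming no matches are found and every input byte is written as a literal (the helper name below is illustrative, not from the patch):

	/*
	 * Worst-case output estimate for an all-literal stream:
	 * one 32-bit flag word per 32 symbols adds about slen / 8
	 * on top of the literals themselves.
	 */
	static size_t worst_case_dlen(size_t slen)
	{
		size_t padded = round_up(slen, PAGE_SIZE);	/* as smb_compress() rounds up */

		/* same shape as lz77_compress_mem_required(): len + len/8 + one page of slack */
		return padded + (padded >> 3) + PAGE_SIZE;
	}
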
diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c
index 7d672200d55f..412be1684433 100644
--- a/fs/smb/client/compress/lz77.c
+++ b/fs/smb/client/compress/lz77.c
@@ -10,62 +10,60 @@
#include <linux/slab.h>
#include <linux/sizes.h>
#include <linux/count_zeros.h>
+#include <asm/unaligned.h>
-#include "../compress.h"
#include "lz77.h"
#ifdef CONFIG_CIFS_DEBUG2
-# define LZ77_STATS_ENABLED 1
-#else
-# define LZ77_STATS_ENABLED 0
+# define LZ77_STATS_ENABLED
#endif
/*
* Compression parameters.
*
* - finding closer and smaller matches is faster, but also compresses worse, e.g.
- * match min len = 4, max dist = 4K compared to match min len = 8, max dist = 8K:
+ * match min len = 3, max dist = 1K compared to match min len = 8, max dist = 8K:
* the former is ~10% faster, but yields a ~5% bigger output
* - we don't really care about a maximum match length -- it's simply much, much faster to compute
- * a very long match length (e.g. >2M) and write only its encoded bytes than to write e.g. 500K
+ * a very long match length (e.g. 2M) and write only its encoded bytes than to write e.g. 500K
* of literals
*/
-#define LZ77_MATCH_MIN_LEN 4
+#define LZ77_MATCH_MIN_LEN 3
#define LZ77_MATCH_MAX_LEN S32_MAX
#define LZ77_MATCH_MIN_DIST 1
-#define LZ77_MATCH_MAX_DIST SZ_2K
+#define LZ77_MATCH_MAX_DIST SZ_1K
#define LZ77_HASH_LOG 15
#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG)
#define lz77_read(nbits, _ptr) get_unaligned((const u ## nbits *)(_ptr))
#define lz77_write(nbits, _ptr, _v) put_unaligned_le ## nbits((_v), (typeof((_v)) *)(_ptr))
-static __always_inline u16 lz77_read16(const void *ptr)
+static __always_inline u16 lz77_read16(const u8 *ptr)
{
return lz77_read(16, ptr);
}
-static __always_inline u32 lz77_read32(const void *ptr)
+static __always_inline u32 lz77_read32(const u8 *ptr)
{
return lz77_read(32, ptr);
}
-static __always_inline u64 lz77_read64(const void *ptr)
+static __always_inline u64 lz77_read64(const u8 *ptr)
{
return lz77_read(64, ptr);
}
-static __always_inline void lz77_write8(void *ptr, u8 v)
+static __always_inline void lz77_write8(u8 *ptr, u8 v)
{
- *(u8 *)ptr = v;
+ put_unaligned(v, ptr);
}
-static __always_inline void lz77_write16(void *ptr, u16 v)
+static __always_inline void lz77_write16(u8 *ptr, u16 v)
{
lz77_write(16, ptr, v);
}
-static __always_inline void lz77_write32(void *ptr, u32 v)
+static __always_inline void lz77_write32(u8 *ptr, u32 v)
{
lz77_write(32, ptr, v);
}
@@ -73,7 +71,7 @@ static __always_inline void lz77_write32(void *ptr, u32 v)
static __always_inline u64 lz77_hash_bytes(const u8 *ptr)
{
/* A "hash function" -- simple, fast, and enough for our use-case. */
- return ((lz77_read64(ptr) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
+ return ((lz77_read64(ptr) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
}
/*
@@ -85,44 +83,38 @@ static __always_inline u32 lz77_count_common_bytes(const u64 diff)
return count_trailing_zeros(diff) >> 3;
}
-static __always_inline u32 lz77_match_len(const u8 *match, const u8 *cur, const u8 *end)
+static __always_inline u32 lz77_match_len(const u8 *wnd, const u8 *cur, const u8 *end)
{
const u8 *start = cur;
- u32 step = sizeof(u64);
- u64 diff;
+ const u32 step = sizeof(u64);
+ u64 diff = lz77_read32(cur) ^ lz77_read32(wnd);
- while (likely(cur < end - (step - 1))) {
- diff = lz77_read64(cur) ^ lz77_read64(match);
+ if (diff)
+ return 0;
+
+ cur += sizeof(u32);
+ wnd += sizeof(u32);
+
+ if (cur + sizeof(u64) >= end)
+ return (cur - start);
+
+ while (likely(cur + step < end)) {
+ diff = lz77_read64(cur) ^ lz77_read64(wnd);
if (!diff) {
cur += step;
- match += step;
+ wnd += step;
continue;
}
cur += lz77_count_common_bytes(diff);
- return (u32)(cur - start);
- }
-
- step = sizeof(u32);
-
- if (cur < end - (step - 1) && !(lz77_read32(cur) ^ lz77_read32(match))) {
- cur += step;
- match += step;
+ return (cur - start);
}
- step = sizeof(u16);
+ /* byte-at-a-time tail; stop at the first mismatch */
+ while (cur < end && *cur == *wnd) { cur++; wnd++; }
- if (cur < end - (step - 1) && lz77_read16(cur) == lz77_read16(match)) {
- cur += step;
- match += step;
- }
-
- if (cur < end && *cur == *match)
- cur++;
-
- return (u32)(cur - start);
+ return (cur - start);
}
static __always_inline u8 *lz77_write_match(u8 *dst, u8 **nib, u32 dist, u32 len)
@@ -200,16 +192,15 @@ static __always_inline void lz77_copy(u8 *dst, const u8 *src, size_t count)
noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen)
{
- const u8 *srcp, *end, *anchor;
+ const u8 *srcp, *end;
u8 *dstp, *nib, *flag_pos;
u32 flag_count = 0;
u64 *htable;
long flag = 0;
- int ret;
+ int i;
unsigned long s, e;
srcp = src;
- anchor = srcp;
end = src + slen;
dstp = dst;
@@ -223,52 +214,53 @@ noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen)
s = jiffies;
- /* Main loop. */
- while (likely(srcp < end)) {
- u32 offset, dist, len;
- u64 hash;
-
- while (likely(srcp + LZ77_MATCH_MIN_LEN < end)) {
- offset = srcp - src;
- hash = lz77_hash_bytes(srcp);
- dist = offset - htable[hash];
- if (dist >= LZ77_MATCH_MIN_DIST && dist < LZ77_MATCH_MAX_DIST)
- len = lz77_match_len(src + htable[hash], srcp, end);
+ for (i = 0; i < LZ77_MATCH_MIN_LEN; i++) {
+ u64 hash = lz77_hash_bytes(srcp);
- htable[hash] = offset;
-
- if (len >= LZ77_MATCH_MIN_LEN)
- break;
-
- dist = 0;
- len = 0;
- srcp++;
- }
-
- while (likely(anchor < srcp)) {
- u32 c = umin(srcp - anchor, 32 - flag_count);
-
- lz77_copy(dstp, anchor, c);
-
- dstp += c;
- anchor += c;
+ htable[hash] = i;
+ *dstp++ = *srcp++;
+ flag_count++;
+ }
- flag <<= c;
- flag_count += c;
+ /* Main loop. */
+ do {
+ u64 hash = lz77_hash_bytes(srcp);
+ u32 offset = srcp - src;
+ u32 dist = offset - htable[hash];
+ u32 len = 0;
+
+ /*
+ * Perf note:
+ * It's faster to compare (src + htable[hash]) when htable[hash] == 0 in
+ * lz77_match_len() than to add a check here.
+ */
+ if (dist >= LZ77_MATCH_MIN_DIST && dist < LZ77_MATCH_MAX_DIST)
+ len = lz77_match_len(src + htable[hash], srcp, end);
+
+ htable[hash] = offset;
+
+ if (len < LZ77_MATCH_MIN_LEN) {
+ /*
+ * Perf note:
+ * It's faster to simply copy the literal byte than to accumulate it and
+ * write them in 32-byte chunks.
+ */
+ lz77_write8(dstp++, *srcp++);
+ flag <<= 1;
+ flag_count++;
if (flag_count == 32) {
lz77_write32(flag_pos, flag);
flag_count = 0;
flag_pos = dstp;
dstp += 4;
}
- }
- if (unlikely(srcp + LZ77_MATCH_MIN_LEN >= end))
- goto leftovers;
+ continue;
+ }
dstp = lz77_write_match(dstp, &nib, dist, len);
srcp += len;
- anchor = srcp;
flag = (flag << 1) | 1;
flag_count++;
@@ -278,44 +270,48 @@ noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen)
flag_pos = dstp;
dstp += 4;
}
- }
-leftovers:
- if (srcp < end) {
- while (likely(srcp < end)) {
- u32 c = umin(end - srcp, 32 - flag_count);
+ } while (likely(srcp + sizeof(u64) < end));
- lz77_copy(dstp, srcp, c);
+ //pr_err("%s: leftovers %zu, flag count %u\n", __func__, end - srcp, flag_count);
+ while (srcp < end) {
+ u32 c;
- dstp += c;
- srcp += c;
+ if (unlikely((dstp - dst) + (end - srcp) > slen)) {
+ *dlen = slen;
+ goto out;
+ }
- flag <<= c;
- flag_count += c;
- if (flag_count == 32) {
- lz77_write32(flag_pos, flag);
- flag_count = 0;
- flag_pos = dstp;
- dstp += 4;
- }
+ c = umin(end - srcp, 32 - flag_count);
+
+ lz77_copy(dstp, srcp, c);
+
+ dstp += c;
+ srcp += c;
+
+ flag <<= c;
+ flag_count += c;
+ if (flag_count == 32) {
+ lz77_write32(flag_pos, flag);
+ flag_count = 0;
+ flag_pos = dstp;
+ dstp += 4;
}
}
flag <<= (32 - flag_count);
flag |= (1 << (32 - flag_count)) - 1;
lz77_write32(flag_pos, flag);
*dlen = dstp - (u8 *)dst;
+out:
+ kvfree(htable);
e = jiffies;
pr_err("%s: (fast) took %ums to compress (slen=%u, dlen=%u)\n", __func__, jiffies_to_msecs(e - s), slen, *dlen);
if (*dlen < slen)
- ret = 0;
- else
- ret = -EMSGSIZE;
-
- kvfree(htable);
-
+ return 0;
- return ret;
+ return -EMSGSIZE;
}
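
The core of the rewritten lz77_match_len() keeps the earlier trick: XOR eight bytes of the candidate window against eight bytes at the current position, and lz77_count_common_bytes() (count_trailing_zeros(diff) >> 3) converts the first differing bit into the number of leading bytes that still match, which holds for the little-endian loads done on the usual targets. A standalone sketch of that step, with __builtin_ctzll() standing in for the kernel's count_trailing_zeros() (a substitution for illustration, not part of the patch):

	#include <stdint.h>
	#include <string.h>

	/* Number of equal leading bytes in two 8-byte windows (little-endian). */
	static uint32_t common_prefix_bytes(const uint8_t *a, const uint8_t *b)
	{
		uint64_t x, y, diff;

		memcpy(&x, a, sizeof(x));	/* unaligned-safe loads */
		memcpy(&y, b, sizeof(y));
		diff = x ^ y;

		if (!diff)
			return 8;	/* all eight bytes match; the caller advances and retries */

		return __builtin_ctzll(diff) >> 3;	/* index of the first differing byte */
	}
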
diff --git a/fs/smb/client/compress/lz77.h b/fs/smb/client/compress/lz77.h
index 904023f0fb4d..7d616f746990 100644
--- a/fs/smb/client/compress/lz77.h
+++ b/fs/smb/client/compress/lz77.h
@@ -9,11 +9,12 @@
#ifndef _SMB_COMPRESS_LZ77_H
#define _SMB_COMPRESS_LZ77_H
+#include <asm/page_types.h>
#include <linux/kernel.h>
static __always_inline size_t lz77_compress_mem_required(size_t len)
{
- return len + (len >> 3);
+ return len + (len >> 3) + PAGE_SIZE;
}
int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen);
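
Combined with the smb_compress() hunk above, the calling convention appears to be: size the destination with lz77_compress_mem_required(), run lz77_compress(), and treat -EMSGSIZE as "the data did not get smaller, send it uncompressed". A hypothetical caller sketch under that assumption (helper name and surrounding error handling are mine, not from the patch; assumes <linux/slab.h> and this header are included):

	static int compress_buf_example(const u8 *src, u32 slen, u8 **out, u32 *out_len)
	{
		u32 dlen = lz77_compress_mem_required(round_up(slen, PAGE_SIZE));
		u8 *dst = kvzalloc(dlen, GFP_KERNEL);
		int ret;

		if (!dst)
			return -ENOMEM;

		ret = lz77_compress(src, slen, dst, &dlen);
		if (ret) {	/* -EMSGSIZE: output was not smaller than the input */
			kvfree(dst);
			return ret;
		}

		*out = dst;
		*out_len = dlen;
		return 0;
	}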