| author | Enzo Matsumiya <ematsumiya@suse.de> | 2024-08-13 20:22:50 -0300 |
|---|---|---|
| committer | Enzo Matsumiya <ematsumiya@suse.de> | 2024-08-15 13:35:06 -0300 |
| commit | 7167cd9466b4e8734c2f7510f37f8ea20c2dbd21 (patch) | |
| tree | 9e842b283f2fa2264f1ce474cae8ce0bc2f90eb2 | |
| parent | 50f11f6ac98b5270a078bedd2685d3941dc6135b (diff) | |
smb: client: compression WIP
works
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
| -rw-r--r-- | fs/smb/client/compress.c | 2 |
| -rw-r--r-- | fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.c | 458 |
| -rw-r--r-- | fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.h | 20 |
| -rw-r--r-- | fs/smb/client/compress/lz77.c | 275 |
| -rw-r--r-- | fs/smb/client/compress/lz77.h | 273 |
| -rw-r--r-- | fs/smb/client/transport.c | 13 |

6 files changed, 688 insertions(+), 353 deletions(-)
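For context when reading the encoder in the diff below: the stream it emits is the MS-XCA "Plain LZ77" format, a sequence of 32-bit little-endian flag groups (one bit per element, MSB first, 0 = literal byte, 1 = match token) interleaved with literal bytes and variable-length match tokens. The following is a minimal decoder sketch following the decompression pseudocode in MS-XCA; the function name, the userspace types, and the (deliberately incomplete) bounds checks are illustrative assumptions and are not part of this patch.

```c
#include <stdint.h>
#include <stddef.h>

static uint16_t get_le16(const uint8_t *p)
{
	return p[0] | ((uint16_t)p[1] << 8);
}

static uint32_t get_le32(const uint8_t *p)
{
	return p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* Returns the decompressed size, or -1 on malformed/oversized input. */
static long plain_lz77_decompress(const uint8_t *in, size_t in_len,
				  uint8_t *out, size_t out_len)
{
	size_t ip = 0, op = 0, nib_pos = 0;
	uint32_t flags = 0, flag_count = 0;

	for (;;) {
		if (flag_count == 0) {
			if (ip + 4 > in_len)
				break;
			flags = get_le32(in + ip);
			ip += 4;
			flag_count = 32;
		}
		flag_count--;

		if (!(flags & (1u << flag_count))) {
			/* Flag bit 0: copy one literal byte. */
			if (ip >= in_len || op >= out_len)
				return -1;
			out[op++] = in[ip++];
			continue;
		}

		/* Flag bit 1: a match token, or end of stream (padding bits). */
		if (ip >= in_len)
			break;
		if (ip + 2 > in_len)
			return -1;

		uint32_t tok = get_le16(in + ip);
		uint32_t dist = (tok >> 3) + 1;
		uint32_t len = tok & 7;

		ip += 2;
		if (len == 7) {
			/* Extended length: shared nibble, then byte/u16/u32. */
			if (nib_pos == 0) {
				len = in[ip] & 15;
				nib_pos = ip++;
			} else {
				len = in[nib_pos] >> 4;
				nib_pos = 0;
			}
			if (len == 15) {
				len = in[ip++];
				if (len == 255) {
					len = get_le16(in + ip);
					ip += 2;
					if (len == 0) {
						len = get_le32(in + ip);
						ip += 4;
					}
					if (len < 15 + 7)
						return -1;
					len -= 15 + 7;
				}
				len += 15;
			}
			len += 7;
		}
		len += 3;

		if (dist > op || op + len > out_len)
			return -1;
		while (len--) {
			out[op] = out[op - dist];
			op++;
		}
	}

	return (long)op;
}
```

The encoder's final step of padding the unused flag bits with ones (the `flags.val |= (1 << (32 - flags.count)) - 1` epilogue in lz77_compress() below) is what lets a decoder treat "match flag with no input left" as end of stream.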
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c index 645edde6e328..c313e9e6504c 100644 --- a/fs/smb/client/compress.c +++ b/fs/smb/client/compress.c @@ -322,7 +322,7 @@ bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq) int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn) { struct iov_iter iter; - size_t slen, dlen; + u32 slen, dlen; void *src, *dst; int ret; diff --git a/fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.c b/fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.c new file mode 100644 index 000000000000..1d949ee823dc --- /dev/null +++ b/fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024, SUSE LLC + * + * Authors: Enzo Matsumiya <ematsumiya@suse.de> + * + * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec. + */ +#include <linux/prefetch.h> +#include <linux/slab.h> +#include <linux/sizes.h> +#include <linux/count_zeros.h> + +#include "../compress.h" +#include "lz77.h" + +#ifdef CONFIG_CIFS_DEBUG2 +# define LZ77_STATS_ENABLED 1 +#else +# define LZ77_STATS_ENABLED 0 +#endif + +/* + * Compression parameters. + * + * - finding closer and smaller matches is faster, but also compresses worse, e.g. + * match min len = 4, max dist = 4K compared to match min len = 8, max dist = 8K: + * the former is ~10% faster, but yields a ~5% bigger output + * - we don't really care about a maximum match length -- it's simply much, much faster to compute + * a very long match length (e.g. >2M) and write only its encoded bytes than to write e.g. 500K + * of literals + */ +#define LZ77_MATCH_MIN_LEN 4 +#define LZ77_MATCH_MAX_LEN S32_MAX +#define LZ77_MATCH_MIN_DIST 1 +#define LZ77_MATCH_MAX_DIST SZ_2K +#define LZ77_HASH_LOG 15 +#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG) +#define LZ77_HASH_MASK (LZ77_HASH_SIZE - 1) + +struct lz77_flag { + u8 *pos; + u32 count; + long val; /* This must be signed. */ +}; + +#define lz77_read(nbits, _ptr) get_unaligned((const u ## nbits *)(_ptr)) +#define lz77_write(nbits, _ptr, _v) put_unaligned_le ## nbits((_v), (typeof((_v)) *)(_ptr)) + +static __always_inline u16 lz77_read16(const void *ptr) +{ + return lz77_read(16, ptr); +} + +static __always_inline u32 lz77_read32(const void *ptr) +{ + return lz77_read(32, ptr); +} + +static __always_inline u64 lz77_read64(const void *ptr) +{ + return lz77_read(64, ptr); +} + +static __always_inline void lz77_write8(void *ptr, u8 v) +{ + *(u8 *)ptr = v; +} + +static __always_inline void lz77_write16(void *ptr, u16 v) +{ + lz77_write(16, ptr, v); +} + +static __always_inline void lz77_write32(void *ptr, u32 v) +{ + lz77_write(32, ptr, v); +} + +static __always_inline u16 lz77_hash_bytes16(const u8 *ptr) +{ + /* A "hash function" -- simple, fast, and enough for our use-case. */ + return ((lz77_read32(ptr) * 2654435769LL) >> 16) & LZ77_HASH_MASK; +} + +static __always_inline u32 lz77_hash_bytes32(const u8 *ptr) +{ + /* A "hash function" -- simple, fast, and enough for our use-case. */ + return ((lz77_read32(ptr) * 2654435769LL) >> (32 - LZ77_HASH_LOG)) & LZ77_HASH_MASK; +} + +static __always_inline u64 lz77_hash_bytes64(const u8 *ptr) +{ + /* A "hash function" -- simple, fast, and enough for our use-case. 
*/ + return ((lz77_read64(ptr) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG); +} + +#define lz77_hash_bytes lz77_hash_bytes64 + +/* + * Count common bytes in @diff, where @diff is: + * (*(u64 *)bufA ^ *(u64)bufB) + */ +static __always_inline u32 lz77_count_common_bytes(const u64 diff) +{ + return count_trailing_zeros(diff) >> 3; +} + +static __always_inline u32 lz77_match_len(const u8 *match, const u8 *cur, const u8 *end) +{ + const u8 *start = cur; + u32 step = sizeof(u32); + u64 diff; + + if (cur > end - step) + return 0; + + if (lz77_read32(cur) ^ lz77_read32(match)) + return 0; + + cur += step; + match += step; + step = sizeof(u64); + + while (likely(cur < end - (step - 1))) { + diff = lz77_read64(cur) ^ lz77_read64(match); + if (!diff) { + cur += step; + match += step; + + continue; + } + + cur += lz77_count_common_bytes(diff); + + return (u32)(cur - start); + } + + step = sizeof(u32); + + if (cur < end - (step - 1) && !(lz77_read32(cur) ^ lz77_read32(match))) { + cur += step; + match += step; + } + + step = sizeof(u16); + + if (cur < end - (step - 1) && lz77_read16(cur) == lz77_read16(match)) { + cur += step; + match += step; + } + + if (cur < end && *cur == *match) + cur++; + + return (u32)(cur - start); +} + +static __always_inline u8 *lz77_write_match(u8 *dst, u8 **nib, u32 dist, u32 len) +{ + len -= 3; + dist--; + dist <<= 3; + + if (len < 7) { + lz77_write16(dst, dist + len); + + return dst + 2; + } + + dist |= 7; + lz77_write16(dst, dist); + dst += 2; + len -= 7; + + if (!*nib) { + lz77_write8(dst, umin(len, 15)); + *nib = dst; + dst++; + } else { + **nib |= umin(len, 15) << 4; + *nib = NULL; + } + + if (len < 15) + return dst; + + len -= 15; + if (len < 255) { + lz77_write8(dst, len); + + return dst + 1; + } + + lz77_write8(dst, 0xff); + dst++; + len += 7 + 15; + + if (len <= 0xffff) { + lz77_write16(dst, len); + + return dst + 2; + } + + lz77_write16(dst, 0); + dst += 2; + lz77_write32(dst, len); + + return dst + 4; +} + +static __always_inline void lz77_copy(u8 *dst, const u8 *src, size_t count) +{ + while (count > 8) { + *(u64 *)dst = *(u64 *)src; + count -= 8; + src += 8; + dst += 8; + } + + while (count > 4) { + *(u32 *)dst = *(u32 *)src; + count -= 4; + src += 4; + dst += 4; + } + + if (count) + memcpy(dst, src, count); +} + +/* + * Writing literals is the most expensive operation in the algorithm. Every and any perf + * improvements here are welcome. + * + * Note: "O3" will actually slow things down for some kinds of input, whereas "O2" improves in all + * (tested) scenarios. + */ +#pragma GCC push_options +#pragma GCC optimize ("O2") +static __maybe_unused __always_inline u8 *lz77_write_literals(u8 *dst, const u8 *src, u32 count, struct lz77_flag *flags) +{ + u32 c = umin(count, 32 - flags->count); + + /* + * Writes split in 2 parts -- fast path, where we don't need to check the flag count on + * every iteration, and the slow path, for fast path leftovers or when writing < 32 + * literals. 
+ */ + while (c == 32) { + lz77_copy(dst, src, c); + + dst += c; + src += c; + count -= c; + + flags->val <<= c; + lz77_write32(flags->pos, flags->val); + flags->count = 0; + flags->pos = dst; + dst += 4; + + c = umin(count, 32); + } + + while (count) { + lz77_copy(dst, src, c); + + dst += c; + src += c; + count -= c; + + flags->val <<= c; + flags->count += c; + if (flags->count == 32) { + lz77_write32(flags->pos, flags->val); + flags->count = 0; + flags->pos = dst; + dst += 4; + } + + c = umin(count, 32); + } + + return dst; +} +#pragma GCC pop_options + +noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen) +{ + const u8 *srcp, *end, *anchor; + //struct lz77_flag flags = { 0 }; + long f = 0; + unsigned long _s, _e; + u8 *dstp, *nib, *fp; + u64 *htable; + u32 fc = 0; + int ret; + + if (!src || !dst) + return -EINVAL; + + srcp = src; + anchor = srcp; + end = src + slen; + + dstp = dst; + nib = NULL; + + /* Output buffer start with a flag. */ + //flags.pos = dstp; + fp = dstp; + dstp += 4; + + htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL); + if (!htable) + return -ENOMEM; + + if (lz77_read32(srcp) ^ lz77_read32(srcp + LZ77_MATCH_MIN_LEN)) { + u32 hash; + + lz77_copy(dstp, srcp, LZ77_MATCH_MIN_LEN); + dstp += LZ77_MATCH_MIN_LEN; + fc += LZ77_MATCH_MIN_LEN; + +#define htable_add(_p, _i) \ +({ \ + hash = lz77_hash_bytes(_p); \ + htable[hash] = (_i); \ +}) + htable_add(srcp++, 0); + htable_add(srcp++, 1); + htable_add(srcp++, 2); + htable_add(srcp++, 3); + } + + _s = jiffies; +find_matches: + while (likely(srcp + 32 + LZ77_MATCH_MIN_LEN < end)) { + u32 offset, dist, len = 0; + u64 hash; + //while (likely(srcp + LZ77_MATCH_MIN_LEN < end)) { + //while (likely(len == 0) || fc < 32) { + for (; fc < 32 && len == 0; fc++, srcp++) { + offset = srcp - src; + + hash = lz77_hash_bytes(srcp); + dist = offset - htable[hash]; + + if (dist >= LZ77_MATCH_MIN_DIST && dist < LZ77_MATCH_MAX_DIST) + len = lz77_match_len(src + htable[hash], srcp, end); + + htable[hash] = offset; + + //if (len >= LZ77_MATCH_MIN_LEN) + // break; + + //dist = 0; + //len = 0; + //srcp++; + } + + if (len == 0) { + //WARN(fc > 32, "%s: flag count (%u) > 32 (unexpected) (src pos %zu)\n", __func__, fc, srcp - src); + + lz77_copy(dstp, anchor, 32); + + dstp += 32; + anchor = srcp; + + f <<= 32; + *fp = f; + fc -= 32; + fp = dstp; + dstp += 4; + + goto find_matches; + } + + //if (unlikely(srcp + LZ77_MATCH_MIN_LEN >= end)) { + // srcp = anchor; + // break; + //} + + //dstp = lz77_write_literals(dstp, anchor, srcp - anchor, &flags); + dstp = lz77_write_match(dstp, &nib, dist, len); + srcp += len; + anchor = srcp; + + f = (f << 1) | 1; + fc++; + if (fc == 32) { + *fp = f; + fc = 0; + fp = dstp; + dstp += 4; + } + +#if 0 + flags.val = (flags.val << 1) | 1; + flags.count++; + if (flags.count == 32) { + lz77_write32(flags.pos, flags.val); + flags.count = 0; + flags.pos = dstp; + dstp += 4; + } +#endif + + htable_add(srcp, srcp++ - src); + htable_add(srcp, srcp++ - src); + htable_add(srcp, srcp++ - src); + htable_add(srcp, srcp++ - src); + } + + //if (srcp < end) { + //dstp = lz77_write_literals(dstp, srcp, end - srcp, &flags); + while (srcp < end) { + u32 c = umin(end - srcp, 32 - fc); + + lz77_copy(dstp, srcp, c); + + dstp += c; + srcp += c; + + f <<= c; + fc += c; + if (fc == 32) { + *fp = f; + fc = 0; + fp = dstp; + dstp += 4; + } + } + + f <<= (32 - fc); + f |= (1 << (32 - fc)) - 1; + *fp = f; +#if 0 + flags.val <<= (32 - flags.count); + flags.val |= (1 << (32 - flags.count)) - 1; + lz77_write32(flags.pos, 
flags.val); +#endif + + _e = jiffies; + + *dlen = dstp - (u8 *)dst; + pr_err("%s: took %ums to compress (slen=%u, dlen=%u)\n", __func__, jiffies_to_msecs(_e - _s), slen, *dlen); + + if (*dlen < slen) + ret = 0; + else + ret = -EMSGSIZE; + + kvfree(htable); + + return ret; +} diff --git a/fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.h b/fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.h new file mode 100644 index 000000000000..904023f0fb4d --- /dev/null +++ b/fs/smb/client/compress/lz77-fastest-2024-08-13/lz77.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024, SUSE LLC + * + * Authors: Enzo Matsumiya <ematsumiya@suse.de> + * + * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec. + */ +#ifndef _SMB_COMPRESS_LZ77_H +#define _SMB_COMPRESS_LZ77_H + +#include <linux/kernel.h> + +static __always_inline size_t lz77_compress_mem_required(size_t len) +{ + return len + (len >> 3); +} + +int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen); +#endif /* _SMB_COMPRESS_LZ77_H */ diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c index 2b8d548f9492..2cbfcf3ed9bf 100644 --- a/fs/smb/client/compress/lz77.c +++ b/fs/smb/client/compress/lz77.c @@ -6,15 +6,142 @@ * * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec. */ +#include <linux/prefetch.h> #include <linux/slab.h> +#include <linux/sizes.h> +#include <linux/count_zeros.h> + +#include "../compress.h" #include "lz77.h" -static __always_inline u32 hash3(const u8 *ptr) +#ifdef CONFIG_CIFS_DEBUG2 +# define LZ77_STATS_ENABLED 1 +#else +# define LZ77_STATS_ENABLED 0 +#endif + +/* + * Compression parameters. + * + * - finding closer and smaller matches is faster, but also compresses worse, e.g. + * match min len = 4, max dist = 4K compared to match min len = 8, max dist = 8K: + * the former is ~10% faster, but yields a ~5% bigger output + * - we don't really care about a maximum match length -- it's simply much, much faster to compute + * a very long match length (e.g. >2M) and write only its encoded bytes than to write e.g. 500K + * of literals + */ +#define LZ77_MATCH_MIN_LEN 4 +#define LZ77_MATCH_MAX_LEN S32_MAX +#define LZ77_MATCH_MIN_DIST 1 +#define LZ77_MATCH_MAX_DIST SZ_2K +#define LZ77_HASH_LOG 15 +#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG) + +#define lz77_read(nbits, _ptr) get_unaligned((const u ## nbits *)(_ptr)) +#define lz77_write(nbits, _ptr, _v) put_unaligned_le ## nbits((_v), (typeof((_v)) *)(_ptr)) + +struct lz77_flag { + u8 *pos; + u32 count; + long val; +}; + +static __always_inline u16 lz77_read16(const void *ptr) +{ + return lz77_read(16, ptr); +} + +static __always_inline u32 lz77_read32(const void *ptr) { - return lz77_hash32(lz77_read32(ptr) & 0xffffff, LZ77_HASH_LOG); + return lz77_read(32, ptr); +} + +static __always_inline u64 lz77_read64(const void *ptr) +{ + return lz77_read(64, ptr); +} + +static __always_inline void lz77_write8(void *ptr, u8 v) +{ + *(u8 *)ptr = v; +} + +static __always_inline void lz77_write16(void *ptr, u16 v) +{ + lz77_write(16, ptr, v); +} + +static __always_inline void lz77_write32(void *ptr, u32 v) +{ + lz77_write(32, ptr, v); +} + +static __always_inline u64 lz77_hash_bytes(const u8 *ptr) +{ + /* A "hash function" -- simple, fast, and enough for our use-case. 
*/ + return ((lz77_read64(ptr) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG); +} + +/* + * Count common bytes in @diff, where @diff is: + * (*(u64 *)bufA ^ *(u64)bufB) + */ +static __always_inline u32 lz77_count_common_bytes(const u64 diff) +{ + return count_trailing_zeros(diff) >> 3; +} + +static __always_inline u32 lz77_match_len(const u8 *match, const u8 *cur, const u8 *end) +{ + const u8 *start = cur; + u32 step = sizeof(u32); + u64 diff; + + if (cur > end - step) + return 0; + + if (lz77_read32(cur) ^ lz77_read32(match)) + return 0; + + cur += step; + match += step; + step = sizeof(u64); + + while (likely(cur < end - (step - 1))) { + diff = lz77_read64(cur) ^ lz77_read64(match); + if (!diff) { + cur += step; + match += step; + + continue; + } + + cur += lz77_count_common_bytes(diff); + + return (u32)(cur - start); + } + + step = sizeof(u32); + + if (cur < end - (step - 1) && !(lz77_read32(cur) ^ lz77_read32(match))) { + cur += step; + match += step; + } + + step = sizeof(u16); + + if (cur < end - (step - 1) && lz77_read16(cur) == lz77_read16(match)) { + cur += step; + match += step; + } + + if (cur < end && *cur == *match) + cur++; + + return (u32)(cur - start); } -static u8 *write_match(u8 *dst, u8 **nib, u32 dist, u32 len) +static __always_inline u8 *lz77_write_match(u8 *dst, u8 **nib, u32 dist, u32 len) { len -= 3; dist--; @@ -22,6 +149,7 @@ static u8 *write_match(u8 *dst, u8 **nib, u32 dist, u32 len) if (len < 7) { lz77_write16(dst, dist + len); + return dst + 2; } @@ -31,11 +159,11 @@ static u8 *write_match(u8 *dst, u8 **nib, u32 dist, u32 len) len -= 7; if (!*nib) { + lz77_write8(dst, umin(len, 15)); *nib = dst; - lz77_write8(dst, min_t(unsigned int, len, 15)); dst++; } else { - **nib |= min_t(unsigned int, len, 15) << 4; + **nib |= umin(len, 15) << 4; *nib = NULL; } @@ -45,15 +173,17 @@ static u8 *write_match(u8 *dst, u8 **nib, u32 dist, u32 len) len -= 15; if (len < 255) { lz77_write8(dst, len); + return dst + 1; } lz77_write8(dst, 0xff); dst++; - len += 7 + 15; + if (len <= 0xffff) { lz77_write16(dst, len); + return dst + 2; } @@ -64,19 +194,34 @@ static u8 *write_match(u8 *dst, u8 **nib, u32 dist, u32 len) return dst + 4; } -static u8 *write_literals(u8 *dst, const u8 *dst_end, const u8 *src, size_t count, - struct lz77_flags *flags) +static __always_inline void lz77_copy(u8 *dst, const u8 *src, size_t count) { - const u8 *end = src + count; + while (count > 8) { + *(u64 *)dst = *(u64 *)src; + count -= 8; + src += 8; + dst += 8; + } - while (src < end) { - size_t c = lz77_min(count, 32 - flags->count); + while (count > 4) { + *(u32 *)dst = *(u32 *)src; + count -= 4; + src += 4; + dst += 4; + } - if (dst + c >= dst_end) - return ERR_PTR(-EFAULT); + if (count) + memcpy(dst, src, count); +} + +static u8 *lz77_write_literals(u8 *dst, const u8 *src, u32 count, struct lz77_flag *flags) +{ + const u8 *end = src + count; + + while (likely(src < end)) { + u32 c = umin(count, 32 - flags->count); - if (lz77_copy(dst, src, c)) - return ERR_PTR(-EFAULT); + lz77_copy(dst, src, c); dst += c; src += c; @@ -95,90 +240,58 @@ static u8 *write_literals(u8 *dst, const u8 *dst_end, const u8 *src, size_t coun return dst; } -static __always_inline bool is_valid_match(const u32 dist, const u32 len) -{ - return (dist >= LZ77_MATCH_MIN_DIST && dist < LZ77_MATCH_MAX_DIST) && - (len >= LZ77_MATCH_MIN_LEN && len < LZ77_MATCH_MAX_LEN); -} - -static __always_inline const u8 *find_match(u32 *htable, const u8 *base, const u8 *cur, - const u8 *end, u32 *best_len) -{ - const u8 *match; - u32 hash; - size_t 
offset; - - hash = hash3(cur); - offset = cur - base; - - if (htable[hash] >= offset) - return cur; - - match = base + htable[hash]; - *best_len = lz77_match(match, cur, end); - if (is_valid_match(cur - match, *best_len)) - return match; - - return cur; -} - -int lz77_compress(const u8 *src, size_t src_len, u8 *dst, size_t *dst_len) +noinline int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen) { - const u8 *srcp, *src_end, *anchor; - struct lz77_flags flags = { 0 }; - u8 *dstp, *dst_end, *nib; - u32 *htable; + const u8 *srcp, *end, *anchor; + struct lz77_flag flags = { 0 }; + u8 *dstp, *nib; + u64 *htable; int ret; srcp = src; anchor = srcp; - src_end = src + src_len; + end = src + slen; dstp = dst; - dst_end = dst + *dst_len; - flags.pos = dstp; nib = NULL; - memset(dstp, 0, *dst_len); + /* Output buffer start with a 4 byte flags. */ + flags.pos = dstp; dstp += 4; - htable = kvcalloc(LZ77_HASH_SIZE, sizeof(u32), GFP_KERNEL); + htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL); if (!htable) return -ENOMEM; - /* fill hashtable with invalid offsets */ - memset(htable, 0xff, LZ77_HASH_SIZE * sizeof(u32)); + /* Main loop. */ + while (likely(srcp < end)) { + u32 offset, dist, len; + u64 hash; - /* from here on, any error is because @dst_len reached >= @src_len */ - ret = -EMSGSIZE; + while (likely(srcp + LZ77_MATCH_MIN_LEN < end)) { + offset = srcp - src; + hash = lz77_hash_bytes(srcp); - /* main loop */ - while (srcp < src_end) { - u32 hash, dist, len; - const u8 *match; + dist = offset - htable[hash]; + if (dist >= LZ77_MATCH_MIN_DIST && dist < LZ77_MATCH_MAX_DIST) + len = lz77_match_len(src + htable[hash], srcp, end); - while (srcp + 3 < src_end) { - len = LZ77_MATCH_MIN_LEN - 1; - match = find_match(htable, src, srcp, src_end, &len); - hash = hash3(srcp); - htable[hash] = srcp - src; + htable[hash] = offset; - if (likely(match < srcp)) { - dist = srcp - match; + if (len >= LZ77_MATCH_MIN_LEN) break; - } + dist = 0; + len = 0; srcp++; } - dstp = write_literals(dstp, dst_end, anchor, srcp - anchor, &flags); - if (IS_ERR(dstp)) - goto err_free; + dstp = lz77_write_literals(dstp, anchor, srcp - anchor, &flags); - if (srcp + 3 >= src_end) + if (unlikely(srcp + LZ77_MATCH_MIN_LEN >= end)) goto leftovers; - dstp = write_match(dstp, &nib, dist, len); + dstp = lz77_write_match(dstp, &nib, dist, len); srcp += len; anchor = srcp; @@ -192,19 +305,19 @@ int lz77_compress(const u8 *src, size_t src_len, u8 *dst, size_t *dst_len) } } leftovers: - if (srcp < src_end) { - dstp = write_literals(dstp, dst_end, srcp, src_end - srcp, &flags); - if (IS_ERR(dstp)) - goto err_free; - } + if (srcp < end) + dstp = lz77_write_literals(dstp, srcp, end - srcp, &flags); flags.val <<= (32 - flags.count); flags.val |= (1 << (32 - flags.count)) - 1; lz77_write32(flags.pos, flags.val); - *dst_len = dstp - dst; - ret = 0; -err_free: + *dlen = dstp - (u8 *)dst; + if (*dlen < slen) + ret = 0; + else + ret = -EMSGSIZE; +//out: kvfree(htable); return ret; diff --git a/fs/smb/client/compress/lz77.h b/fs/smb/client/compress/lz77.h index 0cc9c6abf840..904023f0fb4d 100644 --- a/fs/smb/client/compress/lz77.h +++ b/fs/smb/client/compress/lz77.h @@ -4,282 +4,17 @@ * * Authors: Enzo Matsumiya <ematsumiya@suse.de> * - * Definitions and optmized helpers for LZ77 compression. + * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec. 
*/ #ifndef _SMB_COMPRESS_LZ77_H #define _SMB_COMPRESS_LZ77_H -#ifdef CONFIG_CIFS_COMPRESSION -#include <asm/ptrace.h> #include <linux/kernel.h> -#include <linux/string.h> -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -#include <asm-generic/unaligned.h> -#endif -#define LZ77_HASH_LOG 13 -#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG) -#define LZ77_HASH_MASK lz77_hash_mask(LZ77_HASH_LOG) - -/* We can increase this for better compression (but worse performance). */ -#define LZ77_MATCH_MIN_LEN 3 -/* From MS-XCA, but it's arbitrarily chosen. */ -#define LZ77_MATCH_MAX_LEN S32_MAX -/* - * Check this to ensure we don't match the current position, which would - * end up doing a verbatim copy of the input, and actually overflowing - * the output buffer because of the encoded metadata. - */ -#define LZ77_MATCH_MIN_DIST 1 -/* How far back in the buffer can we try to find a match (i.e. window size) */ -#define LZ77_MATCH_MAX_DIST 8192 - -#define LZ77_STEPSIZE_16 sizeof(u16) -#define LZ77_STEPSIZE_32 sizeof(u32) -#define LZ77_STEPSIZE_64 sizeof(u64) - -struct lz77_flags { - u8 *pos; - size_t count; - long val; -}; - -static __always_inline u32 lz77_hash_mask(const unsigned int log2) -{ - return ((1 << log2) - 1); -} - -static __always_inline u32 lz77_hash64(const u64 v, const unsigned int log2) -{ - const u64 prime5bytes = 889523592379ULL; - - return (u32)(((v << 24) * prime5bytes) >> (64 - log2)); -} - -static __always_inline u32 lz77_hash32(const u32 v, const unsigned int log2) -{ - return ((v * 2654435769LL) >> (32 - log2)) & lz77_hash_mask(log2); -} - -static __always_inline u32 lz77_log2(unsigned int x) -{ - return x ? ((u32)(31 - __builtin_clz(x))) : 0; -} - -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -static __always_inline u8 lz77_read8(const void *ptr) -{ - return *(u8 *)ptr; -} - -static __always_inline u16 lz77_read16(const void *ptr) -{ - return *(u16 *)ptr; -} - -static __always_inline u32 lz77_read32(const void *ptr) -{ - return *(u32 *)ptr; -} - -static __always_inline u64 lz77_read64(const void *ptr) -{ - return *(u64 *)ptr; -} - -static __always_inline void lz77_write8(void *ptr, const u8 v) -{ - *(u8 *)ptr = v; -} - -static __always_inline void lz77_write16(void *ptr, const u16 v) -{ - *(u16 *)ptr = v; -} - -static __always_inline void lz77_write32(void *ptr, const u32 v) +static __always_inline size_t lz77_compress_mem_required(size_t len) { - *(u32 *)ptr = v; -} - -static __always_inline void lz77_write64(void *ptr, const u64 v) -{ - *(u64 *)ptr = v; -} - -static __always_inline void lz77_write_ptr16(void *ptr, const void *vp) -{ - *(u16 *)ptr = *(const u16 *)vp; -} - -static __always_inline void lz77_write_ptr32(void *ptr, const void *vp) -{ - *(u32 *)ptr = *(const u32 *)vp; -} - -static __always_inline void lz77_write_ptr64(void *ptr, const void *vp) -{ - *(u64 *)ptr = *(const u64 *)vp; -} - -static __always_inline long lz77_copy(u8 *dst, const u8 *src, size_t count) -{ - return copy_from_kernel_nofault(dst, src, count); -} -#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ -static __always_inline u8 lz77_read8(const void *ptr) -{ - return get_unaligned((u8 *)ptr); -} - -static __always_inline u16 lz77_read16(const void *ptr) -{ - return lz77_read8(ptr) | (lz77_read8(ptr + 1) << 8); -} - -static __always_inline u32 lz77_read32(const void *ptr) -{ - return lz77_read16(ptr) | (lz77_read16(ptr + 2) << 16); -} - -static __always_inline u64 lz77_read64(const void *ptr) -{ - return lz77_read32(ptr) | ((u64)lz77_read32(ptr + 4) << 32); -} - -static __always_inline void 
lz77_write8(void *ptr, const u8 v) -{ - put_unaligned(v, (u8 *)ptr); -} - -static __always_inline void lz77_write16(void *ptr, const u16 v) -{ - lz77_write8(ptr, v & 0xff); - lz77_write8(ptr + 1, (v >> 8) & 0xff); -} - -static __always_inline void lz77_write32(void *ptr, const u32 v) -{ - lz77_write16(ptr, v & 0xffff); - lz77_write16(ptr + 2, (v >> 16) & 0xffff); -} - -static __always_inline void lz77_write64(void *ptr, const u64 v) -{ - lz77_write32(ptr, v & 0xffffffff); - lz77_write32(ptr + 4, (v >> 32) & 0xffffffff); -} - -static __always_inline void lz77_write_ptr16(void *ptr, const void *vp) -{ - const u16 v = lz77_read16(vp); - - lz77_write16(ptr, v); -} - -static __always_inline void lz77_write_ptr32(void *ptr, const void *vp) -{ - const u32 v = lz77_read32(vp); - - lz77_write32(ptr, v); -} - -static __always_inline void lz77_write_ptr64(void *ptr, const void *vp) -{ - const u64 v = lz77_read64(vp); - - lz77_write64(ptr, v); -} -static __always_inline long lz77_copy(u8 *dst, const u8 *src, size_t count) -{ - memcpy(dst, src, count); - return 0; -} -#endif /* !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ - -static __always_inline unsigned int __count_common_bytes(const unsigned long diff) -{ -#ifdef __has_builtin -# if __has_builtin(__builtin_ctzll) - return (unsigned int)__builtin_ctzll(diff) >> 3; -# endif -#else - /* count trailing zeroes */ - unsigned long bits = 0, i, z = 0; - - bits |= diff; - for (i = 0; i < 64; i++) { - if (bits[i]) - break; - z++; - } - - return (unsigned int)z >> 3; -#endif -} - -static __always_inline size_t lz77_match(const u8 *match, const u8 *cur, const u8 *end) -{ - const u8 *start = cur; - - if (cur == match) - return 0; - - if (likely(cur < end - (LZ77_STEPSIZE_64 - 1))) { - u64 const diff = lz77_read64(cur) ^ lz77_read64(match); - - if (!diff) { - cur += LZ77_STEPSIZE_64; - match += LZ77_STEPSIZE_64; - } else { - return __count_common_bytes(diff); - } - } - - while (likely(cur < end - (LZ77_STEPSIZE_64 - 1))) { - u64 const diff = lz77_read64(cur) ^ lz77_read64(match); - - if (!diff) { - cur += LZ77_STEPSIZE_64; - match += LZ77_STEPSIZE_64; - continue; - } - - cur += __count_common_bytes(diff); - return (size_t)(cur - start); - } - - if (cur < end - 3 && !(lz77_read32(cur) ^ lz77_read32(match))) { - cur += LZ77_STEPSIZE_32; - match += LZ77_STEPSIZE_32; - } - - if (cur < end - 1 && lz77_read16(cur) == lz77_read16(match)) { - cur += LZ77_STEPSIZE_16; - match += LZ77_STEPSIZE_16; - } - - if (cur < end && *cur == *match) - cur++; - - return (size_t)(cur - start); -} - -static __always_inline unsigned long lz77_max(unsigned long a, unsigned long b) -{ - int m = (a < b) - 1; - - return (a & m) | (b & ~m); -} - -static __always_inline unsigned long lz77_min(unsigned long a, unsigned long b) -{ - int m = (a > b) - 1; - - return (a & m) | (b & ~m); + return len + (len >> 3); } -int lz77_compress(const u8 *src, size_t src_len, u8 *dst, size_t *dst_len); -/* when CONFIG_CIFS_COMPRESSION not set lz77_compress() is not called */ -#endif /* !CONFIG_CIFS_COMPRESSION */ +int lz77_compress(const u8 *src, u32 slen, u8 *dst, u32 *dlen); #endif /* _SMB_COMPRESS_LZ77_H */ diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index fd5a85d43759..3d5416e7c07d 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -433,8 +433,17 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct kvec *iov; int rc; - if (flags & CIFS_COMPRESS_REQ) |
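The compression loop in both lz77.c variants is a single-slot hash-table match finder: hash the next 8 input bytes with a multiplicative hash into a 2^15-entry table, treat the stored offset as a candidate match within the 2 KiB window (LZ77_MATCH_MAX_DIST), verify and extend it by direct comparison, and unconditionally overwrite the slot with the current offset. Below is a simplified, self-contained sketch of that technique, not the patch's exact code; the names, the byte-at-a-time match extension, and the zero-initialised table convention are illustrative assumptions.

```c
#include <stdint.h>
#include <string.h>

#define HASH_LOG  15
#define HASH_SIZE (1u << HASH_LOG)	/* caller provides uint32_t table[HASH_SIZE], zeroed */

/*
 * Same multiplicative hash as the patch's lz77_hash_bytes(); it reads 8 bytes,
 * so the caller must guarantee at least 8 readable bytes at @p.
 */
static uint32_t hash8(const uint8_t *p)
{
	uint64_t v;

	memcpy(&v, p, sizeof(v));		/* unaligned-safe, native-endian read */
	return (uint32_t)(((v << 24) * 889523592379ULL) >> (64 - HASH_LOG));
}

/*
 * Look up the last position that hashed to the same slot, verify the match by
 * direct comparison, and record the current position.  Returns the match
 * length (0 if none usable) and stores the backward distance in @dist.
 */
static uint32_t find_match(const uint8_t *src, const uint8_t *cur,
			   const uint8_t *end, uint32_t *table, uint32_t *dist)
{
	uint32_t h = hash8(cur);
	uint32_t cand = table[h];
	uint32_t off = (uint32_t)(cur - src);
	uint32_t len = 0;

	if (cand < off && off - cand < 2048) {	/* inside LZ77_MATCH_MAX_DIST */
		const uint8_t *m = src + cand;

		*dist = off - cand;
		while (cur + len < end && m[len] == cur[len])
			len++;
	}

	table[h] = off;				/* always remember the newest position */

	return len >= 4 ? len : 0;		/* honour LZ77_MATCH_MIN_LEN */
}
```

Keeping only the most recent offset per hash slot (no chains) trades match quality for speed, which is consistent with the speed-over-ratio tradeoff described in the "Compression parameters" comment; the real code also extends matches eight bytes at a time via XOR plus count_trailing_zeros() rather than byte by byte.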