author    | Enzo Matsumiya <ematsumiya@suse.de> | 2024-07-29 14:16:40 -0300
committer | Enzo Matsumiya <ematsumiya@suse.de> | 2024-08-15 13:35:06 -0300
commit    | f88976e507284bd37819355e08965f44a148be35 (patch)
tree      | 5dbb1de858d9e70153fca08032786b2afa336cca
parent    | 6eb329ec298bc3884cfa93e0641266bf18f96f55 (diff)
smb: client: add heuristics to determine compressibility of data
Change should_compress() to take an smb_rqst instead of only the header
buffer.

Introduce an is_compressible() function, called from should_compress(),
whose only argument is &rqst->rq_iter containing the uncompressed data.

Also add inline versions of should_compress() and smb_compress() so
callers still compile when CONFIG_CIFS_COMPRESSION is disabled, with the
stubs returning false and -EOPNOTSUPP respectively.
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
-rw-r--r-- | fs/smb/client/compress.c | 295
-rw-r--r-- | fs/smb/client/compress.h | 41
2 files changed, 310 insertions, 26 deletions
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
index 4efbccbd40bf..dcb563a9dde6 100644
--- a/fs/smb/client/compress.c
+++ b/fs/smb/client/compress.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/uio.h>
+#include <linux/sort.h>
 
 #include "cifsglob.h"
 #include "../common/smb2pdu.h"
@@ -24,6 +25,300 @@
 #include "compress/lz77.h"
 #include "compress.h"
 
+/*
+ * The heuristic functions below estimate the compressibility of the sampled
+ * uncompressed data.
+ *
+ * Derived from fs/btrfs/compression.c, changing coding style, some parameters, and removing
+ * unused parts.
+ *
+ * Read that file for a better and more detailed explanation of the calculations.
+ *
+ * The algorithms are run on a collected sample of the input (uncompressed) data.
+ * The sample is formed of 2K reads in PAGE_SIZE intervals, with a maximum size of 4M.
+ *
+ * Parsing the sample goes from "low-hanging fruit" (fastest algorithms, likely compressible)
+ * to "needs more analysis" (likely uncompressible).
+ */
+
+struct bucket {
+	unsigned int count;
+};
+
+/**
+ * calc_shannon_entropy() - Compute Shannon entropy of the sampled data.
+ * @bucket: Byte counts of the sampled data.
+ * @sample_size: Size of the sample.
+ *
+ * Return: true if the level (percentage of number of bits that would be required to
+ *	   compress the data) is below the minimum threshold.
+ *
+ * Note:
+ * There _is_ an entropy level here that's between 65 (minimum threshold) and a higher
+ * level (e.g. 200) that would indicate a possibility of compression, but compressing,
+ * or even further analysing, it would waste so many resources that it's simply not
+ * worth it.
+ *
+ * Also, Shannon entropy is the last computed heuristic; if we got this far and ended up
+ * with uncertainty, just stay on the safe side and call it uncompressible.
+ */
+static bool calc_shannon_entropy(struct bucket *bucket, size_t sample_size)
+{
+	const size_t threshold = 65, max_entropy = 8 * ilog2(16);
+	size_t i, p, p2, len, sum = 0;
+
+#define pow4(n) ((n) * (n) * (n) * (n))
+	len = ilog2(pow4(sample_size));
+
+	for (i = 0; i < 256 && bucket[i].count > 0; i++) {
+		p = bucket[i].count;
+		p2 = ilog2(pow4(p));
+		sum += p * (len - p2);
+	}
+
+	sum /= sample_size;
+
+	return ((sum * 100 / max_entropy) <= threshold);
+}
+
+/**
+ * calc_byte_distribution() - Compute byte distribution on the sampled data.
+ * @bucket: Byte counts of the sampled data.
+ * @sample_size: Size of the sample.
+ *
+ * Return:
+ * 1: There's a high probability (normal (Gaussian) distribution) of the data being
+ *    compressible.
+ * 0: A "hard no" for compressibility -- either a computed uniform distribution of
+ *    the bytes (e.g. random or encrypted data), or calc_shannon_entropy() returned
+ *    false (see above).
+ * 2: The computed byte distribution resulted in a "low < n < high" middle ground.
+ *    calc_shannon_entropy() should be used for a final decision.
+ */
+static int calc_byte_distribution(struct bucket *bucket, size_t sample_size)
+{
+	const size_t low = 64, high = 200, threshold = sample_size * 90 / 100;
+	size_t sum = 0;
+	int i;
+
+	for (i = 0; i < low; i++)
+		sum += bucket[i].count;
+
+	if (sum > threshold)
+		return 1;
+
+	for (; i < high && bucket[i].count > 0; i++) {
+		sum += bucket[i].count;
+		if (sum > threshold)
+			break;
+	}
+
+	if (i <= low)
+		return 1;
+
+	if (i >= high)
+		return 0;
+
+	return 2;
+}
+
+static bool check_ascii_bytes(struct bucket *bucket)
+{
+	const size_t threshold = 64;
+	size_t count = 0;
+	int i;
+
+	for (i = 0; i < threshold; i++)
+		if (bucket[i].count > 0)
+			count++;
+
+	for (; i < 256; i++) {
+		if (bucket[i].count > 0) {
+			count++;
+			if (count > threshold)
+				break;
+		}
+	}
+
+	return (count < threshold);
+}
+
+static bool check_repeated_data(const u8 *sample, size_t sample_size)
+{
+	size_t s = sample_size / 2;
+
+	return (!memcmp(&sample[0], &sample[s], s));
+}
+
+static int cmp_bkt(const void *_a, const void *_b)
+{
+	const struct bucket *a = _a, *b = _b;
+
+	/* Reverse sort. */
+	if (a->count > b->count)
+		return -1;
+
+	return 1;
+}
+
+/*
+ * TODO:
+ * Support other iter types, if required.
+ * Only ITER_XARRAY is supported for now.
+ */
+static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
+{
+	struct folio *folios[16], *folio;
+	unsigned int nr, i, j, npages;
+	loff_t start = iter->xarray_start + iter->iov_offset;
+	pgoff_t last, index = start / PAGE_SIZE;
+	size_t len, off, foff;
+	void *p;
+	int s = 0;
+
+	last = (start + max - 1) / PAGE_SIZE;
+	do {
+		nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios),
+				XA_PRESENT);
+		if (nr == 0)
+			return -EIO;
+
+		for (i = 0; i < nr; i++) {
+			folio = folios[i];
+			npages = folio_nr_pages(folio);
+			foff = start - folio_pos(folio);
+			off = foff % PAGE_SIZE;
+
+			for (j = foff / PAGE_SIZE; j < npages; j++) {
+				size_t len2;
+
+				len = min_t(size_t, max, PAGE_SIZE - off);
+				len2 = min_t(size_t, len, SZ_2K);
+
+				p = kmap_local_page(folio_page(folio, j));
+				memcpy(&sample[s], p, len2);
+				kunmap_local(p);
+
+				s += len2;
+
+				if (len2 < SZ_2K || s >= max - SZ_2K)
+					return s;
+
+				max -= len;
+				if (max <= 0)
+					return s;
+
+				start += len;
+				off = 0;
+				index++;
+			}
+		}
+	} while (nr == ARRAY_SIZE(folios));
+
+	return s;
+}
+
+/**
+ * is_compressible() - Determines if a piece of data is compressible.
+ * @data: Iterator containing the uncompressed data.
+ *
+ * Return:
+ * 0: @data is not compressible
+ * 1: @data is compressible
+ * -ENOMEM: failed to allocate memory for sample buffer
+ *
+ * Tests show that this function is quite reliable in predicting data compressibility,
+ * matching 1:1 with the behaviour of LZ77 compression success and failures
+ * (output size >= input size).
+ *
+ * But we'll still err on the safe side because it runs in about 1/3 of the time
+ * LZ77 takes.
+ */
+static int is_compressible(const struct iov_iter *data)
+{
+	const unsigned int read_size = SZ_2K, bkt_size = 256, max = SZ_4M;
+	struct bucket *bkt = NULL;
+	int i = 0, ret = 0;
+	size_t len;
+	u8 *sample;
+
+	len = iov_iter_count(data);
+	if (len < read_size)
+		return 0;
+
+	if (len - read_size > max)
+		len = max;
+
+	sample = kvzalloc(len, GFP_KERNEL);
+	if (!sample)
+		return -ENOMEM;
+
+	prefetchw(sample);
+
+	/* Sample 2K bytes per page of the uncompressed data. */
+	ret = collect_sample(data, len, sample);
+	if (ret < 0)
+		goto out;
+
+	len = ret;
+	ret = 1;
+
+	if (check_repeated_data(sample, len))
+		goto out;
+
+	bkt = kcalloc(bkt_size, sizeof(*bkt), GFP_KERNEL);
+	if (!bkt) {
+		kvfree(sample);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < len; i++)
+		bkt[sample[i]].count++;
+
+	if (check_ascii_bytes(bkt))
+		goto out;
+
+	/* Sort in descending order. */
+	sort(bkt, bkt_size, sizeof(*bkt), cmp_bkt, NULL);
+
+	ret = calc_byte_distribution(bkt, len);
+	if (ret != 2)
+		goto out;
+
+	ret = calc_shannon_entropy(bkt, len);
+out:
+	kvfree(sample);
+	kfree(bkt);
+
+	WARN(ret < 0, "%s: ret=%d\n", __func__, ret);
+
+	return !!ret;
+}
+
+bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
+{
+	const struct smb2_hdr *shdr = rq->rq_iov->iov_base;
+
+	if (unlikely(!tcon || !tcon->ses || !tcon->ses->server))
+		return false;
+
+	if (!tcon->ses->server->compression.enabled)
+		return false;
+
+	if (!(tcon->share_flags & SMB2_SHAREFLAG_COMPRESS_DATA))
+		return false;
+
+	if (shdr->Command == SMB2_WRITE) {
+		const struct smb2_write_req *wreq = rq->rq_iov->iov_base;
+
+		if (wreq->Length < SMB_COMPRESS_MIN_LEN)
+			return false;
+
+		return is_compressible(&rq->rq_iter);
+	}
+
+	return (shdr->Command == SMB2_READ);
+}
+
 int smb_compress(void *buf, const void *data, size_t *len)
 {
 	struct smb2_compression_hdr *hdr;
diff --git a/fs/smb/client/compress.h b/fs/smb/client/compress.h
index c0dabe0a60d8..316bb43cb277 100644
--- a/fs/smb/client/compress.h
+++ b/fs/smb/client/compress.h
@@ -67,43 +67,32 @@ static __always_inline int smb_compress_alg_valid(__le16 alg, bool valid_none)
  * should_compress() - Determines if a request (write) or the response to a
  *		       request (read) should be compressed.
  * @tcon: tcon of the request is being sent to
- * @buf: buffer with an SMB2 READ/WRITE request
+ * @rq: request to evaluate
  *
  * Return: true iff:
  * - compression was successfully negotiated with server
  * - server has enabled compression for the share
  * - it's a read or write request
- * - if write, request length is >= SMB_COMPRESS_MIN_LEN
+ * - (write only) request length is >= SMB_COMPRESS_MIN_LEN
+ * - (write only) is_compressible() returns 1
  *
  * Return false otherwise.
  */
-static __always_inline bool should_compress(const struct cifs_tcon *tcon, const void *buf)
+bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq);
+#else /* !CONFIG_CIFS_COMPRESSION */
+static inline int smb_compress(void *unused1, const void *unused2, size_t *unused3)
 {
-	const struct smb2_hdr *shdr = buf;
-
-	if (!tcon || !tcon->ses || !tcon->ses->server)
-		return false;
-
-	if (!tcon->ses->server->compression.enabled)
-		return false;
-
-	if (!(tcon->share_flags & SMB2_SHAREFLAG_COMPRESS_DATA))
-		return false;
-
-	if (shdr->Command == SMB2_WRITE) {
-		const struct smb2_write_req *req = buf;
+	return -EOPNOTSUPP;
+}
 
-		return (req->Length >= SMB_COMPRESS_MIN_LEN);
-	}
+static inline int smb_compress_alg_valid(__le16 unused1, bool unused2)
+{
+	return -EOPNOTSUPP;
+}
 
-	return (shdr->Command == SMB2_READ);
+static inline bool should_compress(const void *unused1, const void *unused2)
+{
+	return false;
 }
 
-/*
- * #else !CONFIG_CIFS_COMPRESSION ...
- * These routines should not be called when CONFIG_CIFS_COMPRESSION disabled
- * #define smb_compress(arg1, arg2, arg3) (-EOPNOTSUPP)
- * #define smb_compress_alg_valid(arg1, arg2) (-EOPNOTSUPP)
- * #define should_compress(arg1, arg2) (false)
- */
 #endif /* !CONFIG_CIFS_COMPRESSION */
 #endif /* _SMB_COMPRESS_H */
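
The heuristics above are self-contained enough to experiment with outside the kernel. As a first step, here is a minimal userspace sketch (not part of the patch) of the sampling pass: 2K from the start of every 4K "page", capped at a maximum sample size. The kernel's collect_sample() walks ITER_XARRAY folios instead of a contiguous buffer, and collect_sample_flat() is a hypothetical name for this flat-buffer stand-in.

#include <stdio.h>
#include <string.h>

#define SAMPLE_CHUNK	2048	/* SZ_2K: bytes read per interval */
#define SAMPLE_PAGE	4096	/* PAGE_SIZE: interval between reads */

/*
 * Flat-buffer analogue of collect_sample(): copy the first 2K of every
 * 4K "page" of @data into @sample, up to @max_sample bytes.
 */
static size_t collect_sample_flat(const unsigned char *data, size_t len,
				  unsigned char *sample, size_t max_sample)
{
	size_t off, n, s = 0;

	for (off = 0; off < len && s < max_sample; off += SAMPLE_PAGE) {
		n = len - off < SAMPLE_CHUNK ? len - off : SAMPLE_CHUNK;
		if (n > max_sample - s)
			n = max_sample - s;
		memcpy(&sample[s], &data[off], n);
		s += n;
	}
	return s;
}

int main(void)
{
	unsigned char data[64 * 1024], sample[16 * 1024];
	size_t got;

	memset(data, 'x', sizeof(data));
	got = collect_sample_flat(data, sizeof(data), sample, sizeof(sample));

	/* 64K input, 2K per 4K page -> 32K candidate, capped here at 16K. */
	printf("sampled %zu bytes\n", got);
	return 0;
}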
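Next, the byte-distribution step. The sort() + calc_byte_distribution() pair implements the "byte core set" idea inherited from btrfs: after counting and sorting byte frequencies in descending order, ask how many distinct byte values are needed to cover 90% of the sample. A sketch, with byte_core_set_size() as a hypothetical name and the patch's low/high constants (64/200) as the decision thresholds:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Descending order, like the patch's cmp_bkt(). */
static int cmp_desc(const void *a, const void *b)
{
	size_t x = *(const size_t *)a, y = *(const size_t *)b;

	return (x > y) ? -1 : (x < y) ? 1 : 0;
}

/*
 * How many of the most frequent byte values cover 90% of the sample?
 * Per the patch's thresholds: < 64 suggests compressible, >= 200
 * suggests uncompressible, in between defers to the entropy check.
 */
static int byte_core_set_size(const unsigned char *data, size_t len)
{
	size_t bucket[256] = { 0 }, sum = 0, threshold = len * 90 / 100;
	size_t i;
	int n;

	for (i = 0; i < len; i++)
		bucket[data[i]]++;

	qsort(bucket, 256, sizeof(bucket[0]), cmp_desc);

	for (n = 0; n < 256; n++) {
		sum += bucket[n];
		if (sum > threshold)
			break;
	}
	return n + 1;
}

int main(void)
{
	static const char pat[] = "the quick brown fox jumps over the lazy dog ";
	unsigned char text[4096];
	size_t i;

	/* English-like text: few distinct symbols -> small core set. */
	for (i = 0; i < sizeof(text); i++)
		text[i] = pat[i % (sizeof(pat) - 1)];

	printf("core set size: %d (low=64, high=200)\n",
	       byte_core_set_size(text, sizeof(text)));
	return 0;
}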
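Finally, the entropy step. This sketch reproduces the fixed-point Shannon entropy approximation from calc_shannon_entropy(): ilog2(n^4) gives floor(4 * log2(n)), i.e. log2 in quarter-bit units, so the sum stays in integer arithmetic. ilog2() and entropy_level() here are userspace stand-ins, not the kernel helpers; a level at or below 65% is what the patch treats as compressible.

#include <stdio.h>
#include <string.h>

/* Userspace stand-in for the kernel's ilog2(): floor(log2(n)), n > 0. */
static unsigned int ilog2(unsigned long long n)
{
	unsigned int l = 0;

	while (n >>= 1)
		l++;
	return l;
}

/* Same precision trick as the patch; needs size^4 < 2^64, i.e. size < 64K here. */
#define pow4(n) ((n) * (n) * (n) * (n))

/*
 * Entropy "level" as a percentage of the maximum (8 bits/byte, or 32 in
 * quarter-bit units), mirroring calc_shannon_entropy().
 */
static unsigned int entropy_level(const unsigned char *data, size_t size)
{
	const unsigned int max_entropy = 8 * ilog2(16); /* 32 */
	unsigned long long bucket[256] = { 0 }, sum = 0, p;
	unsigned int len = ilog2(pow4((unsigned long long)size));
	size_t i;

	for (i = 0; i < size; i++)
		bucket[data[i]]++;

	for (i = 0; i < 256; i++) {
		if (!bucket[i])
			continue;
		p = bucket[i];
		sum += p * (len - ilog2(pow4(p)));
	}

	return (sum / size) * 100 / max_entropy;
}

int main(void)
{
	unsigned char flat[4096], mixed[4096];
	size_t i;

	memset(flat, 'a', sizeof(flat));	/* single symbol: level ~0% */
	for (i = 0; i < sizeof(mixed); i++)	/* all 256 symbols, uniform: ~100% */
		mixed[i] = i & 0xff;

	printf("constant data: level %u%%\n", entropy_level(flat, sizeof(flat)));
	printf("uniform data:  level %u%%\n", entropy_level(mixed, sizeof(mixed)));
	return 0;
}

With 4096 bytes of constant data this prints level 0%, and with a uniform spread over all 256 byte values it prints level 100%, matching the two extremes the kernel heuristic is designed to separate.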
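The compress.h hunk also shows the stub pattern the commit message mentions: when CONFIG_CIFS_COMPRESSION is off, callers compile against static inline versions that collapse to false/-EOPNOTSUPP instead of being hidden behind #ifdefs at every call site. Reduced to a standalone illustration (smb_compress_demo() is a hypothetical name, not the real API):

#include <stdio.h>
#include <errno.h>

/* #define CONFIG_CIFS_COMPRESSION 1 */

#ifdef CONFIG_CIFS_COMPRESSION
int smb_compress_demo(void);	/* real implementation elsewhere */
#else
/* Callers compile unchanged; the call collapses to an error value. */
static inline int smb_compress_demo(void)
{
	return -EOPNOTSUPP;
}
#endif

int main(void)
{
	if (smb_compress_demo() == -EOPNOTSUPP)
		printf("compression not built in\n");
	return 0;
}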