diff options
| -rw-r--r-- | fs/smb/client/compress.c | 233 |
1 files changed, 52 insertions, 181 deletions
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c index ba51888a3278..ca06696ae191 100644 --- a/fs/smb/client/compress.c +++ b/fs/smb/client/compress.c @@ -27,110 +27,56 @@ #define decompress_err(fmt, ...) cifs_dbg(VFS, "Decompression failure: " fmt, __VA_ARGS__) /* - * The heuristic_*() functions below try to determine data compressibility. - * - * Derived from fs/btrfs/compression.c, changing coding style, some parameters, and removing - * unused parts. - * - * Read that file for better and more detailed explanation of the calculations. + * The functions below try to determine data compressibility. * * The algorithms are ran in a collected sample of the input (uncompressed) data. - * The sample is formed of 2K reads in PAGE_SIZE intervals, with a maximum size of 4M. + * They are ran over 7/8 of that data. * * Parsing the sample goes from "low-hanging fruits" (fastest algorithms, likely compressible) - * to "need more analysis" (likely uncompressible). + * to "need more analysis" (tending to uncompressible). */ -struct bucket { - unsigned int count; -}; - -/** - * has_low_entropy() - Compute Shannon entropy of the sampled data. - * @bkt: Bytes counts of the sample. - * @slen: Size of the sample. - * - * Return: true if the level (percentage of number of bits that would be required to - * compress the data) is below the minimum threshold. - * - * Note: - * There _is_ an entropy level here that's > 65 (minimum threshold) that would indicate a - * possibility of compression, but compressing, or even further analysing, it would waste so much - * resources that it's simply not worth it. - * - * Also Shannon entropy is the last computed heuristic; if we got this far and ended up - * with uncertainty, just stay on the safe side and call it uncompressible. - */ -static bool has_low_entropy(struct bucket *bkt, size_t slen) +/* fixed-point log2(x) in Q8 */ +static __always_inline u32 log2_q8(u32 x) { - const size_t threshold = 65, max_entropy = 8 * ilog2(16); - size_t i, p, p2, len, sum = 0; - -#define pow4(n) (n * n * n * n) - len = ilog2(pow4(slen)); - - for (i = 0; i < 256 && bkt[i].count > 0; i++) { - p = bkt[i].count; - p2 = ilog2(pow4(p)); - sum += p * (len - p2); - } - - sum /= slen; + u32 l = ilog2(x); + u32 frac = x - (1U << l); - return ((sum * 100 / max_entropy) <= threshold); + return (l << 8) + ((frac << 8) >> l); } -#define BYTE_DIST_BAD 0 -#define BYTE_DIST_GOOD 1 -#define BYTE_DIST_MAYBE 2 /** - * calc_byte_distribution() - Compute byte distribution on the sampled data. - * @bkt: Byte counts of the sample. + * has_low_entropy() - Compute Shannon entropy of the sampled data. + * @counts: Bytes counts of the sample (expected to be sorted in descending order). * @slen: Size of the sample. * - * Return: - * BYTE_DIST_BAD: A "hard no" for compression -- a computed uniform distribution of - * the bytes (e.g. random or encrypted data). - * BYTE_DIST_GOOD: High probability (normal (Gaussian) distribution) of the data being - * compressible. - * BYTE_DIST_MAYBE: When computed byte distribution resulted in "low > n < high" - * grounds. has_low_entropy() should be used for a final decision. + * Return: true if the level (percentage of number of bits that would be required to + * compress the data) is below the minimum threshold. */ -static int calc_byte_distribution(struct bucket *bkt, size_t slen) +static bool has_low_entropy(u32 *counts, size_t slen) { - const size_t low = 64, high = 200, threshold = slen * 90 / 100; - size_t sum = 0; - int i; + const size_t threshold = 700 << 8, len = ilog2(slen) << 8; + size_t i, p, sum = 0; - for (i = 0; i < low; i++) - sum += bkt[i].count; - - if (sum > threshold) - return BYTE_DIST_BAD; - - for (; i < high && bkt[i].count > 0; i++) { - sum += bkt[i].count; - if (sum > threshold) - break; + for (i = 0; i < 256 && counts[i] > 0; i++) { + p = counts[i]; + sum += p * (len - log2_q8(p)); } - if (i <= low) - return BYTE_DIST_GOOD; - - if (i >= high) - return BYTE_DIST_BAD; - - return BYTE_DIST_MAYBE; + return ((sum * 100 / slen) <= threshold); } -static bool is_mostly_ascii(const struct bucket *bkt) +static bool is_mostly_ascii(const u32 *counts) { size_t count = 0; int i; - for (i = 0; i < 256; i++) - if (bkt[i].count > 0) - /* Too many non-ASCII (0-63) bytes. */ + for (i = 0; i < 64; i++) + if (counts[i] > 0) + count++; + + for (; i < 256; i++) + if (counts[i] > 0) if (++count > 64) return false; @@ -139,131 +85,52 @@ static bool is_mostly_ascii(const struct bucket *bkt) static bool has_repeated_data(const u8 *sample, size_t len) { - size_t s = len / 2; + len /= 2; - return (!memcmp(&sample[0], &sample[s], s)); + return (!memcmp(&sample[0], &sample[len], len)); } -static int cmp_bkt(const void *_a, const void *_b) +static int cmp_counts(const void *_a, const void *_b) { - const struct bucket *a = _a, *b = _b; + const u32 *a = _a, *b = _b; /* Reverse sort. */ - if (a->count > b->count) + if (a > b) return -1; return 1; } -static size_t collect_step(void *base, size_t progress, size_t len, void *priv_iov, void *priv_len) -{ - size_t plen, *slen = priv_len; - struct kvec *iov = priv_iov; - - if (progress >= iov->iov_len) - return len; - - plen = min_t(size_t, len, SZ_2K); - - memcpy(iov->iov_base + *slen, base, plen); - - *slen += plen; - - if (len < SZ_2K) - return len; - - return 0; -} - -/* - * Collect sample buffers from @iter. - * Return sample size, or 0 if too big (> @max_sample_len). - */ -static size_t collect_sample(const struct iov_iter *iter, u8 *sample, ssize_t max_sample_len) -{ - struct iov_iter it = *iter; - size_t ret, len = 0, max = iov_iter_count(iter); - struct kvec iov = { - .iov_base = sample, - .iov_len = max_sample_len, - }; - - ret = iterate_and_advance_kernel(&it, max, &iov, &len, collect_step); - if (ret > max || ret > max_sample_len) - return 0; - - return ret; -} - /** * is_compressible() - Determines if a chunk of data is compressible. - * @data: Iterator containing uncompressed data. + * @data: Buffer containing uncompressed data. * * Return: true if @data is compressible, false otherwise. * * Tests shows that this function is quite reliable in predicting data compressibility, * matching close to 1:1 with the behaviour of LZ77 compression success and failures. */ -static bool is_compressible(const struct iov_iter *data) +static bool is_compressible(const void *data, size_t len) { - const size_t read_size = SZ_2K, bkt_size = 256, max = SZ_2M; - struct bucket *bkt = NULL; - size_t i, len; - u8 *sample; - bool ret = false; - - /* Preventive double check -- already checked in should_compress(). */ - len = iov_iter_count(data); - if (unlikely(len < read_size)) - return ret; - - if (len - read_size > max) - len = max; - - len = round_down(len / 2, read_size); - sample = kvzalloc(len, GFP_KERNEL); - if (WARN_ON_ONCE(!sample)) - return ret; - - /* Sample 2K bytes per page of the uncompressed data. */ - len = collect_sample(data, sample, len); - if (len == 0) - goto out; - - ret = true; - if (has_repeated_data(sample, len)) - goto out; + const u8 *sample = data; + u32 i, counts[256] = {}; - bkt = kcalloc(bkt_size, sizeof(*bkt), GFP_KERNEL); - if (!bkt) { - WARN_ON_ONCE(1); - ret = false; + len = umin(len, SZ_2M); + len = round_down(len, SMB_COMPRESS_MIN_LEN); - goto out; - } + if (has_repeated_data(sample, len)) + return true; for (i = 0; i < len; i++) - bkt[sample[i]].count++; + counts[sample[i]]++; - if (is_mostly_ascii(bkt)) - goto out; + if (is_mostly_ascii(counts)) + return true; /* Sort in descending order */ - sort(bkt, bkt_size, sizeof(*bkt), cmp_bkt, NULL); - - i = calc_byte_distribution(bkt, len); - if (i != BYTE_DIST_MAYBE) { - ret = !!i; - - goto out; - } + sort(counts, 256, sizeof(*counts), cmp_counts, NULL); - ret = has_low_entropy(bkt, len); -out: - kvfree(sample); - kfree(bkt); - - return ret; + return has_low_entropy(counts, len); } /* @@ -501,10 +368,7 @@ bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq) if (shdr->Command == SMB2_WRITE) { const struct smb2_write_req *wreq = rq->rq_iov->iov_base; - if (le32_to_cpu(wreq->Length) < SMB_COMPRESS_MIN_LEN) - return false; - - return is_compressible(&rq->rq_iter); + return (le32_to_cpu(wreq->Length) >= SMB_COMPRESS_MIN_LEN); } return (shdr->Command == SMB2_READ); @@ -559,6 +423,13 @@ int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_s goto err_free; } + ret = is_compressible(src, slen); + if (ret <= 0) { + if (ret == 0) + ret = send_fn(server, 1, rq); + goto err_free; + } + dlen = lz77_calc_dlen(slen); /* Pattern_V1 payloads (fwd and bwd) are included in @dst */ |
