summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/smb/client/compress/lz77.c193
1 files changed, 132 insertions, 61 deletions
diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c
index 4a1d62c07921..2188ff2b7322 100644
--- a/fs/smb/client/compress/lz77.c
+++ b/fs/smb/client/compress/lz77.c
@@ -15,13 +15,33 @@
/*
* Compression parameters.
+ *
+ * LZ77_MATCH_MIN_LEN: Minimum match length (can be 3 - 8, default sizeof(u32)).
+ * Smaller: many small matches, better perf, worse compression ratio.
+ * Bigger: fewer longer matches, worse perf, better compression ratio.
+ * LZ77_MATCH_MAX_DIST: Offset from current position back to match position (can be 256 -
+ * 8K).
+ * Smaller: closer matches, better perf, worse compression ratio.
+ * Bigger: better matches, worse perf, better compression ratio.
+ * LZ77_HASH_LOG: ilog2 of the hash table size (recommended to be 13 - 16, default 15
+ * (hash size 32K)).
+ * LZ77_HASH_SIZE: Number of hash table entries (1 << LZ77_HASH_LOG).
+ * Smaller: less memory, faster access, more hash collisions.
+ * Bigger: more memory, slower access, fewer hash collisions.
+ * LZ77_RSTEP_SIZE: Number of bytes to read from input buffer to check initial match
+ * (default LZ77_MATCH_MIN_LEN).
+ * LZ77_MSTEP_SIZE: Number of bytes compared at a time when extending a found match
+ * (default sizeof(u64)).
+ *
+ * MATCH_MIN_LEN and RSTEP_SIZE are conveniently set to the same value so we can read, hash, and
+ * compare a single value, without having to mess with offsets.
*/
-#define LZ77_MATCH_MIN_LEN 4
-#define LZ77_MATCH_MIN_DIST 1
-#define LZ77_MATCH_MAX_DIST SZ_1K
+#define LZ77_MATCH_MIN_LEN sizeof(u32)
+#define LZ77_MATCH_MAX_DIST SZ_4K
#define LZ77_HASH_LOG 15
#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG)
-#define LZ77_STEP_SIZE sizeof(u64)
+#define LZ77_RSTEP_SIZE sizeof(u32)
+#define LZ77_MSTEP_SIZE sizeof(u64)
static __always_inline u8 lz77_read8(const u8 *ptr)
{
@@ -67,8 +87,8 @@ static __always_inline u32 lz77_match_len(const void *wnd, const void *cur, cons
const u64 diff = lz77_read64(cur) ^ lz77_read64(wnd);
if (!diff) {
- cur += LZ77_STEP_SIZE;
- wnd += LZ77_STEP_SIZE;
+ cur += LZ77_MSTEP_SIZE;
+ wnd += LZ77_MSTEP_SIZE;
continue;
}
@@ -77,8 +97,9 @@ static __always_inline u32 lz77_match_len(const void *wnd, const void *cur, cons
cur += count_trailing_zeros(diff) >> 3;
return (cur - start);
- } while (likely(cur + LZ77_STEP_SIZE < end));
+ } while (likely(cur + LZ77_MSTEP_SIZE < end));
+ /* fall back to byte-by-byte comparison for the last <8 bytes */
while (cur < end && lz77_read8(cur) == lz77_read8(wnd)) {
cur++;
wnd++;
@@ -141,6 +162,40 @@ static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u
return dst + 4;
}
+static __always_inline void *lz77_write_literals(const void *start, const void *end, void *dst,
+ long *f, u32 *fc, void **fp)
+{
+ u32 c, remaining;
+
+ /* lz77_compress() ensures that start < end when enter here */
+ remaining = end - start;
+ do {
+ c = umin(remaining, 32 - *fc);
+
+ memcpy(dst, start, c);
+
+ dst += c;
+ start += c;
+ remaining -= c;
+
+ *f <<= c;
+ *fc += c;
+ if (*fc == 32) {
+ lz77_write32(*fp, *f);
+ *fc = 0;
+ *fp = dst;
+ dst += 4;
+ }
+ } while (remaining);
+
+ return dst;
+}
+
+static __always_inline u32 lz77_hash(u32 v)
+{
+ return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG);
+}
+
/*
* Compute compressed (dst) buffer length based on uncompressed (src) length.
*
@@ -163,14 +218,20 @@ u32 lz77_calc_dlen(u32 slen)
noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen)
{
- const void *srcp, *end;
+ const void *srcp, *limit, *end, *anchor;
void *dstp, *nib, *flag_pos;
u32 flag_count = 0;
long flag = 0;
- u64 *htable;
+ u32 *htable;
srcp = src;
- end = src + slen;
+ anchor = srcp;
+ end = srcp + slen;
+ /*
+ * We read 4 bytes from srcp[0] and srcp[1] at each loop iteration, and if a match is found,
+ * lz77_match_len() will do 8 byte reads (on a do/while loop).
+ */
+ limit = end - LZ77_RSTEP_SIZE - 1 - LZ77_MSTEP_SIZE;
dstp = dst;
nib = NULL;
flag_pos = dstp;
@@ -180,45 +241,71 @@ noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen)
if (!htable)
return -ENOMEM;
+ prefetch(srcp + 8);
+
+ /* adjust @srcp so we don't get a false positive match on first iteration */
+ htable[lz77_hash(lz77_read32(srcp++))] = 0;
+
/*
* Main loop.
*
- * Assuming @dlen was computed with lz77_calc_dlen(), we can run without bound checking
- * @dst.
+ * Assumes @dlen was computed with lz77_calc_dlen(), so run without bound-checking @dst.
+ *
+ * This code was crafted in a way to best utilise fetch-decode-execute CPU flow.
+ * Any attempt to optimize it, or even organize it, can lead to huge performance loss.
+ *
+ * Hot path:
+ * - input reads (@srcp and @srcp + 1), streamed and possibly prefetched -- very fast
+ * Note: it's much cheaper to preprocess @srcp + 1 and ignore it when @srcp has a match,
+ * than to conditionally process it (i.e. only if @srcp doesn't have a match)
+ * - compute hashes
+ * - get offsets from @htable
+ * - update @htable offsets -- this is important as htable[h0/h1] is still in cache, so
+ * doing it only later, e.g. if no match was found, would require fetching it from memory
+ * again
+ * - compute distance _before_ checking if values match -- this is also important because a
+ * too far window is much more common than a mismatch, and also comparing values requires
+ * reading from window buffer (more expensive)
+ * - don't zero-check w0/w1 -- (important) it adds a non-negligible delay in processing,
+ * such that it's faster to just read and compare @src + 0
+ *
+ * Whenever a match is found, we break out of the hotpath to begin writing to @dstp.
*/
do {
- u32 dist, len = 0;
- const void *wnd;
- u64 hash;
-
- hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
- wnd = src + htable[hash];
- htable[hash] = srcp - src;
- dist = srcp - wnd;
-
- if (dist && dist < LZ77_MATCH_MAX_DIST)
- len = lz77_match_len(wnd, srcp, end);
-
- if (len < LZ77_MATCH_MIN_LEN) {
- lz77_write8(dstp, lz77_read8(srcp));
-
- dstp++;
- srcp++;
-
- flag <<= 1;
- flag_count++;
- if (flag_count == 32) {
- lz77_write32(flag_pos, flag);
- flag_count = 0;
- flag_pos = dstp;
- dstp += 4;
- }
-
- continue;
- }
-
- dstp = lz77_write_match(dstp, &nib, dist, len);
+ const u32 p0 = srcp - src;
+ const u32 p1 = p0 + 1;
+ const u32 v0 = lz77_read32(srcp);
+ const u32 v1 = lz77_read32(srcp + 1);
+ const u32 h0 = lz77_hash(v0);
+ const u32 h1 = lz77_hash(v1);
+ const u32 w0 = htable[h0];
+ const u32 w1 = htable[h1];
+ const void *wnd = src + w0;
+ u32 len;
+
+ htable[h0] = p0;
+ htable[h1] = p1;
+
+ if (unlikely(p0 - w0 < LZ77_MATCH_MAX_DIST && lz77_read32(wnd) == v0))
+ goto _match;
+
+ srcp++;
+ wnd = src + w1;
+ if (unlikely(p1 - w1 < LZ77_MATCH_MAX_DIST && lz77_read32(wnd) == v1))
+ goto _match;
+
+ srcp++;
+ continue;
+_match:
+ prefetchw(dstp);
+ dstp = lz77_write_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos);
+ wnd += LZ77_RSTEP_SIZE;
+ srcp += LZ77_RSTEP_SIZE;
+ len = lz77_match_len(wnd, srcp, end);
+ dstp = lz77_write_match(dstp, &nib, srcp - wnd, len + LZ77_RSTEP_SIZE);
srcp += len;
+ anchor = srcp;
+ prefetch(srcp);
flag = (flag << 1) | 1;
flag_count++;
@@ -228,25 +315,9 @@ noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen)
flag_pos = dstp;
dstp += 4;
}
- } while (likely(srcp + LZ77_STEP_SIZE < end));
-
- while (srcp < end) {
- u32 c = umin(end - srcp, 32 - flag_count);
+ } while (srcp < limit);
- memcpy(dstp, srcp, c);
-
- dstp += c;
- srcp += c;
-
- flag <<= c;
- flag_count += c;
- if (flag_count == 32) {
- lz77_write32(flag_pos, flag);
- flag_count = 0;
- flag_pos = dstp;
- dstp += 4;
- }
- }
+ dstp = lz77_write_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos);
flag <<= (32 - flag_count);
flag |= (1UL << (32 - flag_count)) - 1;