From 688eb8191b475db5acfd48634600b04fd3dda9ad Mon Sep 17 00:00:00 2001
From: Noah Goldstein
Date: Wed, 10 May 2023 20:10:02 -0500
Subject: x86/csum: Improve performance of `csum_partial`

1) Add a special case for len == 40, as that is the hottest value. This
   nets a ~8-9% latency improvement and a ~30% throughput improvement in
   the len == 40 case.

2) Use multiple accumulators in the 64-byte loop. This dramatically
   improves ILP and results in up to a 40% latency/throughput
   improvement (better for more iterations).

Results from benchmarking on Icelake. Times measured with rdtsc():

     len   lat_new   lat_old      r   tput_new  tput_old      r
       8      3.58      3.47  1.032       3.58      3.51  1.021
      16      4.14      4.02  1.028       3.96      3.78  1.046
      24      4.99      5.03  0.992       4.23      4.03  1.050
      32      5.09      5.08  1.001       4.68      4.47  1.048
      40      5.57      6.08  0.916       3.05      4.43  0.690
      48      6.65      6.63  1.003       4.97      4.69  1.059
      56      7.74      7.72  1.003       5.22      4.95  1.055
      64      6.65      7.22  0.921       6.38      6.42  0.994
      96      9.43      9.96  0.946       7.46      7.54  0.990
     128      9.39     12.15  0.773       8.90      8.79  1.012
     200     12.65     18.08  0.699      11.63     11.60  1.002
     272     15.82     23.37  0.677      14.43     14.35  1.005
     440     24.12     36.43  0.662      21.57     22.69  0.951
     952     46.20     74.01  0.624      42.98     53.12  0.809
    1024     47.12     78.24  0.602      46.36     58.83  0.788
    1552     72.01    117.30  0.614      71.92     96.78  0.743
    2048     93.07    153.25  0.607      93.28    137.20  0.680
    2600    114.73    194.30  0.590     114.28    179.32  0.637
    3608    156.34    268.41  0.582     154.97    254.02  0.610
    4096    175.01    304.03  0.576     175.89    292.08  0.602

There is no such thing as a free lunch, however, and the special case
for len == 40 does add overhead to the len != 40 cases. This amounts to
roughly 5% in throughput and slightly less in latency.

Testing:

Part of this change is a new kunit test. The tests check all
alignment x length pairs in [0, 64) x [0, 512). There are three cases:

1) Precomputed random inputs/seed. The expected results were generated
   using the generic implementation (which is assumed to be non-buggy).

2) An input of all 1s. The goal of this test is to catch any case where
   a carry is missing.

3) An input that never carries. The goal of this test is to catch any
   case of incorrectly carrying.

More exhaustive tests that cover all alignment x length pairs in
[0, 8192) x [0, 8192] on random data are also available here:
https://github.com/goldsteinn/csum-reproduction

The repository also has the code for reproducing the above benchmark
numbers.

Signed-off-by: Noah Goldstein
Signed-off-by: Dave Hansen
Link: https://lore.kernel.org/all/20230511011002.935690-1-goldstein.w.n%40gmail.com
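For illustration, here is a minimal, portable C model of the two-accumulator
scheme described in (2). It is not the kernel's inline-assembly
implementation; the function and variable names are made up, and it only
shows why splitting each 64-byte block across independent accumulation
chains (with the carries counted separately) exposes more instruction-level
parallelism than one long add-with-carry chain:

    #include <stdint.h>
    #include <string.h>

    /* Model only: sum 64-byte blocks with two independent accumulators. */
    static uint64_t csum64_blocks_model(const unsigned char *p, size_t nblocks,
                                        uint64_t acc0)
    {
        uint64_t acc1 = 0, carries = 0;

        while (nblocks--) {
            uint64_t w[8];

            memcpy(w, p, sizeof(w));            /* one 64-byte block */

            /* First dependency chain: words 0-3 into acc0. */
            for (int i = 0; i < 4; i++) {
                acc0 += w[i];
                carries += (acc0 < w[i]);       /* record the carry-out */
            }
            /* Second, independent chain: words 4-7 into acc1. */
            for (int i = 4; i < 8; i++) {
                acc1 += w[i];
                carries += (acc1 < w[i]);
            }
            p += 64;
        }

        /* Fold the second accumulator and the collected carries back in. */
        acc0 += acc1;
        carries += (acc0 < acc1);
        acc0 += carries;
        if (acc0 < carries)
            acc0++;                             /* end-around carry */
        return acc0;    /* caller folds this down to a 16-bit checksum */
    }

The patch below does the same thing with addq/adcq chains plus an adcl
carry counter, and folds everything together after the loop.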
---
 arch/x86/lib/csum-partial_64.c | 97 ++++++++++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 32 deletions(-)

(limited to 'arch/x86/lib')

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 50734a23034c..fe5861951b15 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -5,22 +5,32 @@
  * This file contains network checksum routines that are better done
  * in an architecture-specific manner due to speed.
  */
-
+
 #include
 #include
 #include
 #include
 
-static inline unsigned short from32to16(unsigned a)
+static inline unsigned short from32to16(unsigned a)
 {
-	unsigned short b = a >> 16;
+	unsigned short b = a >> 16;
 	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n"
+	    "adcw $0,%w0\n"
 	    : "=r" (b)
 	    : "0" (b), "r" (a));
 	return b;
 }
 
+static inline __wsum csum_tail(unsigned int result, u64 temp64, int odd)
+{
+	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+	if (unlikely(odd)) {
+		result = from32to16(result);
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+	}
+	return (__force __wsum)result;
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -47,21 +57,52 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		buff++;
 	}
 
-	while (unlikely(len >= 64)) {
+	/*
+	 * len == 40 is the hot case due to IPv6 headers, but annotating it likely()
+	 * has noticeable negative affect on codegen for all other cases with
+	 * minimal performance benefit here.
+	 */
+	if (len == 40) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq 1*8(%[src]),%[res]\n\t"
 		    "adcq 2*8(%[src]),%[res]\n\t"
 		    "adcq 3*8(%[src]),%[res]\n\t"
 		    "adcq 4*8(%[src]),%[res]\n\t"
-		    "adcq 5*8(%[src]),%[res]\n\t"
-		    "adcq 6*8(%[src]),%[res]\n\t"
-		    "adcq 7*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
-		buff += 64;
-		len -= 64;
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
+		return csum_tail(result, temp64, odd);
+	}
+	if (unlikely(len >= 64)) {
+		/*
+		 * Extra accumulators for better ILP in the loop.
+		 */
+		u64 tmp_accum, tmp_carries;
+
+		asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
+		    "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
+		    "subl $64, %[len]\n\t"
+		    "1:\n\t"
+		    "addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcl $0,%k[tmp_carries]\n\t"
+		    "addq 4*8(%[src]),%[tmp_accum]\n\t"
+		    "adcq 5*8(%[src]),%[tmp_accum]\n\t"
+		    "adcq 6*8(%[src]),%[tmp_accum]\n\t"
+		    "adcq 7*8(%[src]),%[tmp_accum]\n\t"
+		    "adcl $0,%k[tmp_carries]\n\t"
+		    "addq $64, %[src]\n\t"
+		    "subl $64, %[len]\n\t"
+		    "jge 1b\n\t"
+		    "addq %[tmp_accum],%[res]\n\t"
+		    "adcq %[tmp_carries],%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [tmp_accum] "=&r"(tmp_accum),
+		      [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
+		      [len] "+r"(len), [src] "+r"(buff)
+		    : "m"(*(const char *)buff));
 	}
 
 	if (len & 32) {
@@ -70,45 +111,37 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		    "adcq 2*8(%[src]),%[res]\n\t"
 		    "adcq 3*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[32])buff));
 		buff += 32;
 	}
 	if (len & 16) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq 1*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[16])buff));
 		buff += 16;
 	}
 	if (len & 8) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[8])buff));
 		buff += 8;
 	}
 	if (len & 7) {
-		unsigned int shift = (8 - (len & 7)) * 8;
+		unsigned int shift = (-len << 3) & 63;
 		unsigned long trail;
 
 		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
 
 		asm("addq %[trail],%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [trail] "r" (trail));
+		    : [res] "+r"(temp64)
+		    : [trail] "r"(trail));
 	}
-	result = add32_with_carry(temp64 >> 32,
-				  temp64 & 0xffffffff);
-	if (unlikely(odd)) {
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return (__force __wsum)result;
+	return csum_tail(result, temp64, odd);
 }
 EXPORT_SYMBOL(csum_partial);
@@ -118,6 +151,6 @@ EXPORT_SYMBOL(csum_partial);
  */
 __sum16 ip_compute_csum(const void *buff, int len)
 {
-	return csum_fold(csum_partial(buff,len,0));
+	return csum_fold(csum_partial(buff, len, 0));
 }
 EXPORT_SYMBOL(ip_compute_csum);
--
cgit v1.2.3
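One detail of the diff above that is easy to misread is the trailing-bytes
shift, which changed from "(8 - (len & 7)) * 8" to "(-len << 3) & 63". The two
expressions are equivalent for every length that reaches that branch
(len & 7 != 0). A quick stand-alone check (illustrative user-space code, not
part of the patch; it uses an explicitly unsigned negation to stay portable):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        for (int len = 1; len < 4096; len++) {
            if (!(len & 7))
                continue;       /* the branch is only taken for len & 7 != 0 */

            unsigned int old_shift = (8 - (len & 7)) * 8;
            unsigned int new_shift = ((0u - (unsigned int)len) << 3) & 63;

            assert(old_shift == new_shift);
        }
        puts("shift expressions agree for all len with len & 7 != 0");
        return 0;
    }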
From 2fe1e67e6987b6f05329740da79c8150a2205b0d Mon Sep 17 00:00:00 2001
From: Nathan Chancellor
Date: Fri, 26 May 2023 08:47:40 -0700
Subject: x86/csum: Fix clang -Wuninitialized in csum_partial()

Clang warns:

  arch/x86/lib/csum-partial_64.c:74:20: error: variable 'result' is uninitialized when used here [-Werror,-Wuninitialized]
                  return csum_tail(result, temp64, odd);
                                   ^~~~~~
  arch/x86/lib/csum-partial_64.c:48:22: note: initialize the variable 'result' to silence this warning
          unsigned odd, result;
                              ^
                               = 0
  1 error generated.

The only initialization and uses of result in csum_partial() were moved
into csum_tail(), but result is still being passed by value to
csum_tail(). Clang's -Wuninitialized does not do interprocedural
analysis, so it cannot see that result is always assigned in csum_tail()
before it is used.

Sink the declaration of result into csum_tail() to clear up the warning.

Closes: https://lore.kernel.org/202305262039.3HUYjWJk-lkp@intel.com/
Fixes: 688eb8191b47 ("x86/csum: Improve performance of `csum_partial`")
Reported-by: kernel test robot
Signed-off-by: Nathan Chancellor
Signed-off-by: Dave Hansen
Link: https://lore.kernel.org/all/20230526-csum_partial-wuninitialized-v1-1-ebc0108dcec1%40kernel.org
---
 arch/x86/lib/csum-partial_64.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/lib')

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index fe5861951b15..cea25ca8b8cf 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -21,8 +21,10 @@ static inline unsigned short from32to16(unsigned a)
 	return b;
 }
 
-static inline __wsum csum_tail(unsigned int result, u64 temp64, int odd)
+static inline __wsum csum_tail(u64 temp64, int odd)
 {
+	unsigned int result;
+
 	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
 	if (unlikely(odd)) {
 		result = from32to16(result);
@@ -45,7 +47,7 @@ static inline __wsum csum_tail(unsigned int result, u64 temp64, int odd)
 __wsum csum_partial(const void *buff, int len, __wsum sum)
 {
 	u64 temp64 = (__force u64)sum;
-	unsigned odd, result;
+	unsigned odd;
 
 	odd = 1 & (unsigned long) buff;
 	if (unlikely(odd)) {
@@ -71,7 +73,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		    "adcq $0,%[res]"
 		    : [res] "+r"(temp64)
 		    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
-		return csum_tail(result, temp64, odd);
+		return csum_tail(temp64, odd);
 	}
 	if (unlikely(len >= 64)) {
 		/*
@@ -141,7 +143,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		    : [res] "+r"(temp64)
 		    : [trail] "r"(trail));
 	}
-	return csum_tail(result, temp64, odd);
+	return csum_tail(temp64, odd);
 }
 EXPORT_SYMBOL(csum_partial);
 
--
cgit v1.2.3
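The warning pattern is easy to reproduce outside the kernel. A stand-alone
reduction (illustrative code with made-up names, not kernel sources): because
clang's -Wuninitialized does not look across function calls, an uninitialized
variable passed by value is flagged at the call site even though the callee
overwrites the parameter before ever reading it:

    static unsigned int tail(unsigned int result, unsigned long long acc)
    {
        /* The parameter is overwritten before it is read... */
        result = (unsigned int)(acc >> 32) + (unsigned int)acc;
        return result;
    }

    unsigned int demo(unsigned long long acc)
    {
        unsigned int result;    /* never assigned in this function */

        /* ...but clang does not analyze across the call, so it warns here:
         * "variable 'result' is uninitialized when used here". */
        return tail(result, acc);
    }

Sinking the declaration into the callee, as the patch above does, removes the
uninitialized use altogether rather than merely silencing the warning.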
From 5516c89d58283413134f8d26960c6303d5d5bd89 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Thu, 25 May 2023 11:42:44 -0700
Subject: x86/lib: Make get/put_user() exception handling a visible symbol

The .L-prefixed exception handling symbols of get_user() and put_user()
do get discarded from the symbol table of the final kernel image.

This confuses tools that parse that symbol table and try to map the
chunk of code to a symbol. And, in general, from a toolchain
perspective, it is good practice to have all code belong to a symbol,
and the correct one at that.

( Currently, objdump displays that exception handling chunk as part of
  the previous symbol, which is a "fallback" of sorts and not correct. )

While at it, rename them to something more descriptive.

[ bp: Rewrite commit message, rename symbols. ]

Signed-off-by: Nadav Amit
Signed-off-by: Borislav Petkov (AMD)
Link: https://lore.kernel.org/r/20230525184244.2311-1-namit@vmware.com
---
 arch/x86/lib/getuser.S | 32 ++++++++++++++++----------------
 arch/x86/lib/putuser.S | 24 ++++++++++++------------
 2 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'arch/x86/lib')
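As context for the handlers being renamed below, this is roughly the contract
they implement for callers (a user-space C model with invented names, not the
kernel implementation): on a faulting user access, get_user() zeroes the
destination and returns -EFAULT, which is what the xor/mov sequence in the
exception code provides:

    #include <errno.h>      /* EFAULT; the kernel has its own errno headers */
    #include <stdbool.h>

    long get_user_model(unsigned int *dst, const unsigned int *src,
                        bool faulted)
    {
        if (faulted) {  /* mirrors: ASM_CLAC; xor %edx,%edx; mov $-EFAULT,%eax */
            *dst = 0;
            return -EFAULT;
        }
        *dst = *src;    /* the real code does this with a checked user access */
        return 0;
    }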
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index b64a2bd1a1ef..9c63713477bb 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -143,43 +143,43 @@ SYM_FUNC_END(__get_user_nocheck_8)
 EXPORT_SYMBOL(__get_user_nocheck_8)
 
 
-SYM_CODE_START_LOCAL(.Lbad_get_user_clac)
+SYM_CODE_START_LOCAL(__get_user_handle_exception)
 	ASM_CLAC
 .Lbad_get_user:
 	xor %edx,%edx
 	mov $(-EFAULT),%_ASM_AX
 	RET
-SYM_CODE_END(.Lbad_get_user_clac)
+SYM_CODE_END(__get_user_handle_exception)
 
 #ifdef CONFIG_X86_32
-SYM_CODE_START_LOCAL(.Lbad_get_user_8_clac)
+SYM_CODE_START_LOCAL(__get_user_8_handle_exception)
 	ASM_CLAC
 bad_get_user_8:
 	xor %edx,%edx
 	xor %ecx,%ecx
 	mov $(-EFAULT),%_ASM_AX
 	RET
-SYM_CODE_END(.Lbad_get_user_8_clac)
+SYM_CODE_END(__get_user_8_handle_exception)
 #endif
 
 /* get_user */
-	_ASM_EXTABLE(1b, .Lbad_get_user_clac)
-	_ASM_EXTABLE(2b, .Lbad_get_user_clac)
-	_ASM_EXTABLE(3b, .Lbad_get_user_clac)
+	_ASM_EXTABLE(1b, __get_user_handle_exception)
+	_ASM_EXTABLE(2b, __get_user_handle_exception)
+	_ASM_EXTABLE(3b, __get_user_handle_exception)
 #ifdef CONFIG_X86_64
-	_ASM_EXTABLE(4b, .Lbad_get_user_clac)
+	_ASM_EXTABLE(4b, __get_user_handle_exception)
 #else
-	_ASM_EXTABLE(4b, .Lbad_get_user_8_clac)
-	_ASM_EXTABLE(5b, .Lbad_get_user_8_clac)
+	_ASM_EXTABLE(4b, __get_user_8_handle_exception)
+	_ASM_EXTABLE(5b, __get_user_8_handle_exception)
 #endif
 
 /* __get_user */
-	_ASM_EXTABLE(6b, .Lbad_get_user_clac)
-	_ASM_EXTABLE(7b, .Lbad_get_user_clac)
-	_ASM_EXTABLE(8b, .Lbad_get_user_clac)
+	_ASM_EXTABLE(6b, __get_user_handle_exception)
+	_ASM_EXTABLE(7b, __get_user_handle_exception)
+	_ASM_EXTABLE(8b, __get_user_handle_exception)
 #ifdef CONFIG_X86_64
-	_ASM_EXTABLE(9b, .Lbad_get_user_clac)
+	_ASM_EXTABLE(9b, __get_user_handle_exception)
 #else
-	_ASM_EXTABLE(9b, .Lbad_get_user_8_clac)
-	_ASM_EXTABLE(10b, .Lbad_get_user_8_clac)
+	_ASM_EXTABLE(9b, __get_user_8_handle_exception)
+	_ASM_EXTABLE(10b, __get_user_8_handle_exception)
 #endif
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 3062d09a776d..1451e0c4ae22 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -131,22 +131,22 @@ SYM_FUNC_START(__put_user_nocheck_8)
 SYM_FUNC_END(__put_user_nocheck_8)
 EXPORT_SYMBOL(__put_user_nocheck_8)
 
-SYM_CODE_START_LOCAL(.Lbad_put_user_clac)
+SYM_CODE_START_LOCAL(__put_user_handle_exception)
 	ASM_CLAC
 .Lbad_put_user:
 	movl $-EFAULT,%ecx
 	RET
-SYM_CODE_END(.Lbad_put_user_clac)
+SYM_CODE_END(__put_user_handle_exception)
 
-	_ASM_EXTABLE(1b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(2b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(3b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(4b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(5b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(6b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(7b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(9b, .Lbad_put_user_clac)
+	_ASM_EXTABLE(1b, __put_user_handle_exception)
+	_ASM_EXTABLE(2b, __put_user_handle_exception)
+	_ASM_EXTABLE(3b, __put_user_handle_exception)
+	_ASM_EXTABLE(4b, __put_user_handle_exception)
+	_ASM_EXTABLE(5b, __put_user_handle_exception)
+	_ASM_EXTABLE(6b, __put_user_handle_exception)
+	_ASM_EXTABLE(7b, __put_user_handle_exception)
+	_ASM_EXTABLE(9b, __put_user_handle_exception)
 #ifdef CONFIG_X86_32
-	_ASM_EXTABLE(8b, .Lbad_put_user_clac)
-	_ASM_EXTABLE(10b, .Lbad_put_user_clac)
+	_ASM_EXTABLE(8b, __put_user_handle_exception)
+	_ASM_EXTABLE(10b, __put_user_handle_exception)
 #endif
--
cgit v1.2.3