From 3fad96e9b21bed214c1593d7d7fb3e40d1fbf6f4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 24 Oct 2023 11:57:15 +0100 Subject: firmware: arm_ffa: Declare ffa_bus_type structure in the header smatch reports: drivers/firmware/arm_ffa/bus.c:108:17: warning: symbol 'ffa_bus_type' was not declared. Should it be static? ffa_bus_type is exported to be useful in the FF-A driver. So this warning is not correct. However, declaring the ffa_bus_type structure in the header like many other bus_types do already removes this warning. So let us just do the same and get rid of the warning. Link: https://lore.kernel.org/r/20231024105715.2369638-1-sudeep.holla@arm.com Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 1abedb5b2e48..3d0fde57ba90 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -209,6 +209,8 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; } #define module_ffa_driver(__ffa_driver) \ module_driver(__ffa_driver, ffa_register, ffa_unregister) +extern struct bus_type ffa_bus_type; + /* FFA transport related */ struct ffa_partition_info { u16 id; -- cgit v1.2.3 From e0894ff038d86f30614ec16ec26dacb88c8d2bd4 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Mon, 27 Nov 2023 12:05:21 +1300 Subject: platform/x86: asus-wmi: disable USB0 hub on ROG Ally before suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS have worked around an issue in XInput where it doesn't support USB selective suspend, which causes suspend issues in Windows. They worked around this by adjusting the MCU firmware to disable the USB0 hub when the screen is switched off during the Microsoft DSM suspend path in ACPI. The issue we have with this however is one of timing - the call the tells the MCU to this isn't able to complete before suspend is done so we call this in a prepare() and add a small msleep() to ensure it is done. This must be done before the screen is switched off to prevent a variety of possible races. Further to this the MCU powersave option must also be disabled as it can cause a number of issues such as: - unreliable resume connection of N-Key - complete loss of N-Key if the power is plugged in while suspended Disabling the powersave option prevents this. Without this the MCU is unable to initialise itself correctly on resume. Signed-off-by: "Luke D. Jones" Tested-by: Philip Mueller Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20231126230521.125708-2-luke@ljones.dev Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 63e630276499..ab1c7deff118 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -114,6 +114,9 @@ /* Charging mode - 1=Barrel, 2=USB */ #define ASUS_WMI_DEVID_CHARGE_MODE 0x0012006C +/* MCU powersave mode */ +#define ASUS_WMI_DEVID_MCU_POWERSAVE 0x001200E2 + /* epu is connected? 1 == true */ #define ASUS_WMI_DEVID_EGPU_CONNECTED 0x00090018 /* egpu on/off */ -- cgit v1.2.3 From 6a3afb6ac6dfab158ebdd4b87941178f58c8939f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Nov 2023 19:47:40 +0800 Subject: jbd2: increase the journal IO's priority Current jbd2 only add REQ_SYNC for descriptor block, metadata log buffer, commit buffer and superblock buffer, the submitted IO could be throttled by writeback throttle in block layer, that could lead to priority inversion in some cases. The log IO looks like a kind of high priority metadata IO, so it should not be throttled by WBT like QOS policies in block layer, let's add REQ_SYNC | REQ_IDLE to exempt from writeback throttle, and also add REQ_META together indicates it's a metadata IO. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231129114740.2686201-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 6dcbb4eb80fb..beb30719ee16 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1374,6 +1374,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) +/* Journal high priority write IO operation flags */ +#define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) + /* * Journal flag definitions */ -- cgit v1.2.3 From c55e0a55b165202f18cbc4a20650d2e1becd5507 Mon Sep 17 00:00:00 2001 From: Tyler Fanelli Date: Tue, 19 Sep 2023 22:40:00 -0400 Subject: fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP Although DIRECT_IO_RELAX's initial usage is to allow shared mmap, its description indicates a purpose of reducing memory footprint. This may imply that it could be further used to relax other DIRECT_IO operations in the future. Replace it with a flag DIRECT_IO_ALLOW_MMAP which does only one thing, allow shared mmap of DIRECT_IO files while still bypassing the cache on regular reads and writes. [Miklos] Also Keep DIRECT_IO_RELAX definition for backward compatibility. Signed-off-by: Tyler Fanelli Fixes: e78662e818f9 ("fuse: add a new fuse init flag to relax restrictions in no cache mode") Cc: # v6.6 Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index db92a7202b34..e7418d15fe39 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -209,7 +209,7 @@ * - add FUSE_HAS_EXPIRE_ONLY * * 7.39 - * - add FUSE_DIRECT_IO_RELAX + * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures */ @@ -409,8 +409,7 @@ struct fuse_file_lock { * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation - * FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now - * allow shared mmap + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -449,7 +448,10 @@ struct fuse_file_lock { #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) -#define FUSE_DIRECT_IO_RELAX (1ULL << 36) +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) + +/* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ +#define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP /** * CUSE INIT request/reply flags -- cgit v1.2.3 From 4fbc3a52cd4d14de3793f4b2c721d7306ea84cf9 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 29 Nov 2023 14:21:41 -0600 Subject: RDMA/core: Fix umem iterator when PAGE_SIZE is greater then HCA pgsz 64k pages introduce the situation in this diagram when the HCA 4k page size is being used: +-------------------------------------------+ <--- 64k aligned VA | | | HCA 4k page | | | +-------------------------------------------+ | o | | | | o | | | | o | +-------------------------------------------+ | | | HCA 4k page | | | +-------------------------------------------+ <--- Live HCA page |OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO| <--- offset | | <--- VA | MR data | +-------------------------------------------+ | | | HCA 4k page | | | +-------------------------------------------+ | o | | | | o | | | | o | +-------------------------------------------+ | | | HCA 4k page | | | +-------------------------------------------+ The VA addresses are coming from rdma-core in this diagram can be arbitrary, but for 64k pages, the VA may be offset by some number of HCA 4k pages and followed by some number of HCA 4k pages. The current iterator doesn't account for either the preceding 4k pages or the following 4k pages. Fix the issue by extending the ib_block_iter to contain the number of DMA pages like comment [1] says and by using __sg_advance to start the iterator at the first live HCA page. The changes are contained in a parallel set of iterator start and next functions that are umem aware and specific to umem since there is one user of the rdma_for_each_block() without umem. These two fixes prevents the extra pages before and after the user MR data. Fix the preceding pages by using the __sq_advance field to start at the first 4k page containing MR data. Fix the following pages by saving the number of pgsz blocks in the iterator state and downcounting on each next. This fix allows for the elimination of the small page crutch noted in the Fixes. Fixes: 10c75ccb54e4 ("RDMA/umem: Prevent small pages from being returned by ib_umem_find_best_pgsz()") Link: https://lore.kernel.org/r/20231129202143.1434-2-shiraz.saleem@intel.com Signed-off-by: Mike Marciniszyn Signed-off-by: Shiraz Saleem Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/rdma/ib_umem.h | 9 ++++++++- include/rdma/ib_verbs.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 95896472a82b..565a85044541 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -77,6 +77,13 @@ static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, { __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl, umem->sgt_append.sgt.nents, pgsz); + biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1); + biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz); +} + +static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter) +{ + return __rdma_block_iter_next(biter) && biter->__sg_numblocks--; } /** @@ -92,7 +99,7 @@ static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, */ #define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ - __rdma_block_iter_next(biter);) + __rdma_umem_block_iter_next(biter);) #ifdef CONFIG_INFINIBAND_USER_MEM diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index fb1a2d6b1969..b7b6b58dd348 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2850,6 +2850,7 @@ struct ib_block_iter { /* internal states */ struct scatterlist *__sg; /* sg holding the current aligned block */ dma_addr_t __dma_addr; /* unaligned DMA address of this block */ + size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ unsigned int __sg_nents; /* number of SG entries */ unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ unsigned int __pg_bit; /* alignment of current block */ -- cgit v1.2.3 From 37e4b8df27bc68340f3fc80dbb27e3549c7f881c Mon Sep 17 00:00:00 2001 From: Jianheng Zhang Date: Fri, 1 Dec 2023 03:22:03 +0000 Subject: net: stmmac: fix FPE events losing The status bits of register MAC_FPE_CTRL_STS are clear on read. Using 32-bit read for MAC_FPE_CTRL_STS in dwmac5_fpe_configure() and dwmac5_fpe_send_mpacket() clear the status bits. Then the stmmac interrupt handler missing FPE event status and leads to FPE handshaking failure and retries. To avoid clear status bits of MAC_FPE_CTRL_STS in dwmac5_fpe_configure() and dwmac5_fpe_send_mpacket(), add fpe_csr to stmmac_fpe_cfg structure to cache the control bits of MAC_FPE_CTRL_STS and to avoid reading MAC_FPE_CTRL_STS in those methods. Fixes: 5a5586112b92 ("net: stmmac: support FPE link partner hand-shaking procedure") Reviewed-by: Serge Semin Signed-off-by: Jianheng Zhang Link: https://lore.kernel.org/r/CY5PR12MB637225A7CF529D5BE0FBE59CBF81A@CY5PR12MB6372.namprd12.prod.outlook.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 0b4658a7eceb..dee5ad6e48c5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -175,6 +175,7 @@ struct stmmac_fpe_cfg { bool hs_enable; /* FPE handshake enable */ enum stmmac_fpe_state lp_fpe_state; /* Link Partner FPE state */ enum stmmac_fpe_state lo_fpe_state; /* Local station FPE state */ + u32 fpe_csr; /* MAC_FPE_CTRL_STS reg cache */ }; struct stmmac_safety_feature_cfg { -- cgit v1.2.3 From a5e400a985df8041ed4659ed1462aa9134318130 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 20 Aug 2023 20:58:56 +0300 Subject: net/mlx5e: Honor user choice of IPsec replay window size Users can configure IPsec replay window size, but mlx5 driver didn't honor their choice and set always 32bits. Fix assignment logic to configure right size from the beginning. Fixes: 7db21ef4566e ("net/mlx5e: Set IPsec replay sequence numbers") Reviewed-by: Patrisious Haddad Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f38..90ca63f4bf63 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -12001,6 +12001,13 @@ enum { MLX5_IPSEC_ASO_INC_SN = 0x2, }; +enum { + MLX5_IPSEC_ASO_REPLAY_WIN_32BIT = 0x0, + MLX5_IPSEC_ASO_REPLAY_WIN_64BIT = 0x1, + MLX5_IPSEC_ASO_REPLAY_WIN_128BIT = 0x2, + MLX5_IPSEC_ASO_REPLAY_WIN_256BIT = 0x3, +}; + struct mlx5_ifc_ipsec_aso_bits { u8 valid[0x1]; u8 reserved_at_201[0x1]; -- cgit v1.2.3 From c2bf84f1d1a1595dcc45fe867f0e02b331993fee Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 12 Nov 2023 13:50:00 +0200 Subject: net/mlx5e: Tidy up IPsec NAT-T SA discovery IPsec NAT-T packets are UDP encapsulated packets over ESP normal ones. In case they arrive to RX, the SPI and ESP are located in inner header, while the check was performed on outer header instead. That wrong check caused to the situation where received rekeying request was missed and caused to rekey timeout, which "compensated" this failure by completing rekeying. Fixes: d65954934937 ("net/mlx5e: Support IPsec NAT-T functionality") Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 90ca63f4bf63..3f7b664d625b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -621,7 +621,7 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_140[0x8]; u8 bth_dst_qp[0x18]; - u8 reserved_at_160[0x20]; + u8 inner_esp_spi[0x20]; u8 outer_esp_spi[0x20]; u8 reserved_at_1a0[0x60]; }; -- cgit v1.2.3 From 58d3aade20cdddbac6c9707ac0f3f5f8c1278b74 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 4 Dec 2023 17:08:05 +0100 Subject: tcp: fix mid stream window clamp. After the blamed commit below, if the user-space application performs window clamping when tp->rcv_wnd is 0, the TCP socket will never be able to announce a non 0 receive window, even after completely emptying the receive buffer and re-setting the window clamp to higher values. Refactor tcp_set_window_clamp() to address the issue: when the user decreases the current clamp value, set rcv_ssthresh according to the same logic used at buffer initialization, but ensuring reserved mem provisioning. To avoid code duplication factor-out the relevant bits from tcp_adjust_rcv_ssthresh() in a new helper and reuse it in the above scenario. When increasing the clamp value, give the rcv_ssthresh a chance to grow according to previously implemented heuristic. Fixes: 3aa7857fe1d7 ("tcp: enable mid stream window clamp") Reported-by: David Gibson Reported-by: Stefano Brivio Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/705dad54e6e6e9a010e571bf58e0b35a8ae70503.1701706073.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index d2f0736b76b8..144ba48bb07b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1514,17 +1514,22 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } -static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh) { int unused_mem = sk_unused_reserved_mem(sk); struct tcp_sock *tp = tcp_sk(sk); - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh); if (unused_mem) tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, tcp_win_from_space(sk, unused_mem)); } +static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +{ + __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss); +} + void tcp_cleanup_rbuf(struct sock *sk, int copied); void __tcp_cleanup_rbuf(struct sock *sk, int copied); -- cgit v1.2.3 From 7037d95a047cd89b1f680eed253c6ab586bef1ed Mon Sep 17 00:00:00 2001 From: Kelly Kane Date: Sat, 2 Dec 2023 17:17:12 -0800 Subject: r8152: add vendor/device ID pair for ASUS USB-C2500 The ASUS USB-C2500 is an RTL8156 based 2.5G Ethernet controller. Add the vendor and product ID values to the driver. This makes Ethernet work with the adapter. Signed-off-by: Kelly Kane Link: https://lore.kernel.org/r/20231203011712.6314-1-kelly@hawknetworks.com Signed-off-by: Paolo Abeni --- include/linux/usb/r8152.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 287e9d83fb8b..33a4c146dc19 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -30,6 +30,7 @@ #define VENDOR_ID_NVIDIA 0x0955 #define VENDOR_ID_TPLINK 0x2357 #define VENDOR_ID_DLINK 0x2001 +#define VENDOR_ID_ASUS 0x0b05 #if IS_REACHABLE(CONFIG_USB_RTL8152) extern u8 rtl8152_get_version(struct usb_interface *intf); -- cgit v1.2.3 From e0f04e41e8eedd4e5a1275f2318df7e1841855f2 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 4 Dec 2023 09:32:33 +0100 Subject: drm/atomic-helpers: Invoke end_fb_access while owning plane state Invoke drm_plane_helper_funcs.end_fb_access before drm_atomic_helper_commit_hw_done(). The latter function hands over ownership of the plane state to the following commit, which might free it. Releasing resources in end_fb_access then operates on undefined state. This bug has been observed with non-blocking commits when they are being queued up quickly. Here is an example stack trace from the bug report. The plane state has been free'd already, so the pages for drm_gem_fb_vunmap() are gone. Unable to handle kernel paging request at virtual address 0000000100000049 [...] drm_gem_fb_vunmap+0x18/0x74 drm_gem_end_shadow_fb_access+0x1c/0x2c drm_atomic_helper_cleanup_planes+0x58/0xd8 drm_atomic_helper_commit_tail+0x90/0xa0 commit_tail+0x15c/0x188 commit_work+0x14/0x20 Fix this by running end_fb_access immediately after updating all planes in drm_atomic_helper_commit_planes(). The existing clean-up helper drm_atomic_helper_cleanup_planes() now only handles cleanup_fb. For aborted commits, roll back from drm_atomic_helper_prepare_planes() in the new helper drm_atomic_helper_unprepare_planes(). This case is different from regular cleanup, as we have to release the new state; regular cleanup releases the old state. The new helper also invokes cleanup_fb for all planes. The changes mostly involve DRM's atomic helpers. Only two drivers, i915 and nouveau, implement their own commit function. Update them to invoke drm_atomic_helper_unprepare_planes(). Drivers with custom commit_tail function do not require changes. v4: * fix documentation (kernel test robot) v3: * add drm_atomic_helper_unprepare_planes() for rolling back * use correct state for end_fb_access v2: * fix test in drm_atomic_helper_cleanup_planes() Reported-by: Alyssa Ross Closes: https://lore.kernel.org/dri-devel/87leazm0ya.fsf@alyssa.is/ Suggested-by: Daniel Vetter Fixes: 94d879eaf7fb ("drm/atomic-helper: Add {begin,end}_fb_access to plane helpers") Tested-by: Alyssa Ross Reviewed-by: Alyssa Ross Signed-off-by: Thomas Zimmermann Cc: # v6.2+ Link: https://patchwork.freedesktop.org/patch/msgid/20231204083247.22006-1-tzimmermann@suse.de --- include/drm/drm_atomic_helper.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h index 536a0b0091c3..006b5c977ad7 100644 --- a/include/drm/drm_atomic_helper.h +++ b/include/drm/drm_atomic_helper.h @@ -97,6 +97,8 @@ void drm_atomic_helper_commit_modeset_enables(struct drm_device *dev, int drm_atomic_helper_prepare_planes(struct drm_device *dev, struct drm_atomic_state *state); +void drm_atomic_helper_unprepare_planes(struct drm_device *dev, + struct drm_atomic_state *state); #define DRM_PLANE_COMMIT_ACTIVE_ONLY BIT(0) #define DRM_PLANE_COMMIT_NO_DISABLE_AFTER_MODESET BIT(1) -- cgit v1.2.3 From da7dfaa6d6f731c30eca6ffa808b83634d43e26f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 4 Dec 2023 19:00:41 +0000 Subject: net/tcp: Consistently align TCP-AO option in the header Currently functions that pre-calculate TCP header options length use unaligned TCP-AO header + MAC-length for skb reservation. And the functions that actually write TCP-AO options into skb do align the header. Nothing good can come out of this for ((maclen % 4) != 0). Provide tcp_ao_len_aligned() helper and use it everywhere for TCP header options space calculations. Fixes: 1e03d32bea8e ("net/tcp: Add TCP-AO sign to outgoing packets") Signed-off-by: Dmitry Safonov Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/tcp_ao.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index b56be10838f0..647781080613 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -62,11 +62,17 @@ static inline int tcp_ao_maclen(const struct tcp_ao_key *key) return key->maclen; } +/* Use tcp_ao_len_aligned() for TCP header calculations */ static inline int tcp_ao_len(const struct tcp_ao_key *key) { return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); } +static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) +{ + return round_up(tcp_ao_len(key), 4); +} + static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) { return key->digest_size; -- cgit v1.2.3 From 9396c4ee93f9ac03cd0cea0bb345fbc657772943 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 4 Dec 2023 19:00:44 +0000 Subject: net/tcp: Don't store TCP-AO maclen on reqsk This extra check doesn't work for a handshake when SYN segment has (current_key.maclen != rnext_key.maclen). It could be amended to preserve rnext_key.maclen instead of current_key.maclen, but that requires a lookup on listen socket. Originally, this extra maclen check was introduced just because it was cheap. Drop it and convert tcp_request_sock::maclen into boolean tcp_request_sock::used_tcp_ao. Fixes: 06b22ef29591 ("net/tcp: Wire TCP-AO to request sockets") Signed-off-by: Dmitry Safonov Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 68f3d315d2e1..b646b574b060 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -169,7 +169,7 @@ struct tcp_request_sock { #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; - u8 maclen; + bool used_tcp_ao; #endif }; @@ -180,14 +180,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) static inline bool tcp_rsk_used_ao(const struct request_sock *req) { - /* The real length of MAC is saved in the request socket, - * signing anything with zero-length makes no sense, so here is - * a little hack.. - */ #ifndef CONFIG_TCP_AO return false; #else - return tcp_rsk(req)->maclen != 0; + return tcp_rsk(req)->used_tcp_ao; #endif } -- cgit v1.2.3 From 4b7de801606e504e69689df71475d27e35336fb3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Dec 2023 09:30:40 +0100 Subject: bpf: Fix prog_array_map_poke_run map poke update Lee pointed out issue found by syscaller [0] hitting BUG in prog array map poke update in prog_array_map_poke_run function due to error value returned from bpf_arch_text_poke function. There's race window where bpf_arch_text_poke can fail due to missing bpf program kallsym symbols, which is accounted for with check for -EINVAL in that BUG_ON call. The problem is that in such case we won't update the tail call jump and cause imbalance for the next tail call update check which will fail with -EBUSY in bpf_arch_text_poke. I'm hitting following race during the program load: CPU 0 CPU 1 bpf_prog_load bpf_check do_misc_fixups prog_array_map_poke_track map_update_elem bpf_fd_array_map_update_elem prog_array_map_poke_run bpf_arch_text_poke returns -EINVAL bpf_prog_kallsyms_add After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next poke update fails on expected jump instruction check in bpf_arch_text_poke with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run. Similar race exists on the program unload. Fixing this by moving the update to bpf_arch_poke_desc_update function which makes sure we call __bpf_arch_text_poke that skips the bpf address check. Each architecture has slightly different approach wrt looking up bpf address in bpf_arch_text_poke, so instead of splitting the function or adding new 'checkip' argument in previous version, it seems best to move the whole map_poke_run update as arch specific code. [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810 Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Cc: Lee Jones Cc: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6762dac3ef76..cff5bb08820e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3175,6 +3175,9 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old); + void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); -- cgit v1.2.3 From 187da0f8250aa94bd96266096aef6f694e0b4cd2 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 Nov 2023 17:20:33 -0800 Subject: hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write The routine __vma_private_lock tests for the existence of a reserve map associated with a private hugetlb mapping. A pointer to the reserve map is in vma->vm_private_data. __vma_private_lock was checking the pointer for NULL. However, it is possible that the low bits of the pointer could be used as flags. In such instances, vm_private_data is not NULL and not a valid pointer. This results in the null-ptr-deref reported by syzbot: general protection fault, probably for non-canonical address 0xdffffc000000001d: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x00000000000000e8-0x00000000000000ef] CPU: 0 PID: 5048 Comm: syz-executor139 Not tainted 6.6.0-rc7-syzkaller-00142-g88 8cf78c29e2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 1 0/09/2023 RIP: 0010:__lock_acquire+0x109/0x5de0 kernel/locking/lockdep.c:5004 ... Call Trace: lock_acquire kernel/locking/lockdep.c:5753 [inline] lock_acquire+0x1ae/0x510 kernel/locking/lockdep.c:5718 down_write+0x93/0x200 kernel/locking/rwsem.c:1573 hugetlb_vma_lock_write mm/hugetlb.c:300 [inline] hugetlb_vma_lock_write+0xae/0x100 mm/hugetlb.c:291 __hugetlb_zap_begin+0x1e9/0x2b0 mm/hugetlb.c:5447 hugetlb_zap_begin include/linux/hugetlb.h:258 [inline] unmap_vmas+0x2f4/0x470 mm/memory.c:1733 exit_mmap+0x1ad/0xa60 mm/mmap.c:3230 __mmput+0x12a/0x4d0 kernel/fork.c:1349 mmput+0x62/0x70 kernel/fork.c:1371 exit_mm kernel/exit.c:567 [inline] do_exit+0x9ad/0x2a20 kernel/exit.c:861 __do_sys_exit kernel/exit.c:991 [inline] __se_sys_exit kernel/exit.c:989 [inline] __x64_sys_exit+0x42/0x50 kernel/exit.c:989 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Mask off low bit flags before checking for NULL pointer. In addition, the reserve map only 'belongs' to the OWNER (parent in parent/child relationships) so also check for the OWNER flag. Link: https://lkml.kernel.org/r/20231114012033.259600-1-mike.kravetz@oracle.com Reported-by: syzbot+6ada951e7c0f7bc8a71e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/00000000000078d1e00608d7878b@google.com/ Fixes: bf4916922c60 ("hugetlbfs: extend hugetlb_vma_lock to private VMAs") Signed-off-by: Mike Kravetz Reviewed-by: Rik van Riel Cc: Edward Adam Davis Cc: Muchun Song Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d3acecc5db4b..236ec7b63c54 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1268,10 +1268,7 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma) return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } -static inline bool __vma_private_lock(struct vm_area_struct *vma) -{ - return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; -} +bool __vma_private_lock(struct vm_area_struct *vma); /* * Safe version of huge_pte_offset() to check the locks. See comments -- cgit v1.2.3 From 8e92157d7f6190c86bfd6144a409001469827100 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 Nov 2023 19:44:03 +0200 Subject: units: add missing header BITS_PER_BYTE is defined in bits.h. Link: https://lkml.kernel.org/r/20231128174404.393393-1-andriy.shevchenko@linux.intel.com Fixes: e8eed5f7366f ("units: Add BYTES_PER_*BIT") Signed-off-by: Andy Shevchenko Reviewed-by: Randy Dunlap Cc: Damian Muszynski Cc: Rasmus Villemoes Cc: Herbert Xu Signed-off-by: Andrew Morton --- include/linux/units.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/units.h b/include/linux/units.h index ff1bd6b5f5b3..45110daaf8d3 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -2,6 +2,7 @@ #ifndef _LINUX_UNITS_H #define _LINUX_UNITS_H +#include #include /* Metric prefixes in accordance with Système international (d'unités) */ -- cgit v1.2.3 From 73424d00dc63ba681856e06cfb0a5abbdb62e2b5 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 30 Nov 2023 11:40:18 +0800 Subject: highmem: fix a memory copy problem in memcpy_from_folio Clang static checker complains that value stored to 'from' is never read. And memcpy_from_folio() only copy the last chunk memory from folio to destination. Use 'to += chunk' to replace 'from += chunk' to fix this typo problem. Link: https://lkml.kernel.org/r/20231130034017.1210429-1-suhui@nfschina.com Fixes: b23d03ef7af5 ("highmem: add memcpy_to_folio() and memcpy_from_folio()") Signed-off-by: Su Hui Reviewed-by: Matthew Wilcox (Oracle) Cc: Ira Weiny Cc: Jiaqi Yan Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Collingbourne Cc: Tom Rix Cc: Tony Luck Cc: Signed-off-by: Andrew Morton --- include/linux/highmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 4cacc0e43b51..be20cff4ba73 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -454,7 +454,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, memcpy(to, from, chunk); kunmap_local(from); - from += chunk; + to += chunk; offset += chunk; len -= chunk; } while (len > 0); -- cgit v1.2.3 From e03781879a0d524ce3126678d50a80484a513c4b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 6 Dec 2023 23:31:02 +0200 Subject: drop_monitor: Require 'CAP_SYS_ADMIN' when joining "events" group The "NET_DM" generic netlink family notifies drop locations over the "events" multicast group. This is problematic since by default generic netlink allows non-root users to listen to these notifications. Fix by adding a new field to the generic netlink multicast group structure that when set prevents non-root users or root without the 'CAP_SYS_ADMIN' capability (in the user namespace owning the network namespace) from joining the group. Set this field for the "events" group. Use 'CAP_SYS_ADMIN' rather than 'CAP_NET_ADMIN' because of the nature of the information that is shared over this group. Note that the capability check in this case will always be performed against the initial user namespace since the family is not netns aware and only operates in the initial network namespace. A new field is added to the structure rather than using the "flags" field because the existing field uses uAPI flags and it is inappropriate to add a new uAPI flag for an internal kernel check. In net-next we can rework the "flags" field to use internal flags and fold the new field into it. But for now, in order to reduce the amount of changes, add a new field. Since the information can only be consumed by root, mark the control plane operations that start and stop the tracing as root-only using the 'GENL_ADMIN_PERM' flag. Tested using [1]. Before: # capsh -- -c ./dm_repo # capsh --drop=cap_sys_admin -- -c ./dm_repo After: # capsh -- -c ./dm_repo # capsh --drop=cap_sys_admin -- -c ./dm_repo Failed to join "events" multicast group [1] $ cat dm.c #include #include #include #include int main(int argc, char **argv) { struct nl_sock *sk; int grp, err; sk = nl_socket_alloc(); if (!sk) { fprintf(stderr, "Failed to allocate socket\n"); return -1; } err = genl_connect(sk); if (err) { fprintf(stderr, "Failed to connect socket\n"); return err; } grp = genl_ctrl_resolve_grp(sk, "NET_DM", "events"); if (grp < 0) { fprintf(stderr, "Failed to resolve \"events\" multicast group\n"); return grp; } err = nl_socket_add_memberships(sk, grp, NFNLGRP_NONE); if (err) { fprintf(stderr, "Failed to join \"events\" multicast group\n"); return err; } return 0; } $ gcc -I/usr/include/libnl3 -lnl-3 -lnl-genl-3 -o dm_repo dm.c Fixes: 9a8afc8d3962 ("Network Drop Monitor: Adding drop monitor implementation & Netlink protocol") Reported-by: "The UK's National Cyber Security Centre (NCSC)" Signed-off-by: Ido Schimmel Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231206213102.1824398-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index e18a4c0d69ee..c53244f20437 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -12,10 +12,12 @@ * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) + * @cap_sys_admin: whether %CAP_SYS_ADMIN is required for binding */ struct genl_multicast_group { char name[GENL_NAMSIZ]; u8 flags; + u8 cap_sys_admin:1; }; struct genl_split_ops; -- cgit v1.2.3 From bd4a816752bab609dd6d65ae021387beb9e2ddbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Wed, 6 Dec 2023 09:36:12 -0800 Subject: net: ipv6: support reporting otherwise unknown prefix flags in RTM_NEWPREFIX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lorenzo points out that we effectively clear all unknown flags from PIO when copying them to userspace in the netlink RTM_NEWPREFIX notification. We could fix this one at a time as new flags are defined, or in one fell swoop - I choose the latter. We could either define 6 new reserved flags (reserved1..6) and handle them individually (and rename them as new flags are defined), or we could simply copy the entire unmodified byte over - I choose the latter. This unfortunately requires some anonymous union/struct magic, so we add a static assert on the struct size for a little extra safety. Cc: David Ahern Cc: Lorenzo Colitti Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Maciej Żenczykowski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 12 ++++++++++-- include/net/if_inet6.h | 4 ---- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 82da55101b5a..61ebe723ee4d 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -31,17 +31,22 @@ struct prefix_info { __u8 length; __u8 prefix_len; + union __packed { + __u8 flags; + struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) - __u8 onlink : 1, + __u8 onlink : 1, autoconf : 1, reserved : 6; #elif defined(__LITTLE_ENDIAN_BITFIELD) - __u8 reserved : 6, + __u8 reserved : 6, autoconf : 1, onlink : 1; #else #error "Please fix " #endif + }; + }; __be32 valid; __be32 prefered; __be32 reserved2; @@ -49,6 +54,9 @@ struct prefix_info { struct in6_addr prefix; }; +/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ +static_assert(sizeof(struct prefix_info) == 32); + #include #include #include diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 3e454c4d7ba6..f07642264c1e 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -22,10 +22,6 @@ #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 -/* prefix flags */ -#define IF_PREFIX_ONLINK 0x01 -#define IF_PREFIX_AUTOCONF 0x02 - enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, -- cgit v1.2.3 From 125f1c7f26ffcdbf96177abe75b70c1a6ceb17bc Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 5 Dec 2023 18:25:54 +0100 Subject: net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table The referenced change added custom cleanup code to act_ct to delete any callbacks registered on the parent block when deleting the tcf_ct_flow_table instance. However, the underlying issue is that the drivers don't obtain the reference to the tcf_ct_flow_table instance when registering callbacks which means that not only driver callbacks may still be on the table when deleting it but also that the driver can still have pointers to its internal nf_flowtable and can use it concurrently which results either warning in netfilter[0] or use-after-free. Fix the issue by taking a reference to the underlying struct tcf_ct_flow_table instance when registering the callback and release the reference when unregistering. Expose new API required for such reference counting by adding two new callbacks to nf_flowtable_type and implementing them for act_ct flowtable_ct type. This fixes the issue by extending the lifetime of nf_flowtable until all users have unregistered. [0]: [106170.938634] ------------[ cut here ]------------ [106170.939111] WARNING: CPU: 21 PID: 3688 at include/net/netfilter/nf_flow_table.h:262 mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.940108] Modules linked in: act_ct nf_flow_table act_mirred act_skbedit act_tunnel_key vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa bonding openvswitch nsh rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_regis try overlay mlx5_core [106170.943496] CPU: 21 PID: 3688 Comm: kworker/u48:0 Not tainted 6.6.0-rc7_for_upstream_min_debug_2023_11_01_13_02 #1 [106170.944361] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [106170.945292] Workqueue: mlx5e mlx5e_rep_neigh_update [mlx5_core] [106170.945846] RIP: 0010:mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.946413] Code: 89 ef 48 83 05 71 a4 14 00 01 e8 f4 06 04 e1 48 83 05 6c a4 14 00 01 48 83 c4 28 5b 5d 41 5c 41 5d c3 48 83 05 d1 8b 14 00 01 <0f> 0b 48 83 05 d7 8b 14 00 01 e9 96 fe ff ff 48 83 05 a2 90 14 00 [106170.947924] RSP: 0018:ffff88813ff0fcb8 EFLAGS: 00010202 [106170.948397] RAX: 0000000000000000 RBX: ffff88811eabac40 RCX: ffff88811eabad48 [106170.949040] RDX: ffff88811eab8000 RSI: ffffffffa02cd560 RDI: 0000000000000000 [106170.949679] RBP: ffff88811eab8000 R08: 0000000000000001 R09: ffffffffa0229700 [106170.950317] R10: ffff888103538fc0 R11: 0000000000000001 R12: ffff88811eabad58 [106170.950969] R13: ffff888110c01c00 R14: ffff888106b40000 R15: 0000000000000000 [106170.951616] FS: 0000000000000000(0000) GS:ffff88885fd40000(0000) knlGS:0000000000000000 [106170.952329] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [106170.952834] CR2: 00007f1cefd28cb0 CR3: 000000012181b006 CR4: 0000000000370ea0 [106170.953482] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [106170.954121] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [106170.954766] Call Trace: [106170.955057] [106170.955315] ? __warn+0x79/0x120 [106170.955648] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.956172] ? report_bug+0x17c/0x190 [106170.956537] ? handle_bug+0x3c/0x60 [106170.956891] ? exc_invalid_op+0x14/0x70 [106170.957264] ? asm_exc_invalid_op+0x16/0x20 [106170.957666] ? mlx5_del_flow_rules+0x10/0x310 [mlx5_core] [106170.958172] ? mlx5_tc_ct_block_flow_offload_add+0x1240/0x1240 [mlx5_core] [106170.958788] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.959339] ? mlx5_tc_ct_del_ft_cb+0xc6/0x2b0 [mlx5_core] [106170.959854] ? mapping_remove+0x154/0x1d0 [mlx5_core] [106170.960342] ? mlx5e_tc_action_miss_mapping_put+0x4f/0x80 [mlx5_core] [106170.960927] mlx5_tc_ct_delete_flow+0x76/0xc0 [mlx5_core] [106170.961441] mlx5_free_flow_attr_actions+0x13b/0x220 [mlx5_core] [106170.962001] mlx5e_tc_del_fdb_flow+0x22c/0x3b0 [mlx5_core] [106170.962524] mlx5e_tc_del_flow+0x95/0x3c0 [mlx5_core] [106170.963034] mlx5e_flow_put+0x73/0xe0 [mlx5_core] [106170.963506] mlx5e_put_flow_list+0x38/0x70 [mlx5_core] [106170.964002] mlx5e_rep_update_flows+0xec/0x290 [mlx5_core] [106170.964525] mlx5e_rep_neigh_update+0x1da/0x310 [mlx5_core] [106170.965056] process_one_work+0x13a/0x2c0 [106170.965443] worker_thread+0x2e5/0x3f0 [106170.965808] ? rescuer_thread+0x410/0x410 [106170.966192] kthread+0xc6/0xf0 [106170.966515] ? kthread_complete_and_exit+0x20/0x20 [106170.966970] ret_from_fork+0x2d/0x50 [106170.967332] ? kthread_complete_and_exit+0x20/0x20 [106170.967774] ret_from_fork_asm+0x11/0x20 [106170.970466] [106170.970726] ---[ end trace 0000000000000000 ]--- Fixes: 77ac5e40c44e ("net/sched: act_ct: remove and free nf_table callbacks") Signed-off-by: Vlad Buslov Reviewed-by: Paul Blakey Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index fe1507c1db82..692d5955911c 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -62,6 +62,8 @@ struct nf_flowtable_type { enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); void (*free)(struct nf_flowtable *ft); + void (*get)(struct nf_flowtable *ft); + void (*put)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; }; @@ -240,6 +242,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, } list_add_tail(&block_cb->list, &block->cb_list); + up_write(&flow_table->flow_block_lock); + + if (flow_table->type->get) + flow_table->type->get(flow_table); + return 0; unlock: up_write(&flow_table->flow_block_lock); @@ -262,6 +269,9 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, WARN_ON(true); } up_write(&flow_table->flow_block_lock); + + if (flow_table->type->put) + flow_table->type->put(flow_table); } void flow_offload_route_init(struct flow_offload *flow, -- cgit v1.2.3 From 718ab8226636a1a3a7d281f5d6a7ad7c925efe5a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 28 Nov 2023 09:15:07 +0100 Subject: PCI/ASPM: Add pci_enable_link_state_locked() Add pci_enable_link_state_locked() for enabling link states that can be used in contexts where a pci_bus_sem read lock is already held (e.g. from pci_walk_bus()). This helper will be used to fix a couple of potential deadlocks where the current helper is called with the lock already held, hence the CC stable tag. Fixes: f492edb40b54 ("PCI: vmd: Add quirk to configure PCIe ASPM and LTR") Link: https://lore.kernel.org/r/20231128081512.19387-2-johan+linaro@kernel.org Signed-off-by: Johan Hovold [bhelgaas: include helper name in subject, commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Manivannan Sadhasivam Cc: # 6.3 Cc: Michael Bottini Cc: David E. Box --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60ca768bc867..dea043bc1e38 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1829,6 +1829,7 @@ extern bool pcie_ports_native; int pci_disable_link_state(struct pci_dev *pdev, int state); int pci_disable_link_state_locked(struct pci_dev *pdev, int state); int pci_enable_link_state(struct pci_dev *pdev, int state); +int pci_enable_link_state_locked(struct pci_dev *pdev, int state); void pcie_no_aspm(void); bool pcie_aspm_support_enabled(void); bool pcie_aspm_enabled(struct pci_dev *pdev); @@ -1839,6 +1840,8 @@ static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state) { return 0; } static inline int pci_enable_link_state(struct pci_dev *pdev, int state) { return 0; } +static inline int pci_enable_link_state_locked(struct pci_dev *pdev, int state) +{ return 0; } static inline void pcie_no_aspm(void) { } static inline bool pcie_aspm_support_enabled(void) { return false; } static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; } -- cgit v1.2.3 From d3bb89ea9c13e5a98d2b7a0ba8e50a77893132cb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 7 Dec 2023 23:25:25 +0800 Subject: mm: fix VMA heap bounds checking After converting selinux to VMA heap check helper, the gcl triggers an execheap SELinux denial, which is caused by a changed logic check. Previously selinux only checked that the VMA range was within the VMA heap range, and the implementation checks the intersection between the two ranges, but the corner case (vm_end=start_brk, brk=vm_start) isn't handled correctly. Since commit 11250fd12eb8 ("mm: factor out VMA stack and heap checks") was only a function extraction, it seems that the issue was introduced by commit 0db0c01b53a1 ("procfs: fix /proc//maps heap check"). Let's fix above corner cases, meanwhile, correct the wrong indentation of the stack and heap check helpers. Fixes: 11250fd12eb8 ("mm: factor out VMA stack and heap checks") Signed-off-by: Kefeng Wang Reported-by: Ondrej Mosnacek Closes: https://lore.kernel.org/selinux/CAFqZXNv0SVT0fkOK6neP9AXbj3nxJ61JAY4+zJzvxqJaeuhbFw@mail.gmail.com/ Tested-by: Ondrej Mosnacek Link: https://lkml.kernel.org/r/20231207152525.2607420-1-wangkefeng.wang@huawei.com Cc: David Hildenbrand Cc: Paul Moore Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece..da5219b48d52 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -886,8 +886,8 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { - return vma->vm_start <= vma->vm_mm->brk && - vma->vm_end >= vma->vm_mm->start_brk; + return vma->vm_start < vma->vm_mm->brk && + vma->vm_end > vma->vm_mm->start_brk; } /* @@ -901,8 +901,8 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) * its "stack". It's not even well-defined for programs written * languages like Go. */ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) -- cgit v1.2.3 From 6376a824595607e99d032a39ba3394988b4fce96 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 8 Dec 2023 17:50:18 +0000 Subject: mm/damon/core: make damon_start() waits until kdamond_fn() starts The cleanup tasks of kdamond threads including reset of corresponding DAMON context's ->kdamond field and decrease of global nr_running_ctxs counter is supposed to be executed by kdamond_fn(). However, commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") made neither damon_start() nor damon_stop() ensure the corresponding kdamond has started the execution of kdamond_fn(). As a result, the cleanup can be skipped if damon_stop() is called fast enough after the previous damon_start(). Especially the skipped reset of ->kdamond could cause a use-after-free. Fix it by waiting for start of kdamond_fn() execution from damon_start(). Link: https://lkml.kernel.org/r/20231208175018.63880-1-sj@kernel.org Fixes: 0f91d13366a4 ("mm/damon: simplify stop mechanism") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: Changbin Du Cc: Jakub Acs Cc: # 5.15.x Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index ab2f17d9926b..e00ddf1ed39c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -559,6 +559,8 @@ struct damon_ctx { * update */ unsigned long next_ops_update_sis; + /* for waiting until the execution of the kdamond_fn is started */ + struct completion kdamond_started; /* public: */ struct task_struct *kdamond; -- cgit v1.2.3 From 081488051d28d32569ebb7c7a23572778b2e7d57 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 7 Dec 2023 23:14:04 -0700 Subject: mm/mglru: fix underprotected page cache Unmapped folios accessed through file descriptors can be underprotected. Those folios are added to the oldest generation based on: 1. The fact that they are less costly to reclaim (no need to walk the rmap and flush the TLB) and have less impact on performance (don't cause major PFs and can be non-blocking if needed again). 2. The observation that they are likely to be single-use. E.g., for client use cases like Android, its apps parse configuration files and store the data in heap (anon); for server use cases like MySQL, it reads from InnoDB files and holds the cached data for tables in buffer pools (anon). However, the oldest generation can be very short lived, and if so, it doesn't provide the PID controller with enough time to respond to a surge of refaults. (Note that the PID controller uses weighted refaults and those from evicted generations only take a half of the whole weight.) In other words, for a short lived generation, the moving average smooths out the spike quickly. To fix the problem: 1. For folios that are already on LRU, if they can be beyond the tracking range of tiers, i.e., five accesses through file descriptors, move them to the second oldest generation to give them more time to age. (Note that tiers are used by the PID controller to statistically determine whether folios accessed multiple times through file descriptors are worth protecting.) 2. When adding unmapped folios to LRU, adjust the placement of them so that they are not too close to the tail. The effect of this is similar to the above. On Android, launching 55 apps sequentially: Before After Change workingset_refault_anon 25641024 25598972 0% workingset_refault_file 115016834 106178438 -8% Link: https://lkml.kernel.org/r/20231208061407.2125867-1-yuzhao@google.com Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation") Signed-off-by: Yu Zhao Reported-by: Charan Teja Kalla Tested-by: Kalesh Singh Cc: T.J. Mercier Cc: Kairui Song Cc: Hillf Danton Cc: Jaroslav Pulchart Cc: Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 9ae7def16cb2..f4fe593c1400 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -232,22 +232,27 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; /* - * There are three common cases for this page: - * 1. If it's hot, e.g., freshly faulted in or previously hot and - * migrated, add it to the youngest generation. - * 2. If it's cold but can't be evicted immediately, i.e., an anon page - * not in swapcache or a dirty page pending writeback, add it to the - * second oldest generation. - * 3. Everything else (clean, cold) is added to the oldest generation. + * There are four common cases for this page: + * 1. If it's hot, i.e., freshly faulted in, add it to the youngest + * generation, and it's protected over the rest below. + * 2. If it can't be evicted immediately, i.e., a dirty page pending + * writeback, add it to the second youngest generation. + * 3. If it should be evicted first, e.g., cold and clean from + * folio_rotate_reclaimable(), add it to the oldest generation. + * 4. Everything else falls between 2 & 3 above and is added to the + * second oldest generation if it's considered inactive, or the + * oldest generation otherwise. See lru_gen_is_active(). */ if (folio_test_active(folio)) seq = lrugen->max_seq; else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->min_seq[type] + 1; - else + seq = lrugen->max_seq - 1; + else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) seq = lrugen->min_seq[type]; + else + seq = lrugen->min_seq[type] + 1; gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; -- cgit v1.2.3 From 8aa420617918d12d1f5d55030a503c9418e73c2c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 7 Dec 2023 23:14:06 -0700 Subject: mm/mglru: respect min_ttl_ms with memcgs While investigating kswapd "consuming 100% CPU" [1] (also see "mm/mglru: try to stop at high watermarks"), it was discovered that the memcg LRU can breach the thrashing protection imposed by min_ttl_ms. Before the memcg LRU: kswapd() shrink_node_memcgs() mem_cgroup_iter() inc_max_seq() // always hit a different memcg lru_gen_age_node() mem_cgroup_iter() check the timestamp of the oldest generation After the memcg LRU: kswapd() shrink_many() restart: iterate the memcg LRU: inc_max_seq() // occasionally hit the same memcg if raced with lru_gen_rotate_memcg(): goto restart lru_gen_age_node() mem_cgroup_iter() check the timestamp of the oldest generation Specifically, when the restart happens in shrink_many(), it needs to stick with the (memcg LRU) generation it began with. In other words, it should neither re-read memcg_lru->seq nor age an lruvec of a different generation. Otherwise it can hit the same memcg multiple times without giving lru_gen_age_node() a chance to check the timestamp of that memcg's oldest generation (against min_ttl_ms). [1] https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20231208061407.2125867-3-yuzhao@google.com Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") Signed-off-by: Yu Zhao Tested-by: T.J. Mercier Cc: Charan Teja Kalla Cc: Hillf Danton Cc: Jaroslav Pulchart Cc: Kairui Song Cc: Kalesh Singh Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3c25226beeed..23533b12bee2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -505,33 +505,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); * the old generation, is incremented when all its bins become empty. * * There are four operations: - * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; - * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; - * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; - * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; - * 2. The first attempt to reclaim an memcg below low, which triggers + * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; - * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * 3. The first attempt to reclaim a memcg below reclaimable size threshold, * which triggers MEMCG_LRU_TAIL; - * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * 4. The second attempt to reclaim a memcg below reclaimable size threshold, * which triggers MEMCG_LRU_YOUNG; - * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; - * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * - * Note that memcg LRU only applies to global reclaim, and the round-robin - * incrementing of their max_seq counters ensures the eventual fairness to all - * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + * Notes: + * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing + *