From 9a9b6aa6a8759c83024627d681eff982d6ee03b7 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 28 Jun 2016 10:32:00 -0700 Subject: Input: add SW_PEN_INSERTED define Some devices with a pen may have a switch that can be used to detect when the pen is inserted or removed to a slot on the device. Let's add a define to the input event codes so that everyone can be on the same page for what event we should generate when the pen is inserted or removed. In general the pen switch could be used by the software on the device to kick off any number of actions when the pen is inserted or removed. Signed-off-by: Douglas Anderson Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 737fa32faad4..d6d071fc3c56 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -780,6 +780,7 @@ #define SW_ROTATE_LOCK 0x0c /* set = rotate locked/disabled */ #define SW_LINEIN_INSERT 0x0d /* set = inserted */ #define SW_MUTE_DEVICE 0x0e /* set = device disabled */ +#define SW_PEN_INSERTED 0x0f /* set = pen inserted */ #define SW_MAX 0x0f #define SW_CNT (SW_MAX+1) -- cgit v1.2.3 From 94477bff390aa4612d2332c8abafaae0a13d6923 Mon Sep 17 00:00:00 2001 From: Sinclair Yeh Date: Wed, 29 Jun 2016 12:58:49 -0700 Subject: drm/ttm: Make ttm_bo_mem_compat available There are cases where it is desired to see if a proposed placement is compatible with a buffer object before calling ttm_bo_validate(). Signed-off-by: Sinclair Yeh Reviewed-by: Thomas Hellstrom Cc: --- This is the first of a 3-patch series to fix a black screen issue observed on Ubuntu 16.04 server. --- include/drm/ttm/ttm_bo_api.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index c801d9028e37..4cecb0b75b9c 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -316,6 +316,20 @@ ttm_bo_reference(struct ttm_buffer_object *bo) */ extern int ttm_bo_wait(struct ttm_buffer_object *bo, bool interruptible, bool no_wait); + +/** + * ttm_bo_mem_compat - Check if proposed placement is compatible with a bo + * + * @placement: Return immediately if buffer is busy. + * @mem: The struct ttm_mem_reg indicating the region where the bo resides + * @new_flags: Describes compatible placement found + * + * Returns true if the placement is compatible + */ +extern bool ttm_bo_mem_compat(struct ttm_placement *placement, + struct ttm_mem_reg *mem, + uint32_t *new_flags); + /** * ttm_bo_validate * -- cgit v1.2.3 From 487cf917ed0d12afaf403d9d77684bf44b8c13be Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Wed, 29 Jun 2016 04:27:36 -0400 Subject: Revert "ACPI, PCI, IRQ: remove redundant code in acpi_irq_penalty_init()" Trying to make the ISA and PCI init functionality common turned out to be a bad idea, because the ISA path depends on external functionality. Restore the previous behavior and limit the refactoring to PCI interrupts only. Fixes: 1fcb6a813c4f "ACPI,PCI,IRQ: remove redundant code in acpi_irq_penalty_init()" Signed-off-by: Sinan Kaya Tested-by: Wim Osterholt Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_drivers.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 797ae2ec8eee..29c691265b49 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -78,6 +78,7 @@ /* ACPI PCI Interrupt Link (pci_link.c) */ +int acpi_irq_penalty_init(void); int acpi_pci_link_allocate_irq(acpi_handle handle, int index, int *triggering, int *polarity, char **name); int acpi_pci_link_free_irq(acpi_handle handle); -- cgit v1.2.3 From c8607e020014cf11a61601a0005270bad81cabdf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jul 2016 14:53:06 +0200 Subject: netfilter: nft_ct: fix expiration getter We need to compute timeout.expires - jiffies, not the other way around. Add a helper, another patch can then later change more places in conntrack code where we currently open-code this. Will allow us to only change one place later when we remove per-ct timer. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index dd78bea227c8..b6083c34ef0d 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -284,6 +284,14 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } +/* jiffies until ct expires, 0 if already expired */ +static inline unsigned long nf_ct_expires(const struct nf_conn *ct) +{ + long timeout = (long)ct->timeout.expires - (long)jiffies; + + return timeout > 0 ? timeout : 0; +} + struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); -- cgit v1.2.3 From abb2bafd295fe962bbadc329dbfb2146457283ac Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: x86/quirks: Add early quirk to reset Apple AirPort card MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EFI firmware on Macs contains a full-fledged network stack for downloading OS X images from osrecovery.apple.com. Unfortunately on Macs introduced 2011 and 2012, EFI brings up the Broadcom 4331 wireless card on every boot and leaves it enabled even after ExitBootServices has been called. The card continues to assert its IRQ line, causing spurious interrupts if the IRQ is shared. It also corrupts memory by DMAing received packets, allowing for remote code execution over the air. This only stops when a driver is loaded for the wireless card, which may be never if the driver is not installed or blacklisted. The issue seems to be constrained to the Broadcom 4331. Chris Milsted has verified that the newer Broadcom 4360 built into the MacBookPro11,3 (2013/2014) does not exhibit this behaviour. The chances that Apple will ever supply a firmware fix for the older machines appear to be zero. The solution is to reset the card on boot by writing to a reset bit in its mmio space. This must be done as an early quirk and not as a plain vanilla PCI quirk to successfully combat memory corruption by DMAed packets: Matthew Garrett found out in 2012 that the packets are written to EfiBootServicesData memory (http://mjg59.dreamwidth.org/11235.html). This type of memory is made available to the page allocator by efi_free_boot_services(). Plain vanilla PCI quirks run much later, in subsys initcall level. In-between a time window would be open for memory corruption. Random crashes occurring in this time window and attributed to DMAed packets have indeed been observed in the wild by Chris Bainbridge. When Matthew Garrett analyzed the memory corruption issue in 2012, he sought to fix it with a grub quirk which transitions the card to D3hot: http://git.savannah.gnu.org/cgit/grub.git/commit/?id=9d34bb85da56 This approach does not help users with other bootloaders and while it may prevent DMAed packets, it does not cure the spurious interrupts emanating from the card. Unfortunately the card's mmio space is inaccessible in D3hot, so to reset it, we have to undo the effect of Matthew's grub patch and transition the card back to D0. Note that the quirk takes a few shortcuts to reduce the amount of code: The size of BAR 0 and the location of the PM capability is identical on all affected machines and therefore hardcoded. Only the address of BAR 0 differs between models. Also, it is assumed that the BCMA core currently mapped is the 802.11 core. The EFI driver seems to always take care of this. Michael Büsch, Bjorn Helgaas and Matt Fleming contributed feedback towards finding the best solution to this problem. The following should be a comprehensive list of affected models: iMac13,1 2012 21.5" [Root Port 00:1c.3 = 8086:1e16] iMac13,2 2012 27" [Root Port 00:1c.3 = 8086:1e16] Macmini5,1 2011 i5 2.3 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,2 2011 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,3 2011 i7 2.0 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini6,1 2012 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1e12] Macmini6,2 2012 i7 2.3 GHz [Root Port 00:1c.1 = 8086:1e12] MacBookPro8,1 2011 13" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,2 2011 15" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,3 2011 17" [Root Port 00:1c.1 = 8086:1c12] MacBookPro9,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro9,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] For posterity, spurious interrupts caused by the Broadcom 4331 wireless card resulted in splats like this (stacktrace omitted): irq 17: nobody cared (try booting with the "irqpoll" option) handlers: [] pcie_isr [] sdhci_irq [sdhci] threaded [] sdhci_thread_irq [sdhci] [] azx_interrupt [snd_hda_codec] Disabling IRQ #17 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79301 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111781 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=728916 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=895951#c16 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1009819 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1098621 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1149632#c5 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1279130 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1332732 Tested-by: Konstantin Simanov # [MacBookPro8,1] Tested-by: Lukas Wunner # [MacBookPro9,1] Tested-by: Bryan Paradis # [MacBookPro9,2] Tested-by: Andrew Worsley # [MacBookPro10,1] Tested-by: Chris Bainbridge # [MacBookPro10,2] Signed-off-by: Lukas Wunner Acked-by: Rafał Miłecki Acked-by: Matt Fleming Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris Milsted Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Garrett Cc: Michael Buesch Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: b43-dev@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: stable@vger.kernel.org Cc: stable@vger.kernel.org # 123456789abc: x86/quirks: Apply nvidia_bugs quirk only on root bus Cc: stable@vger.kernel.org # 123456789abc: x86/quirks: Reintroduce scanning of secondary buses Link: http://lkml.kernel.org/r/48d0972ac82a53d460e5fce77a07b2560db95203.1465690253.git.lukas@wunner.de [ Did minor readability edits. ] Signed-off-by: Ingo Molnar --- include/linux/bcma/bcma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index e6b41f42602b..3db25df396cb 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -159,6 +159,7 @@ struct bcma_host_ops { #define BCMA_CORE_DEFAULT 0xFFF #define BCMA_MAX_NR_CORES 16 +#define BCMA_CORE_SIZE 0x1000 /* Chip IDs of PCIe devices */ #define BCMA_CHIP_ID_BCM4313 0x4313 -- cgit v1.2.3 From 00c611def8748a0a1cf1d31842e49b42dfdb3de1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 Jul 2016 16:21:08 +0200 Subject: Revert "ACPI 2.0 / AML: Improve module level execution by moving the If/Else/While execution to per-table basis" Revert commit 3d4b7ae96d81 (ACPI 2.0 / AML: Improve module level execution by moving the If/Else/While execution to per-table basis) that enabled the execution of module-level AML after loading each table (rather than after all AML tables have been loaded), but overlooked locking issues resulting from that change. Fixes: 3d4b7ae96d81 (ACPI 2.0 / AML: Improve module level execution by moving the If/Else/While execution to per-table basis) Reported-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 4e4c21491c41..1ff3a76c265d 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -192,7 +192,7 @@ ACPI_INIT_GLOBAL(u8, acpi_gbl_do_not_use_xsdt, FALSE); /* * Optionally support group module level code. */ -ACPI_INIT_GLOBAL(u8, acpi_gbl_group_module_level_code, FALSE); +ACPI_INIT_GLOBAL(u8, acpi_gbl_group_module_level_code, TRUE); /* * Optionally use 32-bit FADT addresses if and when there is a conflict -- cgit v1.2.3 From 6d4e56ce977864b0fcd28c61555060e6010aa89b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Jul 2016 09:10:06 -0400 Subject: posix_acl: de-union a_refcount and a_rcu Currently the two are unioned together, but I don't think that's safe. It looks like get_cached_acl could race with the last put in posix_acl_release. get_cached_acl calls atomic_inc_not_zero on a_refcount, but that field could have already been clobbered by call_rcu, and may no longer be zero. Fix this by de-unioning the two fields. Fixes: b8a7a3a66747 (posix_acl: Inode acl caching fixes) Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- include/linux/posix_acl.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 5b5a80cc5926..c818772d9f9d 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -43,10 +43,8 @@ struct posix_acl_entry { }; struct posix_acl { - union { - atomic_t a_refcount; - struct rcu_head a_rcu; - }; + atomic_t a_refcount; + struct rcu_head a_rcu; unsigned int a_count; struct posix_acl_entry a_entries[0]; }; -- cgit v1.2.3 From f4979fcea7fd36d8e2f556abef86f80e0d5af1ba Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 Jul 2016 18:18:56 -0400 Subject: rose: limit sk_filter trim to payload Sockets can have a filter program attached that drops or trims incoming packets based on the filter program return value. Rose requires data packets to have at least ROSE_MIN_LEN bytes. It verifies this on arrival in rose_route_frame and unconditionally pulls the bytes in rose_recvmsg. The filter can trim packets to below this value in-between, causing pull to fail, leaving the partial header at the time of skb_copy_datagram_msg. Place a lower bound on the size to which sk_filter may trim packets by introducing sk_filter_trim_cap and call this for rose packets. Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6fc31ef1da2d..8f74f3d61894 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -467,7 +467,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) } #endif /* CONFIG_DEBUG_SET_MODULE_RONX */ -int sk_filter(struct sock *sk, struct sk_buff *skb); +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); +static inline int sk_filter(struct sock *sk, struct sk_buff *skb) +{ + return sk_filter_trim_cap(sk, skb, 1); +} struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); -- cgit v1.2.3 From 4f0c40d94461cfd23893a17335b2ab78ecb333c8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 Jul 2016 18:18:57 -0400 Subject: dccp: limit sk_filter trim to payload Dccp verifies packet integrity, including length, at initial rcv in dccp_invalid_packet, later pulls headers in dccp_enqueue_skb. A call to sk_filter in-between can cause __skb_pull to wrap skb->len. skb_copy_datagram_msg interprets this as a negative value, so (correctly) fails with EFAULT. The negative length is reported in ioctl SIOCINQ or possibly in a DCCP_WARN in dccp_close. Introduce an sk_receive_skb variant that caps how small a filter program can trim packets, and call this in dccp with the header length. Excessively trimmed packets are now processed normally and queued for reception as 0B payloads. Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/sock.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 649d2a8c17fc..ff5be7e8ddea 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1576,7 +1576,13 @@ static inline void sock_put(struct sock *sk) */ void sock_gen_put(struct sock *sk); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested); +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, + unsigned int trim_cap); +static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested) +{ + return __sk_receive_skb(sk, skb, nested, 1); +} static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { -- cgit v1.2.3 From 12cb22bb8ae9aff9d72a9c0a234f26d641b20eb6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 14 Jul 2016 12:07:15 -0700 Subject: uapi: export lirc.h header This header contains the userspace API for lirc. This is a fixup for commit b7be755733dc ("[media] bz#75751: Move internal header file lirc.h to uapi/"). It moved the header to the right place, but it forgot to add it at Kbuild. So, despite being at uapi, it is not copied to the right place. Fixes: b7be755733dc44c72 ("[media] bz#75751: Move internal header file lirc.h to uapi/") Link: http://lkml.kernel.org/r/320c765d32bfc82c582e336d52ffe1026c73c644.1468439021.git.mchehab@s-opensource.com Signed-off-by: Mauro Carvalho Chehab Cc: Alec Leamas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 8bdae34d1f9a..ec10cfef166a 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -245,6 +245,7 @@ endif header-y += hw_breakpoint.h header-y += l2tp.h header-y += libc-compat.h +header-y += lirc.h header-y += limits.h header-y += llc.h header-y += loop.h -- cgit v1.2.3 From e41f501d391265ff568f3e49d6128cc30856a36f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Jul 2016 12:07:29 -0700 Subject: vmlinux.lds: account for destructor sections If CONFIG_KASAN is enabled and gcc is configured with --disable-initfini-array and/or gold linker is used, gcc emits .ctors/.dtors and .text.startup/.text.exit sections instead of .init_array/.fini_array. .dtors section is not explicitly accounted in the linker script and messes vvar/percpu layout. We want: ffffffff822bfd80 D _edata ffffffff822c0000 D __vvar_beginning_hack ffffffff822c0000 A __vvar_page ffffffff822c0080 0000000000000098 D vsyscall_gtod_data ffffffff822c1000 A __init_begin ffffffff822c1000 D init_per_cpu__irq_stack_union ffffffff822c1000 A __per_cpu_load ffffffff822d3000 D init_per_cpu__gdt_page We got: ffffffff8279a600 D _edata ffffffff8279b000 A __vvar_page ffffffff8279c000 A __init_begin ffffffff8279c000 D init_per_cpu__irq_stack_union ffffffff8279c000 A __per_cpu_load ffffffff8279e000 D __vvar_beginning_hack ffffffff8279e080 0000000000000098 D vsyscall_gtod_data ffffffff827ae000 D init_per_cpu__gdt_page This happens because __vvar_page and .vvar get different addresses in arch/x86/kernel/vmlinux.lds.S: . = ALIGN(PAGE_SIZE); __vvar_page = .; .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { /* work around gold bug 13023 */ __vvar_beginning_hack = .; Discard .dtors/.fini_array/.text.exit, since we don't call dtors. Merge .text.startup into init text. Link: http://lkml.kernel.org/r/1467386363-120030-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6a67ab94b553..081d0f258d4c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -542,15 +542,19 @@ #define INIT_TEXT \ *(.init.text) \ + *(.text.startup) \ MEM_DISCARD(init.text) #define EXIT_DATA \ *(.exit.data) \ + *(.fini_array) \ + *(.dtors) \ MEM_DISCARD(exit.data) \ MEM_DISCARD(exit.rodata) #define EXIT_TEXT \ *(.exit.text) \ + *(.text.exit) \ MEM_DISCARD(exit.text) #define EXIT_CALL \ -- cgit v1.2.3 From 33f4751e99601b7bfd1d66aedabd3ee9140922de Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2016 12:07:32 -0700 Subject: mm: thp: move pmd check inside ptl for freeze_page() I found a race condition triggering VM_BUG_ON() in freeze_page(), when running a testcase with 3 processes: - process 1: keep writing thp, - process 2: keep clearing soft-dirty bits from virtual address of process 1 - process 3: call migratepages for process 1, The kernel message is like this: kernel BUG at /src/linux-dev/mm/huge_memory.c:3096! invalid opcode: 0000 [#1] SMP Modules linked in: cfg80211 rfkill crc32c_intel ppdev serio_raw pcspkr virtio_balloon virtio_console parport_pc parport pvpanic acpi_cpufreq tpm_tis tpm i2c_piix4 virtio_blk virtio_net ata_generic pata_acpi floppy virtio_pci virtio_ring virtio CPU: 0 PID: 28863 Comm: migratepages Not tainted 4.6.0-v4.6-160602-0827-+ #2 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880037320000 ti: ffff88007cdd0000 task.ti: ffff88007cdd0000 RIP: 0010:[] [] split_huge_page_to_list+0x496/0x590 RSP: 0018:ffff88007cdd3b70 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff88007c7b88c0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000700000200 RDI: ffffea0003188000 RBP: ffff88007cdd3bb8 R08: 0000000000000001 R09: 00003ffffffff000 R10: ffff880000000000 R11: ffffc000001fffff R12: ffffea0003188000 R13: ffffea0003188000 R14: 0000000000000000 R15: 0400000000000080 FS: 00007f8ec241d740(0000) GS:ffff88007dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8ec1f3ed20 CR3: 000000003707b000 CR4: 00000000000006f0 Call Trace: ? list_del+0xd/0x30 queue_pages_pte_range+0x4d1/0x590 __walk_page_range+0x204/0x4e0 walk_page_range+0x71/0xf0 queue_pages_range+0x75/0x90 ? queue_pages_hugetlb+0x190/0x190 ? new_node_page+0xc0/0xc0 ? change_prot_numa+0x40/0x40 migrate_to_node+0x71/0xd0 do_migrate_pages+0x1c3/0x210 SyS_migrate_pages+0x261/0x290 entry_SYSCALL_64_fastpath+0x1a/0xa4 Code: e8 b0 87 fb ff 0f 0b 48 c7 c6 30 32 9f 81 e8 a2 87 fb ff 0f 0b 48 c7 c6 b8 46 9f 81 e8 94 87 fb ff 0f 0b 85 c0 0f 84 3e fd ff ff <0f> 0b 85 c0 0f 85 a6 00 00 00 48 8b 75 c0 4c 89 f7 41 be f0 ff RIP split_huge_page_to_list+0x496/0x590 I'm not sure of the full scenario of the reproduction, but my debug showed that split_huge_pmd_address(freeze=true) returned without running main code of pmd splitting because pmd_present(*pmd) in precheck somehow returned 0. If this happens, the subsequent try_to_unmap() fails and returns non-zero (because page_mapcount() still > 0), and finally VM_BUG_ON() fires. This patch tries to fix it by prechecking pmd state inside ptl. Link: http://lkml.kernel.org/r/1466990929-7452-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 419fb9e03447..f0a7a0320300 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -94,7 +94,7 @@ static inline int split_huge_page(struct page *page) void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze); + unsigned long address, bool freeze, struct page *page); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ @@ -102,7 +102,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ - false); \ + false, NULL); \ } while (0) -- cgit v1.2.3 From 5a49973d7143ebbabd76e1dcd69ee42e349bb7b9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 14 Jul 2016 12:07:38 -0700 Subject: mm: thp: refix false positive BUG in page_move_anon_rmap() The VM_BUG_ON_PAGE in page_move_anon_rmap() is more trouble than it's worth: the syzkaller fuzzer hit it again. It's still wrong for some THP cases, because linear_page_index() was never intended to apply to addresses before the start of a vma. That's easily fixed with a signed long cast inside linear_page_index(); and Dmitry has tested such a patch, to verify the false positive. But why extend linear_page_index() just for this case? when the avoidance in page_move_anon_rmap() has already grown ugly, and there's no reason for the check at all (nothing else there is using address or index). Remove address arg from page_move_anon_rmap(), remove VM_BUG_ON_PAGE, remove CONFIG_DEBUG_VM PageTransHuge adjustment. And one more thing: should the compound_head(page) be done inside or outside page_move_anon_rmap()? It's usually pushed down to the lowest level nowadays (and mm/memory.c shows no other explicit use of it), so I think it's better done in page_move_anon_rmap() than by caller. Fixes: 0798d3c022dc ("mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()") Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1607120444540.12528@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Dmitry Vyukov Acked-by: Kirill A. Shutemov Cc: Mika Westerberg Cc: Andrea Arcangeli Cc: Rik van Riel Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 49eb4f8ebac9..2b0fad83683f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -158,7 +158,7 @@ struct anon_vma *page_get_anon_vma(struct page *page); /* * rmap interfaces called when adding or removing pte of page */ -void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, -- cgit v1.2.3 From 18d3df3eab23796d7f852f9c6bb60962b8372ced Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 14 Jul 2016 18:00:10 +0200 Subject: vlan: use a valid default mtu value for vlan over macsec macsec can't cope with mtu frames which need vlan tag insertion, and vlan device set the default mtu equal to the underlying dev's one. By default vlan over macsec devices use invalid mtu, dropping all the large packets. This patch adds a netif helper to check if an upper vlan device needs mtu reduction. The helper is used during vlan devices initialization to set a valid default and during mtu updating to forbid invalid, too bit, mtu values. The helper currently only check if the lower dev is a macsec device, if we get more users, we need to update only the helper (possibly reserving an additional IFF bit). Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f45929ce8157..da4b33bea982 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4145,6 +4145,13 @@ static inline void netif_keep_dst(struct net_device *dev) dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); } +/* return true if dev can't cope with mtu frames that need vlan tag insertion */ +static inline bool netif_reduces_vlan_mtu(struct net_device *dev) +{ + /* TODO: reserve and use an additional IFF bit, if we get more users */ + return dev->priv_flags & IFF_MACSEC; +} + extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. */ -- cgit v1.2.3 From eabfdda93477f6ee5e153f560560e9cb1c617fd7 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 18 Jul 2016 15:02:06 -0400 Subject: net: switchdev: change ageing_time type to clock_t The switchdev value for the SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME attribute is a clock_t and requires to use helpers such as clock_t_to_jiffies() to convert to milliseconds. Change ageing_time type from u32 to clock_t to make it explicit. Fixes: f55ac58ae64c ("switchdev: add bridge ageing_time attribute") Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 985619a59323..1d8e158241da 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -60,7 +60,7 @@ struct switchdev_attr { struct netdev_phys_item_id ppid; /* PORT_PARENT_ID */ u8 stp_state; /* PORT_STP_STATE */ unsigned long brport_flags; /* PORT_BRIDGE_FLAGS */ - u32 ageing_time; /* BRIDGE_AGEING_TIME */ + clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ } u; }; -- cgit v1.2.3 From 73f576c04b9410ed19660f74f97521bee6e1c546 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jul 2016 15:44:57 -0700 Subject: mm: memcontrol: fix cgroup creation failure after many small jobs The memory controller has quite a bit of state that usually outlives the cgroup and pins its CSS until said state disappears. At the same time it imposes a 16-bit limit on the CSS ID space to economically store IDs in the wild. Consequently, when we use cgroups to contain frequent but small and short-lived jobs that leave behind some page cache, we quickly run into the 64k limitations of outstanding CSSs. Creating a new cgroup fails with -ENOSPC while there are only a few, or even no user-visible cgroups in existence. Although pinning CSSs past cgroup removal is common, there are only two instances that actually need an ID after a cgroup is deleted: cache shadow entries and swapout records. Cache shadow entries reference the ID weakly and can deal with the CSS having disappeared when it's looked up later. They pose no hurdle. Swap-out records do need to pin the css to hierarchically attribute swapins after the cgroup has been deleted; though the only pages that remain swapped out after offlining are tmpfs/shmem pages. And those references are under the user's control, so they are manageable. This patch introduces a private 16-bit memcg ID and switches swap and cache shadow entries over to using that. This ID can then be recycled after offlining when the CSS remains pinned only by objects that don't specifically need it. This script demonstrates the problem by faulting one cache page in a new cgroup and deleting it again: set -e mkdir -p pages for x in `seq 128000`; do [ $((x % 1000)) -eq 0 ] && echo $x mkdir /cgroup/foo echo $$ >/cgroup/foo/cgroup.procs echo trex >pages/$x echo $$ >/cgroup/cgroup.procs rmdir /cgroup/foo done When run on an unpatched kernel, we eventually run out of possible IDs even though there are no visible cgroups: [root@ham ~]# ./cssidstress.sh [...] 65000 mkdir: cannot create directory '/cgroup/foo': No space left on device After this patch, the IDs get released upon cgroup destruction and the cache and css objects get released once memory reclaim kicks in. [hannes@cmpxchg.org: init the IDR] Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: John Garcia Reviewed-by: Vladimir Davydov Acked-by: Tejun Heo Cc: Nikolay Borisov Cc: [3.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a805474df4ab..56e6069d2452 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,6 +97,11 @@ enum mem_cgroup_events_target { #define MEM_CGROUP_ID_SHIFT 16 #define MEM_CGROUP_ID_MAX USHRT_MAX +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -172,6 +177,9 @@ enum memcg_kmem_state { struct mem_cgroup { struct cgroup_subsys_state css; + /* Private memcg ID. Used to ID objects that outlive the cgroup */ + struct mem_cgroup_id id; + /* Accounted resources */ struct page_counter memory; struct page_counter swap; @@ -330,22 +338,9 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) if (mem_cgroup_disabled()) return 0; - return memcg->css.id; -} - -/** - * mem_cgroup_from_id - look up a memcg from an id - * @id: the id to look up - * - * Caller must hold rcu_read_lock() and use css_tryget() as necessary. - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); + return memcg->id.id; } +struct mem_cgroup *mem_cgroup_from_id(unsigned short id); /** * parent_mem_cgroup - find the accounting parent of a memcg -- cgit v1.2.3 From 3cb9185c67304b2a7ea9be73e7d13df6fb2793a1 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jul 2016 15:45:00 -0700 Subject: radix-tree: fix radix_tree_iter_retry() for tagged iterators. radix_tree_iter_retry() resets slot to NULL, but it doesn't reset tags. Then NULL slot and non-zero iter.tags passed to radix_tree_next_slot() leading to crash: RIP: radix_tree_next_slot include/linux/radix-tree.h:473 find_get_pages_tag+0x334/0x930 mm/filemap.c:1452 .... Call Trace: pagevec_lookup_tag+0x3a/0x80 mm/swap.c:960 mpage_prepare_extent_to_map+0x321/0xa90 fs/ext4/inode.c:2516 ext4_writepages+0x10be/0x2b20 fs/ext4/inode.c:2736 do_writepages+0x97/0x100 mm/page-writeback.c:2364 __filemap_fdatawrite_range+0x248/0x2e0 mm/filemap.c:300 filemap_write_and_wait_range+0x121/0x1b0 mm/filemap.c:490 ext4_sync_file+0x34d/0xdb0 fs/ext4/fsync.c:115 vfs_fsync_range+0x10a/0x250 fs/sync.c:195 vfs_fsync fs/sync.c:209 do_fsync+0x42/0x70 fs/sync.c:219 SYSC_fdatasync fs/sync.c:232 SyS_fdatasync+0x19/0x20 fs/sync.c:230 entry_SYSCALL_64_fastpath+0x23/0xc1 arch/x86/entry/entry_64.S:207 We must reset iterator's tags to bail out from radix_tree_next_slot() and go to the slow-path in radix_tree_next_chunk(). Fixes: 46437f9a554f ("radix-tree: fix race in gang lookup") Link: http://lkml.kernel.org/r/1468495196-10604-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reported-by: Dmitry Vyukov Acked-by: Konstantin Khlebnikov Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Ross Zwisler Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index cb4b7e8cee81..eca6f626c16e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -407,6 +407,7 @@ static inline __must_check void **radix_tree_iter_retry(struct radix_tree_iter *iter) { iter->next_index = iter->index; + iter->tags = 0; return NULL; } -- cgit v1.2.3