diff options
Diffstat (limited to 'drivers/virtio/virtio_mem.c')
| -rw-r--r-- | drivers/virtio/virtio_mem.c | 1789 |
1 files changed, 1287 insertions, 502 deletions
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 181e2f18beae..9fc9ec4a25f5 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -27,20 +27,74 @@ static bool unplug_online = true; module_param(unplug_online, bool, 0644); MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); -enum virtio_mem_mb_state { +static bool force_bbm; +module_param(force_bbm, bool, 0444); +MODULE_PARM_DESC(force_bbm, + "Force Big Block Mode. Default is 0 (auto-selection)"); + +static unsigned long bbm_block_size; +module_param(bbm_block_size, ulong, 0444); +MODULE_PARM_DESC(bbm_block_size, + "Big Block size in bytes. Default is 0 (auto-detection)."); + +static bool bbm_safe_unplug = true; +module_param(bbm_safe_unplug, bool, 0444); +MODULE_PARM_DESC(bbm_safe_unplug, + "Use a safe unplug mechanism in BBM, avoiding long/endless loops"); + +/* + * virtio-mem currently supports the following modes of operation: + * + * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The + * size of a Sub Block (SB) is determined based on the device block size, the + * pageblock size, and the maximum allocation granularity of the buddy. + * Subblocks within a Linux memory block might either be plugged or unplugged. + * Memory is added/removed to Linux MM in Linux memory block granularity. + * + * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. + * Memory is added/removed to Linux MM in Big Block granularity. + * + * The mode is determined automatically based on the Linux memory block size + * and the device block size. + * + * User space / core MM (auto onlining) is responsible for onlining added + * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are + * always onlined separately, and all memory within a Linux memory block is + * onlined to the same zone - virtio-mem relies on this behavior. + */ + +/* + * State of a Linux memory block in SBM. + */ +enum virtio_mem_sbm_mb_state { /* Unplugged, not added to Linux. Can be reused later. */ - VIRTIO_MEM_MB_STATE_UNUSED = 0, + VIRTIO_MEM_SBM_MB_UNUSED = 0, /* (Partially) plugged, not added to Linux. Error on add_memory(). */ - VIRTIO_MEM_MB_STATE_PLUGGED, + VIRTIO_MEM_SBM_MB_PLUGGED, /* Fully plugged, fully added to Linux, offline. */ - VIRTIO_MEM_MB_STATE_OFFLINE, + VIRTIO_MEM_SBM_MB_OFFLINE, /* Partially plugged, fully added to Linux, offline. */ - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, /* Fully plugged, fully added to Linux, online. */ - VIRTIO_MEM_MB_STATE_ONLINE, + VIRTIO_MEM_SBM_MB_ONLINE, /* Partially plugged, fully added to Linux, online. */ - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, - VIRTIO_MEM_MB_STATE_COUNT + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_COUNT +}; + +/* + * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. + */ +enum virtio_mem_bbm_bb_state { + /* Unplugged, not added to Linux. Can be reused later. */ + VIRTIO_MEM_BBM_BB_UNUSED = 0, + /* Plugged, not added to Linux. Error on add_memory(). */ + VIRTIO_MEM_BBM_BB_PLUGGED, + /* Plugged and added to Linux. */ + VIRTIO_MEM_BBM_BB_ADDED, + /* All online parts are fake-offline, ready to remove. */ + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, + VIRTIO_MEM_BBM_BB_COUNT }; struct virtio_mem { @@ -51,6 +105,7 @@ struct virtio_mem { /* Workqueue that processes the plug/unplug requests. */ struct work_struct wq; + atomic_t wq_active; atomic_t config_changed; /* Virtqueue for guest->host requests. */ @@ -70,27 +125,13 @@ struct virtio_mem { /* The device block size (for communicating with the device). */ uint64_t device_block_size; - /* The translated node id. NUMA_NO_NODE in case not specified. */ + /* The determined node id for all memory of the device. */ int nid; /* Physical start address of the memory region. */ uint64_t addr; /* Maximum region size in bytes. */ uint64_t region_size; - /* The subblock size. */ - uint64_t subblock_size; - /* The number of subblocks per memory block. */ - uint32_t nb_sb_per_mb; - - /* Id of the first memory block of this device. */ - unsigned long first_mb_id; - /* Id of the last memory block of this device. */ - unsigned long last_mb_id; - /* Id of the last usable memory block of this device. */ - unsigned long last_usable_mb_id; - /* Id of the next memory bock to prepare when needed. */ - unsigned long next_mb_id; - /* The parent resource for all memory added via this device. */ struct resource *parent_resource; /* @@ -99,31 +140,79 @@ struct virtio_mem { */ const char *resource_name; - /* Summary of all memory block states. */ - unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT]; -#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10 - - /* - * One byte state per memory block. - * - * Allocated via vmalloc(). When preparing new blocks, resized - * (alloc+copy+free) when needed (crossing pages with the next mb). - * (when crossing pages). - * - * With 128MB memory blocks, we have states for 512GB of memory in one - * page. - */ - uint8_t *mb_state; - /* - * $nb_sb_per_mb bit per memory block. Handled similar to mb_state. - * - * With 4MB subblocks, we manage 128GB of memory in one page. + * We don't want to add too much memory if it's not getting onlined, + * to avoid running OOM. Besides this threshold, we allow to have at + * least two offline blocks at a time (whatever is bigger). */ - unsigned long *sb_bitmap; +#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) + atomic64_t offline_size; + uint64_t offline_threshold; + + /* If set, the driver is in SBM, otherwise in BBM. */ + bool in_sbm; + + union { + struct { + /* Id of the first memory block of this device. */ + unsigned long first_mb_id; + /* Id of the last usable memory block of this device. */ + unsigned long last_usable_mb_id; + /* Id of the next memory bock to prepare when needed. */ + unsigned long next_mb_id; + + /* The subblock size. */ + uint64_t sb_size; + /* The number of subblocks per Linux memory block. */ + uint32_t sbs_per_mb; + + /* Summary of all memory block states. */ + unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; + + /* + * One byte state per memory block. Allocated via + * vmalloc(). Resized (alloc+copy+free) on demand. + * + * With 128 MiB memory blocks, we have states for 512 + * GiB of memory in one 4 KiB page. + */ + uint8_t *mb_states; + + /* + * Bitmap: one bit per subblock. Allocated similar to + * sbm.mb_states. + * + * A set bit means the corresponding subblock is + * plugged, otherwise it's unblocked. + * + * With 4 MiB subblocks, we manage 128 GiB of memory + * in one 4 KiB page. + */ + unsigned long *sb_states; + } sbm; + + struct { + /* Id of the first big block of this device. */ + unsigned long first_bb_id; + /* Id of the last usable big block of this device. */ + unsigned long last_usable_bb_id; + /* Id of the next device bock to prepare when needed. */ + unsigned long next_bb_id; + + /* Summary of all big block states. */ + unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; + + /* One byte state per big block. See sbm.mb_states. */ + uint8_t *bb_states; + + /* The block size used for plugging/adding/removing. */ + uint64_t bb_size; + } bbm; + }; /* - * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap. + * Mutex that protects the sbm.mb_count, sbm.mb_states, + * sbm.sb_states, bbm.bb_count, and bbm.bb_states * * When this lock is held the pointers can't change, ONLINE and * OFFLINE blocks can't change the state and no subblocks will get @@ -160,6 +249,11 @@ static DEFINE_MUTEX(virtio_mem_mutex); static LIST_HEAD(virtio_mem_devices); static void virtio_mem_online_page_cb(struct page *page, unsigned int order); +static void virtio_mem_fake_offline_going_offline(unsigned long pfn, + unsigned long nr_pages); +static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, + unsigned long nr_pages); +static void virtio_mem_retry(struct virtio_mem *vm); /* * Register a virtio-mem device so it will be considered for the online_page @@ -213,6 +307,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) } /* + * Calculate the big block id of a given address. + */ +static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, + uint64_t addr) +{ + return addr / vm->bbm.bb_size; +} + +/* + * Calculate the physical start address of a given big block id. + */ +static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, + unsigned long bb_id) +{ + return bb_id * vm->bbm.bb_size; +} + +/* * Calculate the subblock id of a given address. */ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, @@ -221,89 +333,164 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); - return (addr - mb_addr) / vm->subblock_size; + return (addr - mb_addr) / vm->sbm.sb_size; +} + +/* + * Set the state of a big block, taking care of the state counter. + */ +static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, + unsigned long bb_id, + enum virtio_mem_bbm_bb_state state) +{ + const unsigned long idx = bb_id - vm->bbm.first_bb_id; + enum virtio_mem_bbm_bb_state old_state; + + old_state = vm->bbm.bb_states[idx]; + vm->bbm.bb_states[idx] = state; + + BUG_ON(vm->bbm.bb_count[old_state] == 0); + vm->bbm.bb_count[old_state]--; + vm->bbm.bb_count[state]++; +} + +/* + * Get the state of a big block. + */ +static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, + unsigned long bb_id) +{ + return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; +} + +/* + * Prepare the big block state array for the next big block. + */ +static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) +{ + unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; + unsigned long new_bytes = old_bytes + 1; + int old_pages = PFN_UP(old_bytes); + int new_pages = PFN_UP(new_bytes); + uint8_t *new_array; + + if (vm->bbm.bb_states && old_pages == new_pages) + return 0; + + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (vm->bbm.bb_states) + memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); + vfree(vm->bbm.bb_states); + vm->bbm.bb_states = new_array; + mutex_unlock(&vm->hotplug_mutex); + + return 0; } +#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.first_bb_id; \ + _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id++) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + +#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.next_bb_id - 1; \ + _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id--) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + /* * Set the state of a memory block, taking care of the state counter. */ -static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id, - enum virtio_mem_mb_state state) +static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, + unsigned long mb_id, uint8_t state) { - const unsigned long idx = mb_id - vm->first_mb_id; - enum virtio_mem_mb_state old_state; + const unsigned long idx = mb_id - vm->sbm.first_mb_id; + uint8_t old_state; - old_state = vm->mb_state[idx]; - vm->mb_state[idx] = state; + old_state = vm->sbm.mb_states[idx]; + vm->sbm.mb_states[idx] = state; - BUG_ON(vm->nb_mb_state[old_state] == 0); - vm->nb_mb_state[old_state]--; - vm->nb_mb_state[state]++; + BUG_ON(vm->sbm.mb_count[old_state] == 0); + vm->sbm.mb_count[old_state]--; + vm->sbm.mb_count[state]++; } /* * Get the state of a memory block. */ -static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm, - unsigned long mb_id) +static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, + unsigned long mb_id) { - const unsigned long idx = mb_id - vm->first_mb_id; + const unsigned long idx = mb_id - vm->sbm.first_mb_id; - return vm->mb_state[idx]; + return vm->sbm.mb_states[idx]; } /* * Prepare the state array for the next memory block. */ -static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) +static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) { - unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1; - unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2; - int old_pages = PFN_UP(old_bytes); - int new_pages = PFN_UP(new_bytes); - uint8_t *new_mb_state; + int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); + int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); + uint8_t *new_array; - if (vm->mb_state && old_pages == new_pages) + if (vm->sbm.mb_states && old_pages == new_pages) return 0; - new_mb_state = vzalloc(new_pages * PAGE_SIZE); - if (!new_mb_state) + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) return -ENOMEM; mutex_lock(&vm->hotplug_mutex); - if (vm->mb_state) - memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE); - vfree(vm->mb_state); - vm->mb_state = new_mb_state; + if (vm->sbm.mb_states) + memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); + vfree(vm->sbm.mb_states); + vm->sbm.mb_states = new_array; mutex_unlock(&vm->hotplug_mutex); return 0; } -#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \ - for (_mb_id = _vm->first_mb_id; \ - _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \ +#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ + for (_mb_id = _vm->sbm.first_mb_id; \ + _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id++) \ - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) -#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \ - for (_mb_id = _vm->next_mb_id - 1; \ - _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \ +#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ + for (_mb_id = _vm->sbm.next_mb_id - 1; \ + _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id--) \ - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) + +/* + * Calculate the bit number in the subblock bitmap for the given subblock + * inside the given memory block. + */ +static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, + unsigned long mb_id, int sb_id) +{ + return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; +} /* * Mark all selected subblocks plugged. * * Will not modify the state of the memory block. */ -static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); - __bitmap_set(vm->sb_bitmap, bit, count); + __bitmap_set(vm->sbm.sb_states, bit, count); } /* @@ -311,105 +498,114 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, * * Will not modify the state of the memory block. */ -static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); - __bitmap_clear(vm->sb_bitmap, bit, count); + __bitmap_clear(vm->sbm.sb_states, bit, count); } /* * Test if all selected subblocks are plugged. */ -static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); if (count == 1) - return test_bit(bit, vm->sb_bitmap); + return test_bit(bit, vm->sbm.sb_states); /* TODO: Helper similar to bitmap_set() */ - return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >= + return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= bit + count; } /* * Test if all selected subblocks are unplugged. */ -static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); /* TODO: Helper similar to bitmap_set() */ - return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count; + return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= + bit + count; } /* - * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is + * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is * none. */ -static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm, +static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, unsigned long mb_id) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); - return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - - bit; + return find_next_zero_bit(vm->sbm.sb_states, + bit + vm->sbm.sbs_per_mb, bit) - bit; } /* * Prepare the subblock bitmap for the next memory block. */ -static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) +static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) { - const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; - const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb; - const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb; + const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; + const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; + const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); - unsigned long *new_sb_bitmap, *old_sb_bitmap; + unsigned long *new_bitmap, *old_bitmap; - if (vm->sb_bitmap && old_pages == new_pages) + if (vm->sbm.sb_states && old_pages == new_pages) return 0; - new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE); - if (!new_sb_bitmap) + new_bitmap = vzalloc(new_pages * PAGE_SIZE); + if (!new_bitmap) return -ENOMEM; mutex_lock(&vm->hotplug_mutex); - if (new_sb_bitmap) - memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE); + if (new_bitmap) + memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); - old_sb_bitmap = vm->sb_bitmap; - vm->sb_bitmap = new_sb_bitmap; + old_bitmap = vm->sbm.sb_states; + vm->sbm.sb_states = new_bitmap; mutex_unlock(&vm->hotplug_mutex); - vfree(old_sb_bitmap); + vfree(old_bitmap); return 0; } /* - * Try to add a memory block to Linux. This will usually only fail - * if out of memory. + * Test if we could add memory without creating too much offline memory - + * to avoid running OOM if memory is getting onlined deferred. + */ +static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) +{ + if (WARN_ON_ONCE(size > vm->offline_threshold)) + return false; + + return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; +} + +/* + * Try adding memory to Linux. Will usually only fail if out of memory. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) { - const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; - - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); + int rc; /* * When force-unloading the driver and we still have memory added to @@ -422,53 +618,155 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) return -ENOMEM; } - dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); - return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), - vm->resource_name, - MEMHP_MERGE_RESOURCE); + dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + /* Memory might get onlined immediately. */ + atomic64_add(size, &vm->offline_size); + rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, + MEMHP_MERGE_RESOURCE); + if (rc) { + atomic64_sub(size, &vm->offline_size); + dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); + /* + * TODO: Linux MM does not properly clean up yet in all cases + * where adding of memory failed - especially on -ENOMEM. + */ + } + return rc; +} + +/* + * See virtio_mem_add_memory(): Try adding a single Linux memory block. + */ +static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_add_memory(vm, addr, size); +} + +/* + * See virtio_mem_add_memory(): Try adding a big block. + */ +static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_add_memory(vm, addr, size); } /* - * Try to remove a memory block from Linux. Will only fail if the memory block - * is not offline. + * Try removing memory from Linux. Will only fail if memory blocks aren't + * offline. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + int rc; + + dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + rc = remove_memory(vm->nid, addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + } else { + dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); + } + return rc; +} + +/* + * See virtio_mem_remove_memory(): Try removing a single Linux memory block. + */ +static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; + const uint64_t size = memory_block_size_bytes(); - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); + return virtio_mem_remove_memory(vm, addr, size); +} - dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); - return remove_memory(nid, addr, memory_block_size_bytes()); +/* + * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered + * by the big block. + */ +static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_remove_memory(vm, addr, size); } /* - * Try to offline and remove a memory block from Linux. + * Try offlining and removing memory from Linux. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, - unsigned long mb_id) +static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, + uint64_t addr, + uint64_t size) +{ + int rc; + + dev_dbg(&vm->vdev->dev, + "offlining and removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + + rc = offline_and_remove_memory(vm->nid, addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + } else { + dev_dbg(&vm->vdev->dev, + "offlining and removing memory failed: %d\n", rc); + } + return rc; +} + +/* + * See virtio_mem_offline_and_remove_memory(): Try offlining and removing + * a single Linux memory block. + */ +static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, + unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_offline_and_remove_memory(vm, addr, size); +} - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); +/* + * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a + * all Linux memory blocks covered by the big block. + */ +static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; - dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", - mb_id); - return offline_and_remove_memory(nid, addr, memory_block_size_bytes()); + return virtio_mem_offline_and_remove_memory(vm, addr, size); } /* @@ -499,31 +797,28 @@ static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) * Test if a virtio-mem device overlaps with the given range. Can be called * from (notifier) callbacks lockless. */ -static bool virtio_mem_overlaps_range(struct virtio_mem *vm, - unsigned long start, unsigned long size) +static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) { - unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id); - unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) + - memory_block_size_bytes(); - - return start < dev_end && dev_start < start + size; + return start < vm->addr + vm->region_size && vm->addr < start + size; } /* - * Test if a virtio-mem device owns a memory block. Can be called from + * Test if a virtio-mem device contains a given range. Can be called from * (notifier) callbacks lockless. */ -static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id) +static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) { - return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id; + return start >= vm->addr && start + size <= vm->addr + vm->region_size; } -static int virtio_mem_notify_going_online(struct virtio_mem *vm, - unsigned long mb_id) +static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, + unsigned long mb_id) { - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - case VIRTIO_MEM_MB_STATE_OFFLINE: + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_OFFLINE: return NOTIFY_OK; default: break; @@ -533,108 +828,100 @@ static int virtio_mem_notify_going_online(struct virtio_mem *vm, return NOTIFY_BAD; } -static void virtio_mem_notify_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, + unsigned long mb_id) { - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); break; - case VIRTIO_MEM_MB_STATE_ONLINE: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE); + case VIRTIO_MEM_SBM_MB_ONLINE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); break; default: BUG(); break; } - - /* - * Trigger the workqueue, maybe we can now unplug memory. Also, - * when we offline and remove a memory block, this will re-trigger - * us immediately - which is often nice because the removal of - * the memory block (e.g., memmap) might have freed up memory - * on other memory blocks we manage. - */ - virtio_mem_retry(vm); } -static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) +static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, + unsigned long mb_id) { - unsigned long nb_offline; - - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); break; - case VIRTIO_MEM_MB_STATE_OFFLINE: - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE); + case VIRTIO_MEM_SBM_MB_OFFLINE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE); break; default: BUG(); break; } - nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; - - /* see if we can add new blocks now that we onlined one block */ - if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1) - virtio_mem_retry(vm); } -static void virtio_mem_notify_going_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, + unsigned long mb_id) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); - struct page *page; + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; - int sb_id, i; + int sb_id; - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; - /* - * Drop our reference to the pages so the memory can get - * offlined and add the unplugged pages to the managed - * page counters (so offlining code can correctly subtract - * them again). - */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); - adjust_managed_page_count(pfn_to_page(pfn), nr_pages); - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(pfn + i); - if (WARN_ON(!page_ref_dec_and_test(page))) - dump_page(page, "unplugged page referenced"); - } + sb_id * vm->sbm.sb_size); + virtio_mem_fake_offline_going_offline(pfn, nr_pages); } } -static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long mb_id) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; - int sb_id, i; + int sb_id; - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; - /* - * Get the reference we dropped when going offline and - * subtract the unplugged pages from the managed page - * counters. - */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); - adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); - for (i = 0; i < nr_pages; i++) - page_ref_inc(pfn_to_page(pfn + i)); + sb_id * vm->sbm.sb_size); + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); } } +static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + /* + * When marked as "fake-offline", all online memory of this device block + * is allocated by us. Otherwise, we don't have any memory allocated. + */ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_going_offline(pfn, nr_pages); +} + +static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); +} + /* * This callback will either be called synchronously from add_memory() or * asynchronously (e.g., triggered via user space). We have to be careful @@ -648,20 +935,33 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, struct memory_notify *mhp = arg; const unsigned long start = PFN_PHYS(mhp->start_pfn); const unsigned long size = PFN_PHYS(mhp->nr_pages); - const unsigned long mb_id = virtio_mem_phys_to_mb_id(start); int rc = NOTIFY_OK; + unsigned long id; if (!virtio_mem_overlaps_range(vm, start, size)) return NOTIFY_DONE; - /* - * Memory is onlined/offlined in memory block granularity. We cannot - * cross virtio-mem device boundaries and memory block boundaries. Bail - * out if this ever changes. - */ - if (WARN_ON_ONCE(size != memory_block_size_bytes() || - !IS_ALIGNED(start, memory_block_size_bytes()))) - return NOTIFY_BAD; + if (vm->in_sbm) { + id = virtio_mem_phys_to_mb_id(start); + /* + * In SBM, we add memory in separate memory blocks - we expect + * it to be onlined/offlined in the same granularity. Bail out + * if this ever changes. + */ + if (WARN_ON_ONCE(size != memory_block_size_bytes() || |
