summaryrefslogtreecommitdiff
path: root/drivers/virtio/virtio_mem.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/virtio/virtio_mem.c')
-rw-r--r--drivers/virtio/virtio_mem.c1789
1 files changed, 1287 insertions, 502 deletions
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 181e2f18beae..9fc9ec4a25f5 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -27,20 +27,74 @@ static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
-enum virtio_mem_mb_state {
+static bool force_bbm;
+module_param(force_bbm, bool, 0444);
+MODULE_PARM_DESC(force_bbm,
+ "Force Big Block Mode. Default is 0 (auto-selection)");
+
+static unsigned long bbm_block_size;
+module_param(bbm_block_size, ulong, 0444);
+MODULE_PARM_DESC(bbm_block_size,
+ "Big Block size in bytes. Default is 0 (auto-detection).");
+
+static bool bbm_safe_unplug = true;
+module_param(bbm_safe_unplug, bool, 0444);
+MODULE_PARM_DESC(bbm_safe_unplug,
+ "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
+
+/*
+ * virtio-mem currently supports the following modes of operation:
+ *
+ * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
+ * size of a Sub Block (SB) is determined based on the device block size, the
+ * pageblock size, and the maximum allocation granularity of the buddy.
+ * Subblocks within a Linux memory block might either be plugged or unplugged.
+ * Memory is added/removed to Linux MM in Linux memory block granularity.
+ *
+ * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
+ * Memory is added/removed to Linux MM in Big Block granularity.
+ *
+ * The mode is determined automatically based on the Linux memory block size
+ * and the device block size.
+ *
+ * User space / core MM (auto onlining) is responsible for onlining added
+ * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
+ * always onlined separately, and all memory within a Linux memory block is
+ * onlined to the same zone - virtio-mem relies on this behavior.
+ */
+
+/*
+ * State of a Linux memory block in SBM.
+ */
+enum virtio_mem_sbm_mb_state {
/* Unplugged, not added to Linux. Can be reused later. */
- VIRTIO_MEM_MB_STATE_UNUSED = 0,
+ VIRTIO_MEM_SBM_MB_UNUSED = 0,
/* (Partially) plugged, not added to Linux. Error on add_memory(). */
- VIRTIO_MEM_MB_STATE_PLUGGED,
+ VIRTIO_MEM_SBM_MB_PLUGGED,
/* Fully plugged, fully added to Linux, offline. */
- VIRTIO_MEM_MB_STATE_OFFLINE,
+ VIRTIO_MEM_SBM_MB_OFFLINE,
/* Partially plugged, fully added to Linux, offline. */
- VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
/* Fully plugged, fully added to Linux, online. */
- VIRTIO_MEM_MB_STATE_ONLINE,
+ VIRTIO_MEM_SBM_MB_ONLINE,
/* Partially plugged, fully added to Linux, online. */
- VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
- VIRTIO_MEM_MB_STATE_COUNT
+ VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL,
+ VIRTIO_MEM_SBM_MB_COUNT
+};
+
+/*
+ * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
+ */
+enum virtio_mem_bbm_bb_state {
+ /* Unplugged, not added to Linux. Can be reused later. */
+ VIRTIO_MEM_BBM_BB_UNUSED = 0,
+ /* Plugged, not added to Linux. Error on add_memory(). */
+ VIRTIO_MEM_BBM_BB_PLUGGED,
+ /* Plugged and added to Linux. */
+ VIRTIO_MEM_BBM_BB_ADDED,
+ /* All online parts are fake-offline, ready to remove. */
+ VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
+ VIRTIO_MEM_BBM_BB_COUNT
};
struct virtio_mem {
@@ -51,6 +105,7 @@ struct virtio_mem {
/* Workqueue that processes the plug/unplug requests. */
struct work_struct wq;
+ atomic_t wq_active;
atomic_t config_changed;
/* Virtqueue for guest->host requests. */
@@ -70,27 +125,13 @@ struct virtio_mem {
/* The device block size (for communicating with the device). */
uint64_t device_block_size;
- /* The translated node id. NUMA_NO_NODE in case not specified. */
+ /* The determined node id for all memory of the device. */
int nid;
/* Physical start address of the memory region. */
uint64_t addr;
/* Maximum region size in bytes. */
uint64_t region_size;
- /* The subblock size. */
- uint64_t subblock_size;
- /* The number of subblocks per memory block. */
- uint32_t nb_sb_per_mb;
-
- /* Id of the first memory block of this device. */
- unsigned long first_mb_id;
- /* Id of the last memory block of this device. */
- unsigned long last_mb_id;
- /* Id of the last usable memory block of this device. */
- unsigned long last_usable_mb_id;
- /* Id of the next memory bock to prepare when needed. */
- unsigned long next_mb_id;
-
/* The parent resource for all memory added via this device. */
struct resource *parent_resource;
/*
@@ -99,31 +140,79 @@ struct virtio_mem {
*/
const char *resource_name;
- /* Summary of all memory block states. */
- unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
-#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10
-
- /*
- * One byte state per memory block.
- *
- * Allocated via vmalloc(). When preparing new blocks, resized
- * (alloc+copy+free) when needed (crossing pages with the next mb).
- * (when crossing pages).
- *
- * With 128MB memory blocks, we have states for 512GB of memory in one
- * page.
- */
- uint8_t *mb_state;
-
/*
- * $nb_sb_per_mb bit per memory block. Handled similar to mb_state.
- *
- * With 4MB subblocks, we manage 128GB of memory in one page.
+ * We don't want to add too much memory if it's not getting onlined,
+ * to avoid running OOM. Besides this threshold, we allow to have at
+ * least two offline blocks at a time (whatever is bigger).
*/
- unsigned long *sb_bitmap;
+#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024)
+ atomic64_t offline_size;
+ uint64_t offline_threshold;
+
+ /* If set, the driver is in SBM, otherwise in BBM. */
+ bool in_sbm;
+
+ union {
+ struct {
+ /* Id of the first memory block of this device. */
+ unsigned long first_mb_id;
+ /* Id of the last usable memory block of this device. */
+ unsigned long last_usable_mb_id;
+ /* Id of the next memory block to prepare when needed. */
+ unsigned long next_mb_id;
+
+ /* The subblock size. */
+ uint64_t sb_size;
+ /* The number of subblocks per Linux memory block. */
+ uint32_t sbs_per_mb;
+
+ /* Summary of all memory block states. */
+ unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
+
+ /*
+ * One byte state per memory block. Allocated via
+ * vmalloc(). Resized (alloc+copy+free) on demand.
+ *
+ * With 128 MiB memory blocks, we have states for 512
+ * GiB of memory in one 4 KiB page.
+ */
+ uint8_t *mb_states;
+
+ /*
+ * Bitmap: one bit per subblock. Allocated similar to
+ * sbm.mb_states.
+ *
+ * A set bit means the corresponding subblock is
+ * plugged, otherwise it's unplugged.
+ *
+ * With 4 MiB subblocks, we manage 128 GiB of memory
+ * in one 4 KiB page.
+ */
+ unsigned long *sb_states;
+ } sbm;
+
+ struct {
+ /* Id of the first big block of this device. */
+ unsigned long first_bb_id;
+ /* Id of the last usable big block of this device. */
+ unsigned long last_usable_bb_id;
+ /* Id of the next big block to prepare when needed. */
+ unsigned long next_bb_id;
+
+ /* Summary of all big block states. */
+ unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
+
+ /* One byte state per big block. See sbm.mb_states. */
+ uint8_t *bb_states;
+
+ /* The block size used for plugging/adding/removing. */
+ uint64_t bb_size;
+ } bbm;
+ };
/*
- * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
+ * Mutex that protects the sbm.mb_count, sbm.mb_states,
+ * sbm.sb_states, bbm.bb_count, and bbm.bb_states
*
* When this lock is held the pointers can't change, ONLINE and
* OFFLINE blocks can't change the state and no subblocks will get
@@ -160,6 +249,11 @@ static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);
static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
+ unsigned long nr_pages);
+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
+ unsigned long nr_pages);
+static void virtio_mem_retry(struct virtio_mem *vm);
/*
* Register a virtio-mem device so it will be considered for the online_page
@@ -213,6 +307,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
}
/*
+ * Calculate the big block id of a given address.
+ */
+static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
+					       uint64_t addr)
+{
+	/* Big block ids are plain multiples of the big block size. */
+	const uint64_t bb_size = vm->bbm.bb_size;
+
+	return addr / bb_size;
+}
+
+/*
+ * Calculate the physical start address of a given big block id.
+ */
+static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
+					  unsigned long bb_id)
+{
+	const uint64_t bb_size = vm->bbm.bb_size;
+
+	/* The multiplication is carried out in 64 bit, as bb_size is. */
+	return bb_id * bb_size;
+}
+
+/*
* Calculate the subblock id of a given address.
*/
static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
@@ -221,89 +333,164 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
- return (addr - mb_addr) / vm->subblock_size;
+ return (addr - mb_addr) / vm->sbm.sb_size;
+}
+
+/*
+ * Set the state of a big block, taking care of the state counter.
+ */
+static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
+					 unsigned long bb_id,
+					 enum virtio_mem_bbm_bb_state state)
+{
+	uint8_t *slot = &vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
+	const enum virtio_mem_bbm_bb_state prev = *slot;
+
+	*slot = state;
+
+	/* Keep the per-state summary counters in sync with the array. */
+	BUG_ON(vm->bbm.bb_count[prev] == 0);
+	vm->bbm.bb_count[prev]--;
+	vm->bbm.bb_count[state]++;
+}
+
+/*
+ * Get the state of a big block.
+ */
+static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
+								unsigned long bb_id)
+{
+	/* The state array is indexed relative to the first big block. */
+	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
+
+	return vm->bbm.bb_states[idx];
+}
+
+/*
+ * Prepare the big block state array for the next big block.
+ */
+static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
+{
+	const unsigned long nr_bbs = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
+	const int cur_pages = PFN_UP(nr_bbs);
+	const int req_pages = PFN_UP(nr_bbs + 1);
+	uint8_t *states;
+
+	/* One byte per big block: nothing to do unless we cross a page. */
+	if (vm->bbm.bb_states && cur_pages == req_pages)
+		return 0;
+
+	states = vzalloc(req_pages * PAGE_SIZE);
+	if (!states)
+		return -ENOMEM;
+
+	/* bbm.bb_states is protected by the hotplug_mutex: swap under it. */
+	mutex_lock(&vm->hotplug_mutex);
+	if (vm->bbm.bb_states)
+		memcpy(states, vm->bbm.bb_states, cur_pages * PAGE_SIZE);
+	vfree(vm->bbm.bb_states);
+	vm->bbm.bb_states = states;
+	mutex_unlock(&vm->hotplug_mutex);
+
+	return 0;
}
+/*
+ * Iterate over all big blocks of _vm that are in state _state, in ascending
+ * order. Stops early once no big block in that state is left.
+ *
+ * Only the macro parameter _vm is referenced, so the caller's variable does
+ * not have to be named "vm".
+ */
+#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
+	for (_bb_id = _vm->bbm.first_bb_id; \
+	     _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
+	     _bb_id++) \
+		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
+
+/*
+ * Same as virtio_mem_bbm_for_each_bb(), but in descending order, starting
+ * with the most recently prepared big block.
+ */
+#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
+	for (_bb_id = _vm->bbm.next_bb_id - 1; \
+	     _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
+	     _bb_id--) \
+		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
+
/*
* Set the state of a memory block, taking care of the state counter.
*/
-static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id,
- enum virtio_mem_mb_state state)
+static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
+ unsigned long mb_id, uint8_t state)
{
- const unsigned long idx = mb_id - vm->first_mb_id;
- enum virtio_mem_mb_state old_state;
+ const unsigned long idx = mb_id - vm->sbm.first_mb_id;
+ uint8_t old_state;
- old_state = vm->mb_state[idx];
- vm->mb_state[idx] = state;
+ old_state = vm->sbm.mb_states[idx];
+ vm->sbm.mb_states[idx] = state;
- BUG_ON(vm->nb_mb_state[old_state] == 0);
- vm->nb_mb_state[old_state]--;
- vm->nb_mb_state[state]++;
+ BUG_ON(vm->sbm.mb_count[old_state] == 0);
+ vm->sbm.mb_count[old_state]--;
+ vm->sbm.mb_count[state]++;
}
/*
* Get the state of a memory block.
*/
-static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm,
- unsigned long mb_id)
+static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
+ unsigned long mb_id)
{
- const unsigned long idx = mb_id - vm->first_mb_id;
+ const unsigned long idx = mb_id - vm->sbm.first_mb_id;
- return vm->mb_state[idx];
+ return vm->sbm.mb_states[idx];
}
/*
* Prepare the state array for the next memory block.
*/
-static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
+static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
{
- unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
- unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
- int old_pages = PFN_UP(old_bytes);
- int new_pages = PFN_UP(new_bytes);
- uint8_t *new_mb_state;
+ int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
+ int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
+ uint8_t *new_array;
- if (vm->mb_state && old_pages == new_pages)
+ if (vm->sbm.mb_states && old_pages == new_pages)
return 0;
- new_mb_state = vzalloc(new_pages * PAGE_SIZE);
- if (!new_mb_state)
+ new_array = vzalloc(new_pages * PAGE_SIZE);
+ if (!new_array)
return -ENOMEM;
mutex_lock(&vm->hotplug_mutex);
- if (vm->mb_state)
- memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE);
- vfree(vm->mb_state);
- vm->mb_state = new_mb_state;
+ if (vm->sbm.mb_states)
+ memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
+ vfree(vm->sbm.mb_states);
+ vm->sbm.mb_states = new_array;
mutex_unlock(&vm->hotplug_mutex);
return 0;
}
-#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \
- for (_mb_id = _vm->first_mb_id; \
- _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \
+#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
+ for (_mb_id = _vm->sbm.first_mb_id; \
+ _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
_mb_id++) \
- if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
+ if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
-#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \
- for (_mb_id = _vm->next_mb_id - 1; \
- _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \
+#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
+ for (_mb_id = _vm->sbm.next_mb_id - 1; \
+ _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
_mb_id--) \
- if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
+ if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
+
+/*
+ * Calculate the bit number in the subblock bitmap for the given subblock
+ * inside the given memory block.
+ */
+static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
+					   unsigned long mb_id, int sb_id)
+{
+	/* Each memory block owns sbs_per_mb consecutive bits in the bitmap. */
+	const unsigned long mb_idx = mb_id - vm->sbm.first_mb_id;
+
+	return mb_idx * vm->sbm.sbs_per_mb + sb_id;
+}
/*
* Mark all selected subblocks plugged.
*
* Will not modify the state of the memory block.
*/
-static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
- unsigned long mb_id, int sb_id,
- int count)
+static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
+ unsigned long mb_id, int sb_id,
+ int count)
{
- const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+ const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
- __bitmap_set(vm->sb_bitmap, bit, count);
+ __bitmap_set(vm->sbm.sb_states, bit, count);
}
/*
@@ -311,105 +498,114 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
*
* Will not modify the state of the memory block.
*/
-static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm,
- unsigned long mb_id, int sb_id,
- int count)
+static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
+ unsigned long mb_id, int sb_id,
+ int count)
{
- const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+ const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
- __bitmap_clear(vm->sb_bitmap, bit, count);
+ __bitmap_clear(vm->sbm.sb_states, bit, count);
}
/*
* Test if all selected subblocks are plugged.
*/
-static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm,
- unsigned long mb_id, int sb_id,
- int count)
+static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
+ unsigned long mb_id, int sb_id,
+ int count)
{
- const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+ const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
if (count == 1)
- return test_bit(bit, vm->sb_bitmap);
+ return test_bit(bit, vm->sbm.sb_states);
/* TODO: Helper similar to bitmap_set() */
- return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >=
+ return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
bit + count;
}
/*
* Test if all selected subblocks are unplugged.
*/
-static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm,
- unsigned long mb_id, int sb_id,
- int count)
+static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
+ unsigned long mb_id, int sb_id,
+ int count)
{
- const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+ const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
/* TODO: Helper similar to bitmap_set() */
- return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count;
+ return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
+ bit + count;
}
/*
- * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is
+ * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
* none.
*/
-static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm,
+static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
unsigned long mb_id)
{
- const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb;
+ const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
- return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) -
- bit;
+ return find_next_zero_bit(vm->sbm.sb_states,
+ bit + vm->sbm.sbs_per_mb, bit) - bit;
}
/*
* Prepare the subblock bitmap for the next memory block.
*/
-static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm)
+static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
{
- const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id;
- const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb;
- const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb;
+ const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
+ const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
+ const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
- unsigned long *new_sb_bitmap, *old_sb_bitmap;
+ unsigned long *new_bitmap, *old_bitmap;
- if (vm->sb_bitmap && old_pages == new_pages)
+ if (vm->sbm.sb_states && old_pages == new_pages)
return 0;
- new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE);
- if (!new_sb_bitmap)
+ new_bitmap = vzalloc(new_pages * PAGE_SIZE);
+ if (!new_bitmap)
return -ENOMEM;
mutex_lock(&vm->hotplug_mutex);
- if (new_sb_bitmap)
- memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE);
+ /*
+ * Only copy if there is an old bitmap: on first use sbm.sb_states is
+ * still NULL and must not be handed to memcpy(). new_bitmap is known
+ * to be non-NULL at this point, so testing it would be pointless.
+ */
+ if (vm->sbm.sb_states)
+ memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
- old_sb_bitmap = vm->sb_bitmap;
- vm->sb_bitmap = new_sb_bitmap;
+ old_bitmap = vm->sbm.sb_states;
+ vm->sbm.sb_states = new_bitmap;
mutex_unlock(&vm->hotplug_mutex);
- vfree(old_sb_bitmap);
+ vfree(old_bitmap);
return 0;
}
/*
- * Try to add a memory block to Linux. This will usually only fail
- * if out of memory.
+ * Test if we could add memory without creating too much offline memory -
+ * to avoid running OOM if memory is getting onlined deferred.
+ */
+static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
+{
+	uint64_t offline;
+
+	/* A single request above the threshold can never be satisfied. */
+	if (WARN_ON_ONCE(size > vm->offline_threshold))
+		return false;
+
+	offline = atomic64_read(&vm->offline_size);
+	return offline + size <= vm->offline_threshold;
+}
+
+/*
+ * Try adding memory to Linux. Will usually only fail if out of memory.
*
* Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code).
*
- * Will not modify the state of the memory block.
+ * Will not modify the state of memory blocks in virtio-mem.
*/
-static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
+static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
+ uint64_t size)
{
- const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
- int nid = vm->nid;
-
- if (nid == NUMA_NO_NODE)
- nid = memory_add_physaddr_to_nid(addr);
+ int rc;
/*
* When force-unloading the driver and we still have memory added to
@@ -422,53 +618,155 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
return -ENOMEM;
}
- dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
- return add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
- vm->resource_name,
- MEMHP_MERGE_RESOURCE);
+ dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
+ addr + size - 1);
+ /* Memory might get onlined immediately. */
+ atomic64_add(size, &vm->offline_size);
+ rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
+ MEMHP_MERGE_RESOURCE);
+ if (rc) {
+ atomic64_sub(size, &vm->offline_size);
+ dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
+ /*
+ * TODO: Linux MM does not properly clean up yet in all cases
+ * where adding of memory failed - especially on -ENOMEM.
+ */
+ }
+ return rc;
+}
+
+/*
+ * See virtio_mem_add_memory(): Try adding a single Linux memory block.
+ */
+static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
+{
+	/* Exactly one Linux memory block, starting at the block's address. */
+	return virtio_mem_add_memory(vm, virtio_mem_mb_id_to_phys(mb_id),
+				     memory_block_size_bytes());
+}
+
+/*
+ * See virtio_mem_add_memory(): Try adding a big block.
+ */
+static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+	/* The whole big block is added in one go. */
+	return virtio_mem_add_memory(vm, virtio_mem_bb_id_to_phys(vm, bb_id),
+				     vm->bbm.bb_size);
+}
/*
- * Try to remove a memory block from Linux. Will only fail if the memory block
- * is not offline.
+ * Try removing memory from Linux. Will only fail if memory blocks aren't
+ * offline.
*
* Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code).
*
- * Will not modify the state of the memory block.
+ * Will not modify the state of memory blocks in virtio-mem.
*/
-static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
+static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
+				    uint64_t size)
+{
+	int rc;
+
+	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	rc = remove_memory(vm->nid, addr, size);
+	if (rc) {
+		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
+		return rc;
+	}
+
+	/* The removed range no longer counts as offline memory. */
+	atomic64_sub(size, &vm->offline_size);
+	/*
+	 * Removing memory might have freed up memory we can now unplug:
+	 * retry right away instead of waiting.
+	 */
+	virtio_mem_retry(vm);
+	return 0;
+}
+
+/*
+ * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
+ */
+static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
- int nid = vm->nid;
+ const uint64_t size = memory_block_size_bytes();
- if (nid == NUMA_NO_NODE)
- nid = memory_add_physaddr_to_nid(addr);
+ return virtio_mem_remove_memory(vm, addr, size);
+}
- dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id);
- return remove_memory(nid, addr, memory_block_size_bytes());
+/*
+ * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered
+ * by the big block.
+ */
+static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+	/* Remove the complete range spanned by the big block. */
+	return virtio_mem_remove_memory(vm,
+					virtio_mem_bb_id_to_phys(vm, bb_id),
+					vm->bbm.bb_size);
}
/*
- * Try to offline and remove a memory block from Linux.
+ * Try offlining and removing memory from Linux.
*
* Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code).
*
- * Will not modify the state of the memory block.
+ * Will not modify the state of memory blocks in virtio-mem.
*/
-static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm,
- unsigned long mb_id)
+static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
+						uint64_t addr,
+						uint64_t size)
+{
+	int rc;
+
+	dev_dbg(&vm->vdev->dev,
+		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	rc = offline_and_remove_memory(vm->nid, addr, size);
+	if (rc) {
+		dev_dbg(&vm->vdev->dev,
+			"offlining and removing memory failed: %d\n", rc);
+		return rc;
+	}
+
+	/* The removed range no longer counts as offline memory. */
+	atomic64_sub(size, &vm->offline_size);
+	/*
+	 * Removing memory might have freed up memory we can now unplug:
+	 * retry right away instead of waiting.
+	 */
+	virtio_mem_retry(vm);
+	return 0;
+}
+
+/*
+ * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
+ * a single Linux memory block.
+ */
+static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
+ unsigned long mb_id)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
- int nid = vm->nid;
+ const uint64_t size = memory_block_size_bytes();
+
+ return virtio_mem_offline_and_remove_memory(vm, addr, size);
+}
- if (nid == NUMA_NO_NODE)
- nid = memory_add_physaddr_to_nid(addr);
+/*
+ * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
+ * all Linux memory blocks covered by the big block.
+ */
+static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
+ unsigned long bb_id)
+{
+ const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
+ const uint64_t size = vm->bbm.bb_size;
- dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n",
- mb_id);
- return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
+ return virtio_mem_offline_and_remove_memory(vm, addr, size);
}
/*
@@ -499,31 +797,28 @@ static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
* Test if a virtio-mem device overlaps with the given range. Can be called
* from (notifier) callbacks lockless.
*/
-static bool virtio_mem_overlaps_range(struct virtio_mem *vm,
- unsigned long start, unsigned long size)
+static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
+ uint64_t size)
{
- unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id);
- unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) +
- memory_block_size_bytes();
-
- return start < dev_end && dev_start < start + size;
+ return start < vm->addr + vm->region_size && vm->addr < start + size;
}
/*
- * Test if a virtio-mem device owns a memory block. Can be called from
+ * Test if a virtio-mem device contains a given range. Can be called from
* (notifier) callbacks lockless.
*/
-static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
+static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
+ uint64_t size)
{
- return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id;
+ return start >= vm->addr && start + size <= vm->addr + vm->region_size;
}
-static int virtio_mem_notify_going_online(struct virtio_mem *vm,
- unsigned long mb_id)
+static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
+ unsigned long mb_id)
{
- switch (virtio_mem_mb_get_state(vm, mb_id)) {
- case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
- case VIRTIO_MEM_MB_STATE_OFFLINE:
+ switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+ case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
+ case VIRTIO_MEM_SBM_MB_OFFLINE:
return NOTIFY_OK;
default:
break;
@@ -533,108 +828,100 @@ static int virtio_mem_notify_going_online(struct virtio_mem *vm,
return NOTIFY_BAD;
}
-static void virtio_mem_notify_offline(struct virtio_mem *vm,
- unsigned long mb_id)
+static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
+ unsigned long mb_id)
{
- switch (virtio_mem_mb_get_state(vm, mb_id)) {
- case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL:
- virtio_mem_mb_set_state(vm, mb_id,
- VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
+ switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+ case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL:
+ virtio_mem_sbm_set_mb_state(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
break;
- case VIRTIO_MEM_MB_STATE_ONLINE:
- virtio_mem_mb_set_state(vm, mb_id,
- VIRTIO_MEM_MB_STATE_OFFLINE);
+ case VIRTIO_MEM_SBM_MB_ONLINE:
+ virtio_mem_sbm_set_mb_state(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_OFFLINE);
break;
default:
BUG();
break;
}
-
- /*
- * Trigger the workqueue, maybe we can now unplug memory. Also,
- * when we offline and remove a memory block, this will re-trigger
- * us immediately - which is often nice because the removal of
- * the memory block (e.g., memmap) might have freed up memory
- * on other memory blocks we manage.
- */
- virtio_mem_retry(vm);
}
-static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id)
+static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
+ unsigned long mb_id)
{
- unsigned long nb_offline;
-
- switch (virtio_mem_mb_get_state(vm, mb_id)) {
- case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
- virtio_mem_mb_set_state(vm, mb_id,
- VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
+ switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+ case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
+ virtio_mem_sbm_set_mb_state(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
break;
- case VIRTIO_MEM_MB_STATE_OFFLINE:
- virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE);
+ case VIRTIO_MEM_SBM_MB_OFFLINE:
+ virtio_mem_sbm_set_mb_state(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_ONLINE);
break;
default:
BUG();
break;
}
- nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
- vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
-
- /* see if we can add new blocks now that we onlined one block */
- if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1)
- virtio_mem_retry(vm);
}
-static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
- unsigned long mb_id)
+static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
+ unsigned long mb_id)
{
- const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
- struct page *page;
+ const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
unsigned long pfn;
- int sb_id, i;
+ int sb_id;
- for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
- if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
+ for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
+ if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
continue;
- /*
- * Drop our reference to the pages so the memory can get
- * offlined and add the unplugged pages to the managed
- * page counters (so offlining code can correctly subtract
- * them again).
- */
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
- sb_id * vm->subblock_size);
- adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
- for (i = 0; i < nr_pages; i++) {
- page = pfn_to_page(pfn + i);
- if (WARN_ON(!page_ref_dec_and_test(page)))
- dump_page(page, "unplugged page referenced");
- }
+ sb_id * vm->sbm.sb_size);
+ virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}
}
-static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm,
- unsigned long mb_id)
+static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
+ unsigned long mb_id)
{
- const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
+ const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
unsigned long pfn;
- int sb_id, i;
+ int sb_id;
- for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
- if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
+ for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
+ if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
continue;
- /*
- * Get the reference we dropped when going offline and
- * subtract the unplugged pages from the managed page
- * counters.
- */
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
- sb_id * vm->subblock_size);
- adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
- for (i = 0; i < nr_pages; i++)
- page_ref_inc(pfn_to_page(pfn + i));
+ sb_id * vm->sbm.sb_size);
+ virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}
}
+static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
+						 unsigned long bb_id,
+						 unsigned long pfn,
+						 unsigned long nr_pages)
+{
+	const enum virtio_mem_bbm_bb_state state =
+		virtio_mem_bbm_get_bb_state(vm, bb_id);
+
+	/*
+	 * Only for a "fake-offline" big block is all online memory allocated
+	 * by us. In any other state we hold no memory of the block, so there
+	 * is nothing to do.
+	 */
+	if (state == VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
+		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
+}
+
+static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
+						  unsigned long bb_id,
+						  unsigned long pfn,
+						  unsigned long nr_pages)
+{
+	const enum virtio_mem_bbm_bb_state state =
+		virtio_mem_bbm_get_bb_state(vm, bb_id);
+
+	/* Nothing to do unless the big block is marked "fake-offline". */
+	if (state == VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
+		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
+}
+
/*
* This callback will either be called synchronously from add_memory() or
* asynchronously (e.g., triggered via user space). We have to be careful
@@ -648,20 +935,33 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
struct memory_notify *mhp = arg;
const unsigned long start = PFN_PHYS(mhp->start_pfn);
const unsigned long size = PFN_PHYS(mhp->nr_pages);
- const unsigned long mb_id = virtio_mem_phys_to_mb_id(start);
int rc = NOTIFY_OK;
+ unsigned long id;
if (!virtio_mem_overlaps_range(vm, start, size))
return NOTIFY_DONE;
- /*
- * Memory is onlined/offlined in memory block granularity. We cannot
- * cross virtio-mem device boundaries and memory block boundaries. Bail
- * out if this ever changes.
- */
- if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
- !IS_ALIGNED(start, memory_block_size_bytes())))
- return NOTIFY_BAD;
+ if (vm->in_sbm) {
+ id = virtio_mem_phys_to_mb_id(start);
+ /*
+ * In SBM, we add memory in separate memory blocks - we expect
+ * it to be onlined/offlined in the same granularity. Bail out
+ * if this ever changes.
+ */
+ if (WARN_ON_ONCE(size != memory_block_size_bytes() ||