diff options
| -rw-r--r-- | Documentation/ABI/testing/sysfs-bus-rbd | 4 | ||||
| -rw-r--r-- | drivers/block/rbd.c | 1389 | ||||
| -rw-r--r-- | drivers/block/rbd_types.h | 2 | ||||
| -rw-r--r-- | fs/ceph/addr.c | 60 | ||||
| -rw-r--r-- | fs/ceph/caps.c | 18 | ||||
| -rw-r--r-- | fs/ceph/file.c | 73 | ||||
| -rw-r--r-- | fs/ceph/inode.c | 15 | ||||
| -rw-r--r-- | fs/ceph/mds_client.c | 11 | ||||
| -rw-r--r-- | fs/ceph/super.c | 4 | ||||
| -rw-r--r-- | include/linux/backing-dev.h | 1 | ||||
| -rw-r--r-- | include/linux/ceph/libceph.h | 2 | ||||
| -rw-r--r-- | include/linux/ceph/osdmap.h | 1 | ||||
| -rw-r--r-- | include/linux/ceph/rados.h | 2 | ||||
| -rw-r--r-- | net/ceph/ceph_common.c | 3 | ||||
| -rw-r--r-- | net/ceph/messenger.c | 107 | ||||
| -rw-r--r-- | net/ceph/osd_client.c | 59 | ||||
| -rw-r--r-- | net/ceph/osdmap.c | 47 |
17 files changed, 1192 insertions, 606 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index 1cf2adf46b11..cd9213ccf3dc 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -70,6 +70,10 @@ snap_* A directory per each snapshot +parent + + Information identifying the pool, image, and snapshot id for + the parent image in a layered rbd image (format 2 only). Entries under /sys/bus/rbd/devices/<dev-id>/snap_<snap-name> ------------------------------------------------------------- diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bb3d9be3b1b4..89576a0b3f2e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -61,15 +61,29 @@ #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ -#define RBD_MAX_SNAP_NAME_LEN 32 +#define RBD_SNAP_DEV_NAME_PREFIX "snap_" +#define RBD_MAX_SNAP_NAME_LEN \ + (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) + #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ #define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" +/* This allows a single page to hold an image name sent by OSD */ +#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) #define RBD_IMAGE_ID_LEN_MAX 64 + #define RBD_OBJ_PREFIX_LEN_MAX 64 +/* Feature bits */ + +#define RBD_FEATURE_LAYERING 1 + +/* Features supported by this (client software) implementation. */ + +#define RBD_FEATURES_ALL (0) + /* * An RBD device name will be "rbd#", where the "rbd" comes from * RBD_DRV_NAME above, and # is a unique integer identifier. @@ -101,6 +115,27 @@ struct rbd_image_header { u64 obj_version; }; +/* + * An rbd image specification. + * + * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely + * identify an image. + */ +struct rbd_spec { + u64 pool_id; + char *pool_name; + + char *image_id; + size_t image_id_len; + char *image_name; + size_t image_name_len; + + u64 snap_id; + char *snap_name; + + struct kref kref; +}; + struct rbd_options { bool read_only; }; @@ -155,11 +190,8 @@ struct rbd_snap { }; struct rbd_mapping { - char *snap_name; - u64 snap_id; u64 size; u64 features; - bool snap_exists; bool read_only; }; @@ -173,7 +205,6 @@ struct rbd_device { struct gendisk *disk; /* blkdev's gendisk and rq */ u32 image_format; /* Either 1 or 2 */ - struct rbd_options rbd_opts; struct rbd_client *rbd_client; char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ @@ -181,17 +212,17 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; - char *image_id; - size_t image_id_len; - char *image_name; - size_t image_name_len; + bool exists; + struct rbd_spec *spec; + char *header_name; - char *pool_name; - int pool_id; struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; + struct rbd_spec *parent_spec; + u64 parent_overlap; + /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -204,6 +235,7 @@ struct rbd_device { /* sysfs related */ struct device dev; + unsigned long open_count; }; static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ @@ -218,7 +250,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); -static void __rbd_remove_snap_dev(struct rbd_snap *snap); +static void rbd_remove_snap_dev(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); @@ -258,17 +290,8 @@ static struct device rbd_root_dev = { # define rbd_assert(expr) ((void) 0) #endif /* !RBD_DEBUG */ -static struct device *rbd_get_dev(struct rbd_device *rbd_dev) -{ - return get_device(&rbd_dev->dev); -} - -static void rbd_put_dev(struct rbd_device *rbd_dev) -{ - put_device(&rbd_dev->dev); -} - -static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver); +static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -277,8 +300,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) return -EROFS; - rbd_get_dev(rbd_dev); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + (void) get_device(&rbd_dev->dev); set_device_ro(bdev, rbd_dev->mapping.read_only); + rbd_dev->open_count++; + mutex_unlock(&ctl_mutex); return 0; } @@ -287,7 +313,11 @@ static int rbd_release(struct gendisk *disk, fmode_t mode) { struct rbd_device *rbd_dev = disk->private_data; - rbd_put_dev(rbd_dev); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + rbd_assert(rbd_dev->open_count > 0); + rbd_dev->open_count--; + put_device(&rbd_dev->dev); + mutex_unlock(&ctl_mutex); return 0; } @@ -388,7 +418,7 @@ enum { static match_table_t rbd_opts_tokens = { /* int args above */ /* string args above */ - {Opt_read_only, "mapping.read_only"}, + {Opt_read_only, "read_only"}, {Opt_read_only, "ro"}, /* Alternate spelling */ {Opt_read_write, "read_write"}, {Opt_read_write, "rw"}, /* Alternate spelling */ @@ -441,33 +471,17 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. */ -static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, - size_t mon_addr_len, char *options) +static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { - struct rbd_options *rbd_opts = &rbd_dev->rbd_opts; - struct ceph_options *ceph_opts; struct rbd_client *rbdc; - rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; - - ceph_opts = ceph_parse_options(options, mon_addr, - mon_addr + mon_addr_len, - parse_rbd_opts_token, rbd_opts); - if (IS_ERR(ceph_opts)) - return PTR_ERR(ceph_opts); - rbdc = rbd_client_find(ceph_opts); - if (rbdc) { - /* using an existing client */ + if (rbdc) /* using an existing client */ ceph_destroy_options(ceph_opts); - } else { + else rbdc = rbd_client_create(ceph_opts); - if (IS_ERR(rbdc)) - return PTR_ERR(rbdc); - } - rbd_dev->rbd_client = rbdc; - return 0; + return rbdc; } /* @@ -492,10 +506,10 @@ static void rbd_client_release(struct kref *kref) * Drop reference to ceph client node. If it's not referenced anymore, release * it. */ -static void rbd_put_client(struct rbd_device *rbd_dev) +static void rbd_put_client(struct rbd_client *rbdc) { - kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); - rbd_dev->rbd_client = NULL; + if (rbdc) + kref_put(&rbdc->kref, rbd_client_release); } /* @@ -524,6 +538,16 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) return false; + /* The bio layer requires at least sector-sized I/O */ + + if (ondisk->options.order < SECTOR_SHIFT) + return false; + + /* If we use u64 in a few spots we may be able to loosen this */ + + if (ondisk->options.order > 8 * sizeof (int) - 1) + return false; + /* * The size of a snapshot header has to fit in a size_t, and * that limits the number of snapshots. @@ -635,6 +659,20 @@ out_err: return -ENOMEM; } +static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +{ + struct rbd_snap *snap; + + if (snap_id == CEPH_NOSNAP) + return RBD_SNAP_HEAD_NAME; + + list_for_each_entry(snap, &rbd_dev->snaps, node) + if (snap_id == snap->id) + return snap->name; + + return NULL; +} + static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) { @@ -642,7 +680,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) list_for_each_entry(snap, &rbd_dev->snaps, node) { if (!strcmp(snap_name, snap->name)) { - rbd_dev->mapping.snap_id = snap->id; + rbd_dev->spec->snap_id = snap->id; rbd_dev->mapping.size = snap->size; rbd_dev->mapping.features = snap->features; @@ -653,26 +691,23 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) return -ENOENT; } -static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) +static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) { int ret; - if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, + if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { - rbd_dev->mapping.snap_id = CEPH_NOSNAP; + rbd_dev->spec->snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; - rbd_dev->mapping.snap_exists = false; - rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; ret = 0; } else { - ret = snap_by_name(rbd_dev, snap_name); + ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); if (ret < 0) goto done; - rbd_dev->mapping.snap_exists = true; rbd_dev->mapping.read_only = true; } - rbd_dev->mapping.snap_name = snap_name; + rbd_dev->exists = true; done: return ret; } @@ -695,13 +730,13 @@ static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) u64 segment; int ret; - name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); if (!name) return NULL; segment = offset >> rbd_dev->header.obj_order; - ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", + ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { + if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { pr_err("error formatting segment name for #%llu (%d)\n", segment, ret); kfree(name); @@ -800,77 +835,144 @@ static void zero_bio_chain(struct bio *chain, int start_ofs) } /* - * bio_chain_clone - clone a chain of bios up to a certain length. - * might return a bio_pair that will need to be released. + * Clone a portion of a bio, starting at the given byte offset + * and continuing for the number of bytes indicated. */ -static struct bio *bio_chain_clone(struct bio **old, struct bio **next, - struct bio_pair **bp, - int len, gfp_t gfpmask) -{ - struct bio *old_chain = *old; - struct bio *new_chain = NULL; - struct bio *tail; - int total = 0; - - if (*bp) { - bio_pair_release(*bp); - *bp = NULL; - } +static struct bio *bio_clone_range(struct bio *bio_src, + unsigned int offset, + unsigned int len, + gfp_t gfpmask) +{ + struct bio_vec *bv; + unsigned int resid; + unsigned short idx; + unsigned int voff; + unsigned short end_idx; + unsigned short vcnt; + struct bio *bio; - while (old_chain && (total < len)) { - struct bio *tmp; + /* Handle the easy case for the caller */ - tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); - if (!tmp) - goto err_out; - gfpmask &= ~__GFP_WAIT; /* can't wait after the first */ + if (!offset && len == bio_src->bi_size) + return bio_clone(bio_src, gfpmask); - if (total + old_chain->bi_size > len) { - struct bio_pair *bp; + if (WARN_ON_ONCE(!len)) + return NULL; + if (WARN_ON_ONCE(len > bio_src->bi_size)) + return NULL; + if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) + return NULL; - /* - * this split can only happen with a single paged bio, - * split_bio will BUG_ON if this is not the case - */ - dout("bio_chain_clone split! total=%d remaining=%d" - "bi_size=%u\n", - total, len - total, old_chain->bi_size); + /* Find first affected segment... */ - /* split the bio. We'll release it either in the next - call, or it will have to be released outside */ - bp = bio_split(old_chain, (len - total) / SECTOR_SIZE); - if (!bp) - goto err_out; + resid = offset; + __bio_for_each_segment(bv, bio_src, idx, 0) { + if (resid < bv->bv_len) + break; + resid -= bv->bv_len; + } + voff = resid; - __bio_clone(tmp, &bp->bio1); + /* ...and the last affected segment */ - *next = &bp->bio2; - } else { - __bio_clone(tmp, old_chain); - *next = old_chain->bi_next; - } + resid += len; + __bio_for_each_segment(bv, bio_src, end_idx, idx) { + if (resid <= bv->bv_len) + break; + resid -= bv->bv_len; + } + vcnt = end_idx - idx + 1; + + /* Build the clone */ - tmp->bi_bdev = NULL; - tmp->bi_next = NULL; - if (new_chain) - tail->bi_next = tmp; - else - new_chain = tmp; - tail = tmp; - old_chain = old_chain->bi_next; + bio = bio_alloc(gfpmask, (unsigned int) vcnt); + if (!bio) + return NULL; /* ENOMEM */ - total += tmp->bi_size; + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); + bio->bi_rw = bio_src->bi_rw; + bio->bi_flags |= 1 << BIO_CLONED; + + /* + * Copy over our part of the bio_vec, then update the first + * and last (or only) entries. + */ + memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], + vcnt * sizeof (struct bio_vec)); + bio->bi_io_vec[0].bv_offset += voff; + if (vcnt > 1) { + bio->bi_io_vec[0].bv_len -= voff; + bio->bi_io_vec[vcnt - 1].bv_len = resid; + } else { + bio->bi_io_vec[0].bv_len = len; } - rbd_assert(total == len); + bio->bi_vcnt = vcnt; + bio->bi_size = len; + bio->bi_idx = 0; + + return bio; +} + +/* + * Clone a portion of a bio chain, starting at the given byte offset + * into the first bio in the source chain and continuing for the + * number of bytes indicated. The result is another bio chain of + * exactly the given length, or a null pointer on error. + * + * The bio_src and offset parameters are both in-out. On entry they + * refer to the first source bio and the offset into that bio where + * the start of data to be cloned is located. + * + * On return, bio_src is updated to refer to the bio in the source + * chain that contains first un-cloned byte, and *offset will + * contain the offset of that byte within that bio. + */ +static struct bio *bio_chain_clone_range(struct bio **bio_src, + unsigned int *offset, + unsigned int len, + gfp_t gfpmask) +{ + struct bio *bi = *bio_src; + unsigned int off = *offset; + struct bio *chain = NULL; + struct bio **end; + + /* Build up a chain of clone bios up to the limit */ + + if (!bi || off >= bi->bi_size || !len) + return NULL; /* Nothing to clone */ - *old = old_chain; + end = &chain; + while (len) { + unsigned int bi_size; + struct bio *bio; + + if (!bi) + goto out_err; /* EINVAL; ran out of bio's */ + bi_size = min_t(unsigned int, bi->bi_size - off, len); + bio = bio_clone_range(bi, off, bi_size, gfpmask); + if (!bio) + goto out_err; /* ENOMEM */ + + *end = bio; + end = &bio->bi_next; + + off += bi_size; + if (off == bi->bi_size) { + bi = bi->bi_next; + off = 0; + } + len -= bi_size; + } + *bio_src = bi; + *offset = off; - return new_chain; + return chain; +out_err: + bio_chain_put(chain); -err_out: - dout("bio_chain_clone with err\n"); - bio_chain_put(new_chain); return NULL; } @@ -988,8 +1090,9 @@ static int rbd_do_request(struct request *rq, req_data->coll_index = coll_index; } - dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, - (unsigned long long) ofs, (unsigned long long) len); + dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", + object_name, (unsigned long long) ofs, + (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, @@ -1019,7 +1122,7 @@ static int rbd_do_request(struct request *rq, layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); + layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, req, ops); rbd_assert(ret == 0); @@ -1154,8 +1257,6 @@ done: static int rbd_do_op(struct request *rq, struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, - u64 snapid, - int opcode, int flags, u64 ofs, u64 len, struct bio *bio, struct rbd_req_coll *coll, @@ -1167,6 +1268,9 @@ static int rbd_do_op(struct request *rq, int ret; struct ceph_osd_req_op *ops; u32 payload_len; + int opcode; + int flags; + u64 snapid; seg_name = rbd_segment_name(rbd_dev, ofs); if (!seg_name) @@ -1174,7 +1278,18 @@ static int rbd_do_op(struct request *rq, seg_len = rbd_segment_length(rbd_dev, ofs, len); seg_ofs = rbd_segment_offset(rbd_dev, ofs); - payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); + if (rq_data_dir(rq) == WRITE) { + opcode = CEPH_OSD_OP_WRITE; + flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; + snapid = CEPH_NOSNAP; + payload_len = seg_len; + } else { + opcode = CEPH_OSD_OP_READ; + flags = CEPH_OSD_FLAG_READ; + snapc = NULL; + snapid = rbd_dev->spec->snap_id; + payload_len = 0; + } ret = -ENOMEM; ops = rbd_create_rw_ops(1, opcode, payload_len); @@ -1202,41 +1317,6 @@ done: } /* - * Request async osd write - */ -static int rbd_req_write(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 ofs, u64 len, - struct bio *bio, - struct rbd_req_coll *coll, - int coll_index) -{ - return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ofs, len, bio, coll, coll_index); -} - -/* - * Request async osd read - */ -static int rbd_req_read(struct request *rq, - struct rbd_device *rbd_dev, - u64 snapid, - u64 ofs, u64 len, - struct bio *bio, - struct rbd_req_coll *coll, - int coll_index) -{ - return rbd_do_op(rq, rbd_dev, NULL, - snapid, - CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ, - ofs, len, bio, coll, coll_index); -} - -/* * Request sync osd read */ static int rbd_req_sync_read(struct rbd_device *rbd_dev, @@ -1304,7 +1384,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", rbd_dev->header_name, (unsigned long long) notify_id, (unsigned int) opcode); - rc = rbd_refresh_header(rbd_dev, &hver); + rc = rbd_dev_refresh(rbd_dev, &hver); if (rc) pr_warning(RBD_DRV_NAME "%d got notification but failed to " " update snaps: %d\n", rbd_dev->major, rc); @@ -1460,18 +1540,16 @@ static void rbd_rq_fn(struct request_queue *q) { struct rbd_device *rbd_dev = q->queuedata; struct request *rq; - struct bio_pair *bp = NULL; while ((rq = blk_fetch_request(q))) { struct bio *bio; - struct bio *rq_bio, *next_bio = NULL; bool do_write; unsigned int size; - u64 op_size = 0; u64 ofs; int num_segs, cur_seg = 0; struct rbd_req_coll *coll; struct ceph_snap_context *snapc; + unsigned int bio_offset; dout("fetched request\n"); @@ -1483,10 +1561,6 @@ static void rbd_rq_fn(struct request_queue *q) /* deduce our operation (read, write) */ do_write = (rq_data_dir(rq) == WRITE); - - size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * SECTOR_SIZE; - rq_bio = rq->bio; if (do_write && rbd_dev->mapping.read_only) { __blk_end_request_all(rq, -EROFS); continue; @@ -1496,8 +1570,8 @@ static void rbd_rq_fn(struct request_queue *q) down_read(&rbd_dev->header_rwsem); - if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && - !rbd_dev->mapping.snap_exists) { + if (!rbd_dev->exists) { + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); up_read(&rbd_dev->header_rwsem); dout("request for non-existent snapshot"); spin_lock_irq(q->queue_lock); @@ -1509,6 +1583,10 @@ static void rbd_rq_fn(struct request_queue *q) up_read(&rbd_dev->header_rwsem); + size = blk_rq_bytes(rq); + ofs = blk_rq_pos(rq) * SECTOR_SIZE; + bio = rq->bio; + dout("%s 0x%x bytes at 0x%llx\n", do_write ? "write" : "read", size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); @@ -1528,45 +1606,37 @@ static void rbd_rq_fn(struct request_queue *q) continue; } + bio_offset = 0; do { - /* a bio clone to be passed down to OSD req */ + u64 limit = rbd_segment_length(rbd_dev, ofs, size); + unsigned int chain_size; + struct bio *bio_chain; + + BUG_ON(limit > (u64) UINT_MAX); + chain_size = (unsigned int) limit; dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); - op_size = rbd_segment_length(rbd_dev, ofs, size); + kref_get(&coll->kref); - bio = bio_chain_clone(&rq_bio, &next_bio, &bp, - op_size, GFP_ATOMIC); - if (!bio) { - rbd_coll_end_req_index(rq, coll, cur_seg, - -ENOMEM, op_size); - goto next_seg; - } + /* Pass a cloned bio chain via an osd request */ - /* init OSD command: write or read */ - if (do_write) - rbd_req_write(rq, rbd_dev, - snapc, - ofs, - op_size, bio, - coll, cur_seg); + bio_chain = bio_chain_clone_range(&bio, + &bio_offset, chain_size, + GFP_ATOMIC); + if (bio_chain) + (void) rbd_do_op(rq, rbd_dev, snapc, + ofs, chain_size, + bio_chain, coll, cur_seg); else - rbd_req_read(rq, rbd_dev, - rbd_dev->mapping.snap_id, - ofs, - op_size, bio, - coll, cur_seg); - -next_seg: - size -= op_size; - ofs += op_size; + rbd_coll_end_req_index(rq, coll, cur_seg, + -ENOMEM, chain_size); + size -= chain_size; + ofs += chain_size; cur_seg++; - rq_bio = next_bio; } while (size > 0); kref_put(&coll->kref, rbd_coll_release); - if (bp) - bio_pair_release(bp); spin_lock_irq(q->queue_lock); ceph_put_snap_context(snapc); @@ -1576,28 +1646,47 @@ next_seg: /* * a queue callback. Makes sure that we don't create a bio that spans across * multiple osd objects. One exception would be with a single page bios, - * which we handle later at bio_chain_clone + * which we handle later at bio_chain_clone_range() */ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct rbd_device *rbd_dev = q->queuedata; - unsigned int chunk_sectors; - sector_t sector; - unsigned int bio_sectors; - int max; + sector_t sector_offset; + sector_t sectors_per_obj; + sector_t obj_sector_offset; + int ret; - chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); - sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); - bio_sectors = bmd->bi_size >> SECTOR_SHIFT; + /* + * Find how far into its rbd object the partition-relative + * bio start sector is to offset relative to the enclosing + * device. + */ + sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; + sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); + obj_sector_offset = sector_offset & (sectors_per_obj - 1); + + /* + * Compute the number of bytes from that offset to the end + * of the object. Account for what's already used by the bio. + */ + ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; + if (ret > bmd->bi_size) + ret -= bmd->bi_size; + else + ret = 0; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) - + bio_sectors)) << SECTOR_SHIFT; - if (max < 0) - max = 0; /* bio_add cannot handle a negative return */ - if (max <= bvec->bv_len && bio_sectors == 0) - return bvec->bv_len; - return max; + /* + * Don't send back more than was asked for. And if the bio + * was empty, let the whole thing through because: "Note + * that a block device *must* allow a single page to be + * added to an empty bio." + */ + rbd_assert(bvec->bv_len <= PAGE_SIZE); + if (ret > (int) bvec->bv_len || !bmd->bi_size) + ret = (int) bvec->bv_len; + + return ret; } static void rbd_free_disk(struct rbd_device *rbd_dev) @@ -1663,13 +1752,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) ret = -ENXIO; pr_warning("short header read for image %s" " (want %zd got %d)\n", - rbd_dev->image_name, size, ret); + rbd_dev->spec->image_name, size, ret); goto out_err; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; pr_warning("invalid header for image %s\n", - rbd_dev->image_name); + rbd_dev->spec->image_name); goto out_err; } @@ -1707,19 +1796,32 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return ret; } -static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) +static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) { struct rbd_snap *snap; struct rbd_snap *next; list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) - __rbd_remove_snap_dev(snap); + rbd_remove_snap_dev(snap); +} + +static void rbd_update_mapping_size(struct rbd_device *rbd_dev) +{ + sector_t size; + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + return; + + size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long) size); + rbd_dev->mapping.size = (u64) size; + set_capacity(rbd_dev->disk, size); } /* * only read the first part of the ondisk header, without the snaps info */ -static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) { int ret; struct rbd_image_header h; @@ -1730,17 +1832,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) down_write(&rbd_dev->header_rwsem); - /* resized? */ - if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { - sector_t size = (sector_t) h.image_size / SECTOR_SIZE; - - if (size != (sector_t) rbd_dev->mapping.size) { - dout("setting size to %llu sectors", - (unsigned long long) size); - rbd_dev->mapping.size = (u64) size; - set_capacity(rbd_dev->disk, size); - } - } + /* Update image size, and check for resize of mapped image */ + rbd_dev->header.image_size = h.image_size; + rbd_update_mapping_size(rbd_dev); /* rbd_dev->header.object_prefix shouldn't change */ kfree(rbd_dev->header.snap_sizes); @@ -1768,12 +1862,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) return ret; } -static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) { int ret; + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - ret = __rbd_refresh_header(rbd_dev, hver); + if (rbd_dev->image_format == 1) + ret = rbd_dev_v1_refresh(rbd_dev, hver); + else + ret = rbd_dev_v2_refresh(rbd_dev, hver); mutex_unlock(&ctl_mutex); return ret; @@ -1885,7 +1983,7 @@ static ssize_t rbd_pool_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->pool_name); + return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); } static ssize_t rbd_pool_id_show(struct device *dev, @@ -1893,7 +1991,8 @@ static ssize_t rbd_pool_id_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%d\n", rbd_dev->pool_id); + return sprintf(buf, "%llu\n", + (unsigned long long) rbd_dev->spec->pool_id); } static ssize_t rbd_name_show(struct device *dev, @@ -1901,7 +2000,10 @@ static ssize_t rbd_name_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->image_name); + if (rbd_dev->spec->image_name) + return sprintf(buf, "%s\n", rbd_dev->spec->image_name); + + return sprintf(buf, "(unknown)\n"); } static ssize_t rbd_image_id_show(struct device *dev, @@ -1909,7 +2011,7 @@ static ssize_t rbd_image_id_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->image_id); + return sprintf(buf, "%s\n", rbd_dev->spec->image_id); } /* @@ -1922,7 +2024,50 @@ static ssize_t rbd_snap_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); + return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); +} + +/* + * For an rbd v2 image, shows the pool id, image id, and snapshot id + * for the parent image. If there is no parent, simply shows + * "(no parent image)". + */ +static ssize_t rbd_parent_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + struct rbd_spec *spec = rbd_dev->parent_spec; + int count; + char *bufp = buf; + + if (!spec) + return sprintf(buf, "(no parent image)\n"); + + count = sprintf(bufp, "pool_id %llu\npool_name %s\n", + (unsigned long long) spec->pool_id, spec->pool_name); + if (count < 0) + return count; + bufp += count; + + count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, + spec->image_name ? spec->image_name : "(unknown)"); + if (count < 0) + return count; + bufp += count; + + count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", + (unsigned long long) spec->snap_id, spec->snap_name); + if (count < 0) + return count; + bufp += count; + + count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); + if (count < 0) + return count; + bufp += count; + + return (ssize_t) (bufp - buf); } static ssize_t r |
