summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/block/rbd.c87
-rw-r--r--fs/ceph/cache.c1
-rw-r--r--fs/ceph/cache.h10
-rw-r--r--fs/ceph/caps.c9
-rw-r--r--fs/ceph/debugfs.c5
-rw-r--r--fs/ceph/dir.c53
-rw-r--r--fs/ceph/export.c267
-rw-r--r--fs/ceph/file.c8
-rw-r--r--fs/ceph/inode.c76
-rw-r--r--fs/ceph/ioctl.c5
-rw-r--r--fs/ceph/locks.c98
-rw-r--r--fs/ceph/mds_client.c97
-rw-r--r--fs/ceph/mds_client.h4
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.c1
-rw-r--r--fs/ceph/super.h3
-rw-r--r--fs/ceph/xattr.c48
-rw-r--r--include/linux/ceph/ceph_features.h12
-rw-r--r--include/linux/ceph/ceph_fs.h5
-rw-r--r--include/linux/ceph/osd_client.h11
-rw-r--r--include/linux/ceph/osdmap.h50
-rw-r--r--include/linux/ceph/rados.h18
-rw-r--r--include/linux/crush/crush.h7
-rw-r--r--net/ceph/crush/mapper.c85
-rw-r--r--net/ceph/debugfs.c55
-rw-r--r--net/ceph/messenger.c6
-rw-r--r--net/ceph/osd_client.c41
-rw-r--r--net/ceph/osdmap.c993
28 files changed, 1421 insertions, 635 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 34898d53395b..4c95b503b09e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1654,7 +1654,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
if (osd_req->r_result < 0)
obj_request->result = osd_req->r_result;
- BUG_ON(osd_req->r_num_ops > 2);
+ rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
/*
* We support a 64-bit length, but ultimately it has to be
@@ -1662,11 +1662,15 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
*/
obj_request->xferred = osd_req->r_reply_op_len[0];
rbd_assert(obj_request->xferred < (u64)UINT_MAX);
+
opcode = osd_req->r_ops[0].op;
switch (opcode) {
case CEPH_OSD_OP_READ:
rbd_osd_read_callback(obj_request);
break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
+ /* fall through */
case CEPH_OSD_OP_WRITE:
rbd_osd_write_callback(obj_request);
break;
@@ -1715,9 +1719,16 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
snapc, CEPH_NOSNAP, &mtime);
}
+/*
+ * Create an osd request. A read request has one osd op (read).
+ * A write request has either one (watch) or two (hint+write) osd ops.
+ * (All rbd data writes are prefixed with an allocation hint op, but
+ * technically osd watch is a write request, hence this distinction.)
+ */
static struct ceph_osd_request *rbd_osd_req_create(
struct rbd_device *rbd_dev,
bool write_request,
+ unsigned int num_ops,
struct rbd_obj_request *obj_request)
{
struct ceph_snap_context *snapc = NULL;
@@ -1733,10 +1744,13 @@ static struct ceph_osd_request *rbd_osd_req_create(
snapc = img_request->snapc;
}
- /* Allocate and initialize the request, for the single op */
+ rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
+
+ /* Allocate and initialize the request, for the num_ops ops */
osdc = &rbd_dev->rbd_client->client->osdc;
- osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
+ GFP_ATOMIC);
if (!osd_req)
return NULL; /* ENOMEM */
@@ -1756,8 +1770,8 @@ static struct ceph_osd_request *rbd_osd_req_create(
/*
* Create a copyup osd request based on the information in the
- * object request supplied. A copyup request has two osd ops,
- * a copyup method call, and a "normal" write request.
+ * object request supplied. A copyup request has three osd ops,
+ * a copyup method call, a hint op, and a write op.
*/
static struct ceph_osd_request *
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
@@ -1773,12 +1787,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
rbd_assert(img_request);
rbd_assert(img_request_write_test(img_request));
- /* Allocate and initialize the request, for the two ops */
+ /* Allocate and initialize the request, for the three ops */
snapc = img_request->snapc;
rbd_dev = img_request->rbd_dev;
osdc = &rbd_dev->rbd_client->client->osdc;
- osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
if (!osd_req)
return NULL; /* ENOMEM */
@@ -2178,6 +2192,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
const char *object_name;
u64 offset;
u64 length;
+ unsigned int which = 0;
object_name = rbd_segment_name(rbd_dev, img_offset);
if (!object_name)
@@ -2190,6 +2205,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
rbd_segment_name_free(object_name);
if (!obj_request)
goto out_unwind;
+
/*
* set obj_request->img_request before creating the
* osd_request so that it gets the right snapc
@@ -2207,7 +2223,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
clone_size,
GFP_ATOMIC);
if (!obj_request->bio_list)
- goto out_partial;
+ goto out_unwind;
} else {
unsigned int page_count;
@@ -2220,19 +2236,27 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
}
osd_req = rbd_osd_req_create(rbd_dev, write_request,
- obj_request);
+ (write_request ? 2 : 1),
+ obj_request);
if (!osd_req)
- goto out_partial;
+ goto out_unwind;
obj_request->osd_req = osd_req;
obj_request->callback = rbd_img_obj_callback;
- osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
- 0, 0);
+ if (write_request) {
+ osd_req_op_alloc_hint_init(osd_req, which,
+ rbd_obj_bytes(&rbd_dev->header),
+ rbd_obj_bytes(&rbd_dev->header));
+ which++;
+ }
+
+ osd_req_op_extent_init(osd_req, which, opcode, offset, length,
+ 0, 0);
if (type == OBJ_REQUEST_BIO)
- osd_req_op_extent_osd_data_bio(osd_req, 0,
+ osd_req_op_extent_osd_data_bio(osd_req, which,
obj_request->bio_list, length);
else
- osd_req_op_extent_osd_data_pages(osd_req, 0,
+ osd_req_op_extent_osd_data_pages(osd_req, which,
obj_request->pages, length,
offset & ~PAGE_MASK, false, false);
@@ -2249,11 +2273,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
return 0;
-out_partial:
- rbd_obj_request_put(obj_request);
out_unwind:
for_each_obj_request_safe(img_request, obj_request, next_obj_request)
- rbd_obj_request_put(obj_request);
+ rbd_img_obj_request_del(img_request, obj_request);
return -ENOMEM;
}
@@ -2353,7 +2375,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
/*
* The original osd request is of no use to use any more.
- * We need a new one that can hold the two ops in a copyup
+ * We need a new one that can hold the three ops in a copyup
* request. Allocate the new copyup osd request for the
* original request, and release the old one.
*/
@@ -2372,17 +2394,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
false, false);
- /* Then the original write request op */
+ /* Then the hint op */
+
+ osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
+ rbd_obj_bytes(&rbd_dev->header));
+
+ /* And the original write request op */
offset = orig_request->offset;
length = orig_request->length;
- osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
+ osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
offset, length, 0, 0);
if (orig_request->type == OBJ_REQUEST_BIO)
- osd_req_op_extent_osd_data_bio(osd_req, 1,
+ osd_req_op_extent_osd_data_bio(osd_req, 2,
orig_request->bio_list, length);
else
- osd_req_op_extent_osd_data_pages(osd_req, 1,
+ osd_req_op_extent_osd_data_pages(osd_req, 2,
orig_request->pages, length,
offset & ~PAGE_MASK, false, false);
@@ -2603,8 +2630,8 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
rbd_assert(obj_request->img_request);
rbd_dev = obj_request->img_request->rbd_dev;
- stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
- stat_request);
+ stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ stat_request);
if (!stat_request->osd_req)
goto out;
stat_request->callback = rbd_img_obj_exists_callback;
@@ -2807,7 +2834,8 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
return -ENOMEM;
ret = -ENOMEM;
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ obj_request);
if (!obj_request->osd_req)
goto out;
@@ -2870,7 +2898,8 @@ static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
if (!obj_request)
goto out_cancel;
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
+ obj_request);
if (!obj_request->osd_req)
goto out_cancel;
@@ -2978,7 +3007,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
obj_request->pages = pages;
obj_request->page_count = page_count;
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ obj_request);
if (!obj_request->osd_req)
goto out;
@@ -3211,7 +3241,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
obj_request->pages = pages;
obj_request->page_count = page_count;
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ obj_request);
if (!obj_request->osd_req)
goto out;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 8c44fdd4e1c3..834f9f3723fb 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
ci->fscache = fscache_acquire_cookie(fsc->fscache,
&ceph_fscache_inode_object_def,
ci, true);
+ fscache_check_consistency(ci->fscache);
done:
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index da95f61b7a09..5ac591bd012b 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
void ceph_queue_revalidate(struct inode *inode);
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_attr_changed(ci->fscache);
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
fscache_invalidate(ceph_inode(inode)->fscache);
@@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
{
}
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 17543383545c..2e5e648eb5c3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -622,8 +622,10 @@ retry:
if (flags & CEPH_CAP_FLAG_AUTH) {
if (ci->i_auth_cap == NULL ||
- ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
ci->i_auth_cap = cap;
+ cap->mds_wanted = wanted;
+ }
ci->i_cap_exporting_issued = 0;
} else {
WARN_ON(ci->i_auth_cap == cap);
@@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- mds_wanted |= cap->mds_wanted;
+ if (cap == ci->i_auth_cap)
+ mds_wanted |= cap->mds_wanted;
+ else
+ mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
}
return mds_wanted;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa27..16b54aa31f08 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
+ } else {
+ seq_printf(s, " #%llx", req->r_ino1.ino);
}
if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_old_dentry_dir),
+ req->r_old_dentry_dir ?
+ ceph_ino(req->r_old_dentry_dir) : 0,
req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name,
path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 45eda6d7a40c..766410a12c2c 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -119,7 +119,8 @@ static int fpos_cmp(loff_t l, loff_t r)
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *file, struct dir_context *ctx)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+ u32 shared_gen)
{
struct ceph_file_info *fi = file->private_data;
struct dentry *parent = file->f_dentry;
@@ -133,8 +134,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx)
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
- last);
+ dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+ dir, shared_gen, ctx->pos, last);
spin_lock(&parent->d_lock);
@@ -161,7 +162,8 @@ more:
goto out_unlock;
}
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!d_unhashed(dentry) && dentry->d_inode &&
+ if (di->lease_shared_gen == shared_gen &&
+ !d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
fpos_cmp(ctx->pos, di->offset) <= 0)
@@ -190,7 +192,7 @@ more:
if (last) {
/* remember our position */
fi->dentry = last;
- fi->next_offset = di->offset;
+ fi->next_offset = fpos_off(di->offset);
}
dput(dentry);
return 0;
@@ -252,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = fsc->mount_options->max_readdir;
- const int max_bytes = fsc->mount_options->max_readdir_bytes;
dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
if (fi->flags & CEPH_F_ATEND)
@@ -291,8 +291,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ u32 shared_gen = ci->i_shared_gen;
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(file, ctx);
+ err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
} else {
@@ -322,14 +323,16 @@ more:
fi->last_readdir = NULL;
}
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
req->r_inode = inode;
ihold(inode);
req->r_dentry = dget(file->f_dentry);
@@ -340,9 +343,6 @@ more:
req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
- req->r_num_caps = max_entries + 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
ceph_mdsc_put_request(req);
@@ -369,9 +369,9 @@ more:
fi->next_offset = 0;
off = fi->next_offset;
}
+ fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req;
- fi->frag = frag;
if (req->r_reply_info.dir_end) {
kfree(fi->last_name);
@@ -454,7 +454,7 @@ more:
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
{
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
@@ -462,7 +462,10 @@ static void reset_readdir(struct ceph_file_info *fi)
}
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 2; /* compensate for . and .. */
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2; /* compensate for . and .. */
+ else
+ fi->next_offset = 0;
if (fi->dentry) {
dput(fi->dentry);
fi->dentry = NULL;
@@ -474,7 +477,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = offset;
+ loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
mutex_lock(&inode->i_mutex);
@@ -491,7 +494,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
goto out;
}
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset >= 0) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
@@ -504,14 +507,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
* seek to new frag, or seek prior to current chunk.
*/
if (offset == 0 ||
- fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
+ reset_readdir(fi, fpos_frag(offset));
}
/* bump dir_release_count if we did a forward seek */
- if (offset > old_offset)
+ if (fpos_cmp(offset, old_offset) > 0)
fi->dir_release_count--;
}
out:
@@ -812,8 +815,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry = dget(old_dentry);
req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -911,10 +913,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ ihold(old_dir);
req->r_dentry = dget(new_dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry_dir = old_dir;
req->r_locked_dir = new_dir;
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca5..00d6af6a32ec 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
#include "mds_client.h"
/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
* Basic fh
*/
struct ceph_nfs_fh {
@@ -32,22 +15,12 @@ struct ceph_nfs_fh {
} __attribute__ ((packed));
/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
*/
struct ceph_nfs_confh {
u64 ino, parent_ino;
- u32 parent_name_hash;
} __attribute__ ((packed));
-/*
- * The presence of @parent_inode here tells us whether NFS wants a
- * connectable file handle. However, we want to make a connectionable
- * file handle unconditionally so that the MDS gets as much of a hint
- * as possible. That means we only use @parent_dentry to indicate
- * whether nfsd wants a connectable fh, and whether we should indicate
- * failure from a too-small @max_len.
- */
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
@@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct ceph_nfs_confh *cfh = (void *)rawfh;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
- struct dentry *dentry;
- struct dentry *parent;
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- dentry = d_find_alias(inode);
+ if (parent_inode && (*max_len < connected_handle_length)) {
+ *max_len = connected_handle_length;
+ return FILEID_INVALID;
+ } else if (*max_len < handle_length) {
+ *max_len = handle_length;
+ return FILEID_INVALID;
+ }
- /* if we found an alias, generate a connectable fh */
- if (*max_len >= connected_handle_length && dentry) {
- dout("encode_fh %p connectable\n", dentry);
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
+ if (parent_inode) {
+ dout("encode_fh %llx with parent %llx\n",
+ ceph_ino(inode), ceph_ino(parent_inode));
cfh->ino = ceph_ino(inode);
- cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
- dentry);
+ cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
- type = 2;
- spin_unlock(&dentry->d_lock);
- } else if (*max_len >= handle_length) {
- if (parent_inode) {
- /* nfsd wants connectable */
- *max_len = connected_handle_length;
- type = FILEID_INVALID;
- } else {
- dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(inode);
- *max_len = handle_length;
- type = 1;
- }
+ type = FILEID_INO32_GEN_PARENT;
} else {
+ dout("encode_fh %llx\n", ceph_ino(inode));
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
- type = FILEID_INVALID;
+ type = FILEID_INO32_GEN;
}
- if (dentry)
- dput(dentry);
return type;
}
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
- struct ceph_nfs_fh *fh, int fh_len)
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
@@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*fh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__fh_to_dentry %llx\n", fh->ino);
- vino.ino = fh->ino;
+ vino.ino = ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
@@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
- fh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+ dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
return dentry;
}
/*
- * convert connectable fh to dentry
+ * convert regular fh to dentry
*/
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
- struct ceph_nfs_confh *cfh, int fh_len)
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+ if (fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*fh) / 4)
+ return NULL;
+
+ dout("fh_to_dentry %llx\n", fh->ino);
+ return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+ struct dentry *child, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
- struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__cfh_to_dentry %llx (%llx/%x)\n",
- cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
-
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode) {
- struct ceph_mds_request *req;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
- USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
- req->r_ino1 = vino;
- req->r_ino2.ino = cfh->parent_ino;
- req->r_ino2.snap = CEPH_NOSNAP;
- req->r_path2 = kmalloc(16, GFP_NOFS);
- snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
- req->r_num_caps = 1;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- inode = req->r_target_inode;
- if (inode)
- ihold(inode);
- ceph_mdsc_put_request(req);
- if (!inode)
- return ERR_PTR(err ? err : -ESTALE);
+ if (child) {
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ } else {
+ req->r_ino1 = (struct ceph_vino) {
+ .ino = ino,
+ .snap = CEPH_NOSNAP,
+ };
}
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
+ child ? ceph_ino(child->d_inode) : ino,
+ dentry, ceph_vinop(inode));
return dentry;
}
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+struct dentry *ceph_get_parent(struct dentry *child)
{
- if (fh_type == 1)
- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
- fh_len);
- else
- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
- fh_len);
+ /* don't re-export snaps */
+ if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
+ return ERR_PTR(-EINVAL);
+
+ dout("get_parent %p ino %llx.%llx\n",
+ child, ceph_vinop(child->d_inode));
+ return __get_parent(child->d_sb, child, 0);
}
/*
- * get parent, if possible.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
+ * convert regular fh to parent
*/
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
- struct fid *fid,
+ struct fid *fid,
int fh_len, int fh_type)
{
struct ceph_nfs_confh *cfh = (void *)fid->raw;
- struct ceph_vino vino;
- struct inode *inode;
struct dentry *dentry;
- int err;
- if (fh_type == 1)
- return ERR_PTR(-ESTALE);
+ if (fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
+ return NULL;
- pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
- cfh->parent_name_hash);
+ dout("fh_to_parent %llx\n", cfh->parent_ino);
+ dentry = __get_parent(sb, NULL, cfh->ino);
+ if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+ dentry = __fh_to_dentry(sb, cfh->parent_ino);
+ return dentry;
+}
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+static int ceph_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct ceph_mds_client *mdsc;
+ struct ceph_mds_request *req;
+ int err;
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
- iput(inode);
- return dentry;
- }
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- iput(inode);
- return ERR_PTR(err);
+ mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+ USE_ANY_MDS);<