// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/bvec.h>
#include <linux/crc32c.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <net/sock.h>
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
/* static tag bytes (protocol control messages) */
static char tag_msg = CEPH_MSGR_TAG_MSG;
static char tag_ack = CEPH_MSGR_TAG_ACK;
static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
/*
* If @buf is NULL, discard up to @len bytes.
*/
static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
{
struct kvec iov = {buf, len};
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
int r;
if (!buf)
msg.msg_flags |= MSG_TRUNC;
iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
r = sock_recvmsg(sock, &msg, msg.msg_flags);
if (r == -EAGAIN)
r = 0;
return r;
}
static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
int page_offset, size_t length)
{
struct bio_vec bvec = {
.bv_page = page,
.bv_offset = page_offset,
.bv_len = length
};
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
int r;
BUG_ON(page_offset + length > PAGE_SIZE);
iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
r = sock_recvmsg(sock, &msg, msg.msg_flags);
if (r == -EAGAIN)
r = 0;
return r;
}
/*
* write something. @more is true if caller will be sending more data
* shortly.
*/
static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
size_t kvlen, size_t len, bool more)
{
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
int r;
if (more)
msg.msg_flags |= MSG_MORE;
else
msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
if (r == -EAGAIN)
r = 0;
return r;
}
/*
* @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
*/
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
int offset, size_t size, int more)
{
ssize_t (*sendpage)(struct socket *sock, struct page *page,
int offset, size_t size, int flags);
int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
int ret;
/*
* sendpage cannot properly handle pages with page_count == 0,
* we need to fall back to sendmsg if that's the case.
*
* Same goes for slab pages: skb_can_coalesce() allows
* coalescing neighboring slab objects into a single frag which
* triggers one of hardened usercopy checks.
*/
if (sendpage_ok(page))
sendpage = sock->ops->sendpage;
else
sendpage = sock_no_sendpage;
ret = sendpage(sock, page, offset, size, flags);
if (ret == -EAGAIN)
ret = 0;
return ret;
}
static void con_out_kvec_reset(struct ceph_connection *con)
{
BUG_ON(con->v1.out_skip);
con->v1.out_kvec_left = 0;
con->v1.out_kvec_bytes = 0;
con->v1.out_kvec_cur = &con->v1.out_kvec[0];
}
static void con_out_kvec_add(struct ceph_connection *con,
size_t size, void *data)
{
int index = con->v1.out_kvec_left;
BUG_ON(con->v1.out_skip);
BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec));
con->v1.out_kvec[index].iov_len = size;
con->v1.out_kvec[index].iov_base = data;
con->v1.out_kvec_left++;
con->v1.out_kvec_bytes += size;
}
/*
* Chop off a kvec from the end. Return residual number of bytes for
* that kvec, i.e. how many bytes would have been written if the kvec
* hadn't been nuked.
*/
static int con_out_kvec_skip(struct ceph_connection *con)
{
int skip = 0;
if (con->v1.out_kvec_bytes > 0) {
skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len;
BUG_ON(con->v1.out_kvec_bytes < skip);
BUG_ON(!con->v1.out_kvec_left);
con->v1.out_kvec_bytes -= skip;
con->v1.out_kvec_left--;
}
return skip;
}
static size_t sizeof_footer(struct ceph_connection *con)
{
return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
sizeof(struct ceph_msg_footer) :
sizeof(struct ceph_msg_footer_old);
}
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
/* Initialize data cursor */
ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
}
/*
* Prepare footer for currently outgoing message, and finish things
* off. Assumes out_kvec* are already valid.. we just add on to the end.
*/
static void prepare_write_message_footer(struct ceph_connection *con)
{
struct ceph_msg *m = con->out_msg;
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con);
con_out_kvec_add(con, sizeof_footer(con), &m->footer);
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
if (con->ops->sign_message)
con->ops->sign_message(m);
else
m->footer.sig = 0;
} else {
m->old_footer.flags = m->footer.flags;
}
con->v1.out_more = m->more_to_follow;
con->v1.out_msg_done = true;
}
/*
* Prepare headers for the next outgoing message.
*/
static void prepare_write_message(struct ceph_connection *con)
{
struct ceph_msg *m;
u32 crc;
con_out_kvec_reset(con);
con->v1.out_msg_done = false;
/* Sneak an ack in there first? If we can get it into the same
* TCP packet that's a good thing. */
if (con->in_seq > con->in_seq_acked) {
con->in_seq_acked = con->in_seq;
con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
&con->v1.out_temp_ack);
}
ceph_con_get_out_msg(con);
m = con->out_msg;
dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
m, con->out_seq, le16_to_cpu(m->hdr.type),
le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
m->data_length);
WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
/* tag + hdr + front + middle */
con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr);
con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
if (m->middle)
con_out_kvec_add(con, m->middle->vec.iov_len,
m->middle->vec.iov_base);
/* fill in hdr crc and finalize hdr */
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
con->out_msg->hdr.crc = cpu_to_le32(crc);
memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr));
/* fill in front and middle crc, footer */
crc = crc32c(0, m->front.iov_base, m->front.iov_len);
con->out_msg->footer.front_crc = cpu_to_le32(crc);
if (m->middle) {
crc = crc32c(0, m->middle->vec.iov_base,
m->middle->vec.iov_len);
con->out_msg->footer.middle_crc = cpu_to_le32(crc);
} else
con->out_msg->footer.middle_crc = 0;
dout("%s front_crc %u middle_crc %u\n", __func__,
le32_to_cpu(con->out_msg->footer.front_crc),
le32_to_cpu(con->out_msg->footer.middle_crc));
con->out_msg->footer.flags = 0;
/* is there a data payload? */
con->out_msg->footer.data_crc = 0;
if (m->data_length) {
prepare_message_data(con->out_msg, m->data_length);
con->v1.out_more = 1; /* data + footer will follow */
} else {
/* no, queue up footer too and be done */
prepare_write_message_footer(con);
}
ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
}
/*
* Prepare an ack.
*/
static void prepare_write_ack(struct ceph_connection *con)
{
dout("prepare_write_ack %p %llu -> %llu\n", con,
con->in_seq_acked, con->in_seq);
con->in_seq_acked = con->in_seq;
con_out_kvec_reset(con);
con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
&con->v1.out_temp_ack);
con->v1.out_more = 1; /* more will follow.. eventually.. */
ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
}
/*
* Prepare to share the seq during handshake
*/
static void prepare_write_seq(struct ceph_connection *con)
{
dout("prepare_write_seq %p %llu -> %llu\n", con,
con->in_seq_acked, con->in_seq);
con->in_seq_acked = con->in_seq;
con_out_k
|