/*
* "splice": joining two ropes together by interweaving their strands.
*
* This is the "extended pipe" functionality, where a pipe is used as
* an arbitrary in-memory buffer. Think of a pipe as a small kernel
* buffer that you can use to transfer data from one end to the other.
*
* The traditional unix read/write is extended with a "splice()" operation
* that transfers data buffers to or from a pipe buffer.
*
* Named by Larry McVoy, original implementation from Linus, extended by
* Jens to support splicing to files, network, direct splicing, etc and
* fixing lots of bugs.
*
* Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
* Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
* Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
*
*/
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
struct partial_page {
unsigned int offset;
unsigned int len;
};
/*
* Passed to splice_to_pipe
*/
struct splice_pipe_desc {
struct page **pages; /* page map */
struct partial_page *partial; /* pages[] may not be contig */
int nr_pages; /* number of pages in map */
unsigned int flags; /* splice flags */
struct pipe_buf_operations *ops;/* ops associated with output pipe */
};
/*
* Attempt to steal a page from a pipe buffer. This should perhaps go into
* a vm helper function, it's already simplified quite a bit by the
* addition of remove_mapping(). If success is returned, the caller may
* attempt to reuse this page for another destination.
*/
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
struct address_space *mapping;
lock_page(page);
mapping = page_mapping(page);
if (mapping) {
WARN_ON(!PageUptodate(page));
/*
* At least for ext2 with nobh option, we need to wait on
* writeback completing on this page, since we'll remove it
* from the pagecache. Otherwise truncate wont wait on the
* page, allowing the disk blocks to be reused by someone else
* before we actually wrote our data to them. fs corruption
* ensues.
*/
wait_on_page_writeback(page);
if (PagePrivate(page))
try_to_release_page(page, mapping_gfp_mask(mapping));
/*
* If we succeeded in removing the mapping, set LRU flag
* and return good.
*/
if (remove_mapping(mapping, page)) {
buf->flags |= PIPE_BUF_FLAG_LRU;
return 0;
}
}
/*
* Raced with truncate or failed to remove page from current
* address space, unlock and return failure.
*/
unlock_page(page);
return 1;
}
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
page_cache_release(buf->page);
buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
int err;
if (!PageUptodate(page)) {
lock_page(page);
/*
* Page got truncated/unhashed. This will cause a 0-byte
* splice, if this is the first page.
*/
if (!page->mapping) {
err = -ENODATA;
goto error;
}
/*
* Uh oh, read-error from disk.
*/
if (!PageUptodate(page)) {
err = -EIO;
goto error;
}
/*
* Page is ok afterall, we are done.
*/
unlock_page(page);
}
return 0;
error:
unlock_page(page);
return err;
}
static struct pipe_buf_operations page_cache_pipe_buf_ops = {
.can_merge = 0,
.map = generic_pipe_buf_map,
.unmap = generic_pipe_buf_unmap,
.pin = page_cache_pipe_buf_pin,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
return 1;
buf->flags |= PIPE_BUF_FLAG_LRU;
return generic_pipe_buf_steal(pipe, buf);
}
static struct pipe_buf_operations user_page_pipe_buf_ops = {
.can_merge = 0,
.map = generic_pipe_buf_map,
.unmap = generic_pipe_buf_unmap,
.pin = generic_pipe_buf_pin,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
/*
* Pipe output worker. This sets up our pipe format with the page cache
* pipe buffer operations. Otherwise very similar to the regular pipe_writev().
*/
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd)
{
int ret, do_wakeup, page_nr;
ret = 0;
do_wakeup = 0;
page_nr = 0;
if (pipe->inode)
mutex_lock(&pipe->inode->i_mutex);
for (;;) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
if (pipe->nrbufs < PIPE_BUFFERS) {
int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
buf->len = spd->partial[page_nr].len;
buf->ops = spd->ops;
if (spd->flags & SPLICE_F_GIFT)
buf->flags |= PIPE_BUF_FLAG_GIFT;
pipe->nrbufs++;
page_nr++;
ret += buf->len;
if (pipe->inode)
do_wakeup = 1;
if (!--spd->nr_pages)
break;
if (pipe->nrbufs < PIPE_BUFFERS)
continue;
break;
}
if (spd->flags & SPLICE_F_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
break;
}
if (signal_pending(current)) {
if (!ret)
ret = -ERESTARTSYS;
break;
}
if (do_wakeup) {
smp_mb();
if (waitqueue_active(&pipe->wait))
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
do_wakeup = 0;
}
pipe->waiting_writers++;
pipe_wait(pipe);
pipe->waiting_writers--;
}
if (pipe->inode)
mutex_unlock(&pipe->inode->i_mutex);
if (do_wakeup) {
smp_mb();
if (waitqueue_active(&pipe->wait))
wake_up_interruptible(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
while (page_nr < spd->nr_pages)
page_cache_release(spd->pages[page_nr++]);
return ret;
}
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
struct address_space *mapping = in->f_mapping;
unsigned int loff, nr_pages;
struct page *pages[PIPE_BUFFERS];
struct partial_page partial[PIPE_BUFFERS];
struct page *page;
pgoff_t index, end_index;
loff_t isize;
size_t total_len;
int error, page_nr;
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
.flags = flags,
.ops = &page_cache_pipe_buf_ops,
};
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (nr_pages > PIPE_BUFFERS)
nr_pages = PIPE_BUFFERS;
/*
* Initiate read-ahead on this page range. however, don't call into
* read-ahead if this is a non-zero offset (we are likely doing small
* chunk splice and the page is already there) for a single page.
*/
if (!loff || nr_pages > 1)
page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
/*
* Now fill in the holes:
*/
error = 0;
total_len = 0;
/*
* Lookup the (hopefully) full range of pages we need.
*/
spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
/*
* If find_get_pages_contig() returned fewer pages than we needed,
* allocate the
|