From cfa320f72882f0e944e2237287db84b0f7df877d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:27 -0700 Subject: iov: introduce iov_iter_aligned The existing iov_iter_alignment() function returns the logical OR of address and length. For cases where address and length need to be considered separately, introduce a helper function to which a caller can specify length and address masks that indicate if the iov is unaligned. Cc: Alexander Viro Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220610195830.3574005-9-kbusch@fb.com Signed-off-by: Jens Axboe --- lib/iov_iter.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 0b64695ab632..507e732ef7cf 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1268,6 +1268,98 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) } EXPORT_SYMBOL(iov_iter_discard); +static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, + unsigned len_mask) +{ + size_t size = i->count; + size_t skip = i->iov_offset; + unsigned k; + + for (k = 0; k < i->nr_segs; k++, skip = 0) { + size_t len = i->iov[k].iov_len - skip; + + if (len > size) + len = size; + if (len & len_mask) + return false; + if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask) + return false; + + size -= len; + if (!size) + break; + } + return true; +} + +static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, + unsigned len_mask) +{ + size_t size = i->count; + unsigned skip = i->iov_offset; + unsigned k; + + for (k = 0; k < i->nr_segs; k++, skip = 0) { + size_t len = i->bvec[k].bv_len - skip; + + if (len > size) + len = size; + if (len & len_mask) + return false; + if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) + return false; + + size -= len; + if (!size) + break; + } + return true; +} + +/** 
+ * iov_iter_is_aligned() - Check if the addresses and lengths of each segments + * are aligned to the parameters. + * + * @i: &struct iov_iter to restore + * @addr_mask: bit mask to check against the iov element's addresses + * @len_mask: bit mask to check against the iov element's lengths + * + * Return: false if any addresses or lengths intersect with the provided masks + */ +bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, + unsigned len_mask) +{ + if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) + return iov_iter_aligned_iovec(i, addr_mask, len_mask); + + if (iov_iter_is_bvec(i)) + return iov_iter_aligned_bvec(i, addr_mask, len_mask); + + if (iov_iter_is_pipe(i)) { + unsigned int p_mask = i->pipe->ring_size - 1; + size_t size = i->count; + + if (size & len_mask) + return false; + if (size && allocated(&i->pipe->bufs[i->head & p_mask])) { + if (i->iov_offset & addr_mask) + return false; + } + + return true; + } + + if (iov_iter_is_xarray(i)) { + if (i->count & len_mask) + return false; + if ((i->xarray_start + i->iov_offset) & addr_mask) + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(iov_iter_is_aligned); + static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { unsigned long res = 0; -- cgit v1.2.3 From 59bb69c67cf1475a04cd5629d9c4f6dbbcba5e4a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 May 2022 19:07:11 -0400 Subject: copy_page_{to,from}_iter(): switch iovec variants to generic we can do copyin/copyout under kmap_local_page(); it shouldn't overflow the kmap stack - the maximal footprint increase only by one here. 
Reviewed-by: Jeff Layton Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- lib/iov_iter.c | 191 ++------------------------------------------------------- 1 file changed, 4 insertions(+), 187 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 6dd5330f7a99..4c658a25e29c 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -168,174 +168,6 @@ static int copyin(void *to, const void __user *from, size_t n) return n; } -static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *from; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - might_fault(); - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) { - kaddr = kmap_atomic(page); - from = kaddr + offset; - - /* first chunk, usually the only one */ - left = copyout(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = copyout(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = from - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - - kaddr = kmap(page); - from = kaddr + offset; - left = copyout(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = copyout(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - 
kunmap(page); - -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *to; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - might_fault(); - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) { - kaddr = kmap_atomic(page); - to = kaddr + offset; - - /* first chunk, usually the only one */ - left = copyin(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = copyin(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = to - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - - kaddr = kmap(page); - to = kaddr + offset; - left = copyin(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = copyin(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - kunmap(page); - -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - #ifdef PIPE_PARANOIA static bool sanity(const struct iov_iter *i) { @@ -848,24 +680,14 @@ 
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { - if (likely(iter_is_iovec(i))) - return copy_page_to_iter_iovec(page, offset, bytes, i); - if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { + if (unlikely(iov_iter_is_pipe(i))) { + return copy_page_to_iter_pipe(page, offset, bytes, i); + } else { void *kaddr = kmap_local_page(page); size_t wanted = _copy_to_iter(kaddr + offset, bytes, i); kunmap_local(kaddr); return wanted; } - if (iov_iter_is_pipe(i)) - return copy_page_to_iter_pipe(page, offset, bytes, i); - if (unlikely(iov_iter_is_discard(i))) { - if (unlikely(i->count < bytes)) - bytes = i->count; - i->count -= bytes; - return bytes; - } - WARN_ON(1); - return 0; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, @@ -896,17 +718,12 @@ EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { - if (unlikely(!page_copy_sane(page, offset, bytes))) - return 0; - if (likely(iter_is_iovec(i))) - return copy_page_from_iter_iovec(page, offset, bytes, i); - if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { + if (page_copy_sane(page, offset, bytes)) { void *kaddr = kmap_local_page(page); size_t wanted = _copy_from_iter(kaddr + offset, bytes, i); kunmap_local(kaddr); return wanted; } - WARN_ON(1); return 0; } EXPORT_SYMBOL(copy_page_from_iter); -- cgit v1.2.3 From c3497fd009ef2c59eea60d21c3ac22de3585ed7d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2022 19:50:29 -0400 Subject: fix short copy handling in copy_mc_pipe_to_iter() Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can result in a short copy. 
In that case we need to trim the unused buffers, as well as the length of partially filled one - it's not enough to set ->head, ->iov_offset and ->count to reflect how much had we copied. Not hard to fix, fortunately... I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h, rather than iov_iter.c - it has nothing to do with iov_iter and having it will allow us to avoid an ugly kludge in fs/splice.c. We could put it into lib/iov_iter.c for now and move it later, but I don't see the point going that way... Cc: stable@kernel.org # 4.19+ Fixes: ca146f6f091e "lib/iov_iter: Fix pipe handling in _copy_to_iter_mcsafe()" Reviewed-by: Jeff Layton Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- lib/iov_iter.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 0b64695ab632..2bf20b48a04a 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -689,6 +689,7 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, struct pipe_inode_info *pipe = i->pipe; unsigned int p_mask = pipe->ring_size - 1; unsigned int i_head; + unsigned int valid = pipe->head; size_t n, off, xfer = 0; if (!sanity(i)) @@ -702,11 +703,17 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, rem = copy_mc_to_kernel(p + off, addr + xfer, chunk); chunk -= rem; kunmap_local(p); - i->head = i_head; - i->iov_offset = off + chunk; - xfer += chunk; - if (rem) + if (chunk) { + i->head = i_head; + i->iov_offset = off + chunk; + xfer += chunk; + valid = i_head + 1; + } + if (rem) { + pipe->bufs[i_head & p_mask].len -= rem; + pipe_discard_from(pipe, valid); break; + } n -= chunk; off = 0; i_head++; -- cgit v1.2.3 From 18fa9af7263164ec9a8d7b28a848324825f14672 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Jun 2022 23:44:33 -0400 Subject: iov_iter_bvec_advance(): don't bother with bvec_iter do what we do for iovec/kvec; that ends up generating better code, 
AFAICS. Reviewed-by: Jeff Layton Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- lib/iov_iter.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 4c658a25e29c..c51314639615 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -846,17 +846,22 @@ static void pipe_advance(struct iov_iter *i, size_t size) static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { - struct bvec_iter bi; + const struct bio_vec *bvec, *end; - bi.bi_size = i->count; - bi.bi_bvec_done = i->iov_offset; - bi.bi_idx = 0; - bvec_iter_advance(i->bvec, &bi, size); + if (!i->count) + return; + i->count -= size; + + size += i->iov_offset; - i->bvec += bi.bi_idx; - i->nr_segs -= bi.bi_idx; - i->count = bi.bi_size; - i->iov_offset = bi.bi_bvec_done; + for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { + if (likely(size < bvec->bv_len)) + break; + size -= bvec->bv_len; + } + i->iov_offset = size; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) -- cgit v1.2.3 From 7392ed1734c319150b5ddec3f192a6405728e8d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Jun 2022 16:44:21 -0400 Subject: iov_iter_get_pages{,_alloc}(): cap the maxsize with MAX_RW_COUNT All callers can and should handle iov_iter_get_pages() returning fewer pages than requested. All in-kernel ones do. And it makes the arithmetical overflow analysis much simpler... 
Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- lib/iov_iter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c51314639615..225b968ed8c5 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1348,6 +1348,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, maxsize = i->count; if (!maxsize) return 0; + if (maxsize > MAX_RW_COUNT) + maxsize = MAX_RW_COUNT; if (likely(iter_is_iovec(i))) { unsigned int gup_flags = 0; @@ -1474,6 +1476,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, maxsize = i->count; if (!maxsize) return 0; + if (maxsize > MAX_RW_COUNT) + maxsize = MAX_RW_COUNT; if (likely(iter_is_iovec(i))) { unsigned int gup_flags = 0; -- cgit v1.2.3 From 599a0bdd72f0a7ed5f55faef0ecdcd36cb1bc287 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Jun 2022 20:53:17 -0400 Subject: iov_iter: lift dealing with maxpages out of first_{iovec,bvec}_segment() caller can do that just as easily Signed-off-by: Al Viro --- lib/iov_iter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 225b968ed8c5..1b5e96ddddf3 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1295,7 +1295,7 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i, /* must be done on non-empty ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size, size_t *start, - size_t maxsize, unsigned maxpages) + size_t maxsize) { size_t skip; long k; @@ -1309,8 +1309,6 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, if (len > maxsize) len = maxsize; len += (*start = addr % PAGE_SIZE); - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; *size = len; return addr & PAGE_MASK; } @@ -1320,7 +1318,7 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, /* must be done on non-empty ITER_BVEC one */ static struct page 
*first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start, - size_t maxsize, unsigned maxpages) + size_t maxsize) { struct page *page; size_t skip = i->iov_offset, len; @@ -1331,8 +1329,6 @@ static struct page *first_bvec_segment(const struct iov_iter *i, skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; len += (*start = skip % PAGE_SIZE); - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; *size = len; return page; } @@ -1360,7 +1356,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (i->nofault) gup_flags |= FOLL_NOFAULT; - addr = first_iovec_segment(i, &len, start, maxsize, maxpages); + addr = first_iovec_segment(i, &len, start, maxsize); + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; n = DIV_ROUND_UP(len, PAGE_SIZE); res = get_user_pages_fast(addr, n, gup_flags, pages); if (unlikely(res <= 0)) @@ -1370,7 +1368,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (iov_iter_is_bvec(i)) { struct page *page; - page = first_bvec_segment(i, &len, start, maxsize, maxpages); + page = first_bvec_segment(i, &len, start, maxsize); + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; n = DIV_ROUND_UP(len, PAGE_SIZE); while (n--) get_page(*pages++ = page++); @@ -1488,7 +1488,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (i->nofault) gup_flags |= FOLL_NOFAULT; - addr = first_iovec_segment(i, &len, start, maxsize, ~0U); + addr = first_iovec_segment(i, &len, start, maxsize); n = DIV_ROUND_UP(len, PAGE_SIZE); p = get_pages_array(n); if (!p) @@ -1505,7 +1505,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (iov_iter_is_bvec(i)) { struct page *page; - page = first_bvec_segment(i, &len, start, maxsize, ~0U); + page = first_bvec_segment(i, &len, start, maxsize); n = DIV_ROUND_UP(len, PAGE_SIZE); *pages = p = get_pages_array(n); if (!p) -- cgit v1.2.3 From dda8e5d17c170415a3c10f68365f3a2800a6e68f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Jun 2022 15:55:19 -0400 
Subject: iov_iter: first_{iovec,bvec}_segment() - simplify a bit We return length + offset in page via *size. Don't bother - the caller can do that arithmetics just as well; just report the length to it. Signed-off-by: Al Viro --- lib/iov_iter.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 1b5e96ddddf3..45dccecae946 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1308,7 +1308,7 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, continue; if (len > maxsize) len = maxsize; - len += (*start = addr % PAGE_SIZE); + *start = addr % PAGE_SIZE; *size = len; return addr & PAGE_MASK; } @@ -1328,7 +1328,7 @@ static struct page *first_bvec_segment(const struct iov_iter *i, len = maxsize; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; - len += (*start = skip % PAGE_SIZE); + *start = skip % PAGE_SIZE; *size = len; return page; } @@ -1357,24 +1357,24 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &len, start, maxsize); - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; - n = DIV_ROUND_UP(len, PAGE_SIZE); + n = DIV_ROUND_UP(len + *start, PAGE_SIZE); + if (n > maxpages) + n = maxpages; res = get_user_pages_fast(addr, n, gup_flags, pages); if (unlikely(res <= 0)) return res; - return (res == n ? 
len : res * PAGE_SIZE) - *start; + return min_t(size_t, len, res * PAGE_SIZE - *start); } if (iov_iter_is_bvec(i)) { struct page *page; page = first_bvec_segment(i, &len, start, maxsize); - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; - n = DIV_ROUND_UP(len, PAGE_SIZE); - while (n--) + n = DIV_ROUND_UP(len + *start, PAGE_SIZE); + if (n > maxpages) + n = maxpages; + for (int k = 0; k < n; k++) get_page(*pages++ = page++); - return len - *start; + return min_t(size_t, len, n * PAGE_SIZE - *start); } if (iov_iter_is_pipe(i)) return pipe_get_pages(i, pages, maxsize, maxpages, start); @@ -1489,7 +1489,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &len, start, maxsize); - n = DIV_ROUND_UP(len, PAGE_SIZE); + n = DIV_ROUND_UP(len + *start, PAGE_SIZE); p = get_pages_array(n); if (!p) return -ENOMEM; @@ -1500,19 +1500,19 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, return res; } *pages = p; - return (res == n ? len : res * PAGE_SIZE) - *start; + return min_t(size_t, len, res * PAGE_SIZE - *start); } if (iov_iter_is_bvec(i)) { struct page *page; page = first_bvec_segment(i, &len, start, maxsize); - n = DIV_ROUND_UP(len, PAGE_SIZE); + n = DIV_ROUND_UP(len + *start, PAGE_SIZE); *pages = p = get_pages_array(n); if (!p) return -ENOMEM; - while (n--) + for (int k = 0; k < n; k++) get_page(*p++ = page++); - return len - *start; + return min_t(size_t, len, n * PAGE_SIZE - *start); } if (iov_iter_is_pipe(i)) return pipe_get_pages_alloc(i, pages, maxsize, start); -- cgit v1.2.3 From 59dbd7d0904a887ede1538b55bb8095ff2ce5078 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Jun 2022 16:10:37 -0400 Subject: iov_iter: massage calling conventions for first_{iovec,bvec}_segment() Pass maxsize by reference, return length via the same. 
Signed-off-by: Al Viro --- lib/iov_iter.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 45dccecae946..d93c6a1ffe26 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1294,8 +1294,7 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i, /* must be done on non-empty ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, - size_t *size, size_t *start, - size_t maxsize) + size_t *size, size_t *start) { size_t skip; long k; @@ -1306,10 +1305,9 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, if (unlikely(!len)) continue; - if (len > maxsize) - len = maxsize; + if (*size > len) + *size = len; *start = addr % PAGE_SIZE; - *size = len; return addr & PAGE_MASK; } BUG(); // if it had been empty, we wouldn't get called @@ -1317,19 +1315,17 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, - size_t *size, size_t *start, - size_t maxsize) + size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; - if (len > maxsize) - len = maxsize; + if (*size > len) + *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; - *size = len; return page; } @@ -1337,7 +1333,6 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { - size_t len; int n, res; if (maxsize > i->count) @@ -1356,25 +1351,25 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (i->nofault) gup_flags |= FOLL_NOFAULT; - addr = first_iovec_segment(i, &len, start, maxsize); - n = DIV_ROUND_UP(len + *start, PAGE_SIZE); + addr = first_iovec_segment(i, &maxsize, start); + n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); if (n > maxpages) 
n = maxpages; res = get_user_pages_fast(addr, n, gup_flags, pages); if (unlikely(res <= 0)) return res; - return min_t(size_t, len, res * PAGE_SIZE - *start); + return min_t(size_t, maxsize, res * PAGE_SIZE - *start); } if (iov_iter_is_bvec(i)) { struct page *page; - page = first_bvec_segment(i, &len, start, maxsize); - n = DIV_ROUND_UP(len + *start, PAGE_SIZE); + page = first_bvec_segment(i, &maxsize, start); + n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); if (n > maxpages) n = maxpages; for (int k = 0; k < n; k++) get_page(*pages++ = page++); - return min_t(size_t, len, n * PAGE_SIZE - *start); + return min_t(size_t, maxsize, n * PAGE_SIZE - *start); } if (iov_iter_is_pipe(i)) return pipe_get_pages(i, pages, maxsize, maxpages, start); @@ -1469,7 +1464,6 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, size_t *start) { struct page **p; - size_t len; int n, res; if (maxsize > i->count) @@ -1488,8 +1482,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (i->nofault) gup_flags |= FOLL_NOFAULT; - addr = first_iovec_segment(i, &len, start, maxsize); - n = DIV_ROUND_UP(len + *start, PAGE_SIZE); + addr = first_iovec_segment(i, &maxsize, start); + n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); p = get_pages_array(n); if (!p) return -ENOMEM; @@ -1500,19 +1494,19 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, return res; } *pages = p; - return min_t(size_t, len, res * PAGE_SIZE - *start); + return min_t(size_t, maxsize, res * PAGE_SIZE - *start); } if (iov_iter_is_bvec(i)) { struct page *page; - page = first_bvec_segment(i, &len, start, maxsize); - n = DIV_ROUND_UP(len + *start, PAGE_SIZE); + page = first_bvec_segment(i, &maxsize, start); + n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); *pages = p = get_pages_array(n); if (!p) return -ENOMEM; for (int k = 0; k < n; k++) get_page(*p++ = page++); - return min_t(size_t, len, n * PAGE_SIZE - *start); + return min_t(size_t, maxsize, n * PAGE_SIZE - *start); } if (iov_iter_is_pipe(i)) return 
pipe_get_pages_alloc(i, pages, maxsize, start); -- cgit v1.2.3 From dd45ab9dd28c82fc495d98cd9788666fd8d76b99 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2022 16:07:49 -0400 Subject: first_iovec_segment(): just return address ... and calculate the offset in the caller Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- lib/iov_iter.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index d93c6a1ffe26..a4a44065cd37 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1293,22 +1293,19 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i, } /* must be done on non-empty ITER_IOVEC one */ -static unsigned long first_iovec_segment(const struct iov_iter *i, - size_t *size, size_t *start) +static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { - unsigned long addr = (unsigned long)i->iov[k].iov_base + skip; size_t len = i->iov[k].iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; - *start = addr % PAGE_SIZE; - return addr & PAGE_MASK; + return (unsigned long)i->iov[k].iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } @@ -1351,7 +1348,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (i->nofault) gup_flags |= FOLL_NOFAULT; - addr = first_iovec_segment(i, &maxsize, start); + addr = first_iovec_segment(i, &maxsize); + *start = addr % PAGE_SIZE; + addr &= PAGE_MASK; n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); if (n > maxpages) n = maxpages; @@ -1482,7 +1481,9 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (i->nofault) gup_flags |= FOLL_NOFAULT; - addr = first_iovec_segment(i, &maxsize, start); + addr = first_iovec_segment(i, &maxsize); + *start = addr % PAGE_SIZE; + addr &= PAGE_MASK; n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); p = get_pages_array(n); if (!p) -- cgit v1.2.3 From 
fcb14cb1bdacec5b4374fe161e83fb8208164a85 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 May 2022 14:59:25 -0400 Subject: new iov_iter flavour - ITER_UBUF Equivalent of single-segment iovec. Initialized by iov_iter_ubuf(), checked for by iter_is_ubuf(), otherwise behaves like ITER_IOVEC ones. We are going to expose the things like ->write_iter() et.al. to those in subsequent commits. New predicate (user_backed_iter()) that is true for ITER_IOVEC and ITER_UBUF; places like direct-IO handling should use that for checking that pages we modify after getting them from iov_iter_get_pages() would need to be dirtied. DO NOT assume that replacing iter_is_iovec() with user_backed_iter() will solve all problems - there's code that uses iter_is_iovec() to decide how to poke around in iov_iter guts and for that the predicate replacement obviously won't suffice. Signed-off-by: Al Viro --- lib/iov_iter.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 18 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 0e0be334dbee..b3493d20536e 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -16,6 +16,16 @@ #define PIPE_PARANOIA /* for now */ +/* covers ubuf and kbuf alike */ +#define iterate_buf(i, n, base, len, off, __p, STEP) { \ + size_t __maybe_unused off = 0; \ + len = n; \ + base = __p + i->iov_offset; \ + len -= (STEP); \ + i->iov_offset += len; \ + n = len; \ +} + /* covers iovec and kvec alike */ #define iterate_iovec(i, n, base, len, off, __p, STEP) { \ size_t off = 0; \ @@ -110,7 +120,12 @@ __out: \ if (unlikely(i->count < n)) \ n = i->count; \ if (likely(n)) { \ - if (likely(iter_is_iovec(i))) { \ + if (likely(iter_is_ubuf(i))) { \ + void __user *base; \ + size_t len; \ + iterate_buf(i, n, base, len, off, \ + i->ubuf, (I)) \ + } else if (likely(iter_is_iovec(i))) { \ const struct iovec *iov = i->iov; \ void __user *base; \ size_t len; \ @@ -275,7 +290,11 @@ out: */ size_t 
fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { - if (iter_is_iovec(i)) { + if (iter_is_ubuf(i)) { + size_t n = min(size, iov_iter_count(i)); + n -= fault_in_readable(i->ubuf + i->iov_offset, n); + return size - n; + } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; @@ -314,7 +333,11 @@ EXPORT_SYMBOL(fault_in_iov_iter_readable); */ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { - if (iter_is_iovec(i)) { + if (iter_is_ubuf(i)) { + size_t n = min(size, iov_iter_count(i)); + n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); + return size - n; + } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; @@ -345,6 +368,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, + .user_backed = true, .data_source = direction, .iov = iov, .nr_segs = nr_segs, @@ -494,7 +518,7 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter(addr, bytes, i); - if (iter_is_iovec(i)) + if (user_backed_iter(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, copyout(base, addr + off, len), @@ -583,7 +607,7 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(iov_iter_is_pipe(i))) return copy_mc_pipe_to_iter(addr, bytes, i); - if (iter_is_iovec(i)) + if (user_backed_iter(i)) might_fault(); __iterate_and_advance(i, bytes, base, len, off, copyout_mc(base, addr + off, len), @@ -601,7 +625,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) WARN_ON(1); return 0; } - if (iter_is_iovec(i)) + if (user_backed_iter(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, copyin(addr + off, base, len), @@ -894,16 +918,16 @@ void iov_iter_advance(struct iov_iter *i, size_t size) { if 
(unlikely(i->count < size)) size = i->count; - if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { + if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { + i->iov_offset += size; + i->count -= size; + } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_pipe(i)) { pipe_advance(i, size); - } else if (unlikely(iov_iter_is_xarray(i))) { - i->iov_offset += size; - i->count -= size; } else if (iov_iter_is_discard(i)) { i->count -= size; } @@ -950,7 +974,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) return; } unroll -= i->iov_offset; - if (iov_iter_is_xarray(i)) { + if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. @@ -1158,6 +1182,14 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { + if (likely(iter_is_ubuf(i))) { + if (i->count & len_mask) + return false; + if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) + return false; + return true; + } + if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask); @@ -1233,6 +1265,13 @@ static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) unsigned long iov_iter_alignment(const struct iov_iter *i) { + if (likely(iter_is_ubuf(i))) { + size_t size = i->count; + if (size) + return ((unsigned long)i->ubuf + i->iov_offset) | size; + return 0; + } + /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); @@ -1263,6 +1302,9 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) size_t size = i->count; unsigned k; + if 
(iter_is_ubuf(i)) + return 0; + if (WARN_ON(!iter_is_iovec(i))) return ~0U; @@ -1385,12 +1427,15 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i, return min_t(size_t, nr * PAGE_SIZE - offset, maxsize); } -/* must be done on non-empty ITER_IOVEC one */ +/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; + if (iter_is_ubuf(i)) + return (unsigned long)i->ubuf + i->iov_offset; + for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { size_t len = i->iov[k].iov_len - skip; @@ -1432,7 +1477,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; - if (likely(iter_is_iovec(i))) { + if (likely(user_backed_iter(i))) { unsigned int gup_flags = 0; unsigned long addr; @@ -1559,7 +1604,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; - if (likely(iter_is_iovec(i))) { + if (likely(user_backed_iter(i))) { unsigned int gup_flags = 0; unsigned long addr; @@ -1715,6 +1760,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; + if (likely(iter_is_ubuf(i))) { + unsigned offs = offset_in_page(i->ubuf + i->iov_offset); + int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); + return min(npages, maxpages); + } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); @@ -1749,17 +1799,16 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) WARN_ON(1); return NULL; } - if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new))) - return NULL; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); - else + else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->iov = kmemdup(new->iov, 
new->nr_segs * sizeof(struct iovec), flags); + return NULL; } EXPORT_SYMBOL(dup_iter); @@ -1953,10 +2002,12 @@ EXPORT_SYMBOL(import_single_range); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) && - !iov_iter_is_kvec(i)) + !iov_iter_is_kvec(i) && !iter_is_ubuf(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; + if (iter_is_ubuf(i)) + return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't -- cgit v1.2.3 From 2dcedb2a549a4d7430538213b1b28ef7271bc0aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jun 2022 10:24:37 -0400 Subject: ITER_PIPE: helper for getting pipe buffer by index pipe_buffer instances of a pipe are organized as a ring buffer, with power-of-2 size. Indices are kept *not* reduced modulo ring size, so the buffer referred to by index N is pipe->bufs[N & (pipe->ring_size - 1)]. Ring size can change over the lifetime of a pipe, but not while the pipe is locked. So for any iov_iter primitives it's a constant. Original conversion of pipes to this layout went overboard trying to microoptimize that - calculating pipe->ring_size - 1, storing it in a local variable and using it throughout the function. In some cases it might be warranted, but most of the time it only obfuscates what's going on in there. Introduce a helper (pipe_buf(pipe, N)) that would encapsulate that and use it in the obvious cases. More will follow... 
Reviewed-by: Jeff Layton Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- lib/iov_iter.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b3493d20536e..048026d5aa0d 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -183,13 +183,18 @@ static int copyin(void *to, const void __user *from, size_t n) return n; } +static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, + unsigned int slot) +{ + return &pipe->bufs[slot & (pipe->ring_size - 1)]; +} + #ifdef PIPE_PARANOIA static bool sanity(const struct iov_iter *i) { struct pipe_inode_info *pipe = i->pipe; unsigned int p_head = pipe->head; unsigned int p_tail = pipe->tail; - unsigned int p_mask = pipe->ring_size - 1; unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); unsigned int i_head = i->head; unsigned int idx; @@ -201,7 +206,7 @@ static bool sanity(const struct iov_iter *i) if (unlikely(i_head != p_head - 1)) goto Bad; // must be at the last buffer... - p = &pipe->bufs[i_head & p_mask]; + p = pipe_buf(pipe, i_head); if (unlikely(p->offset + p->len != i->iov_offset)) goto Bad; // ... 
at the end of segment } else { @@ -386,11 +391,10 @@ static inline bool allocated(struct pipe_buffer *buf) static inline void data_start(const struct iov_iter *i, unsigned int *iter_headp, size_t *offp) { - unsigned int p_mask = i->pipe->ring_size - 1; unsigned int iter_head = i->head; size_t off = i->iov_offset; - if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) || + if (off && (!allocated(pipe_buf(i->pipe, iter_head)) || off == PAGE_SIZE)) { iter_head++; off = 0; @@ -1280,10 +1284,9 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) return iov_iter_alignment_bvec(i); if (iov_iter_is_pipe(i)) { - unsigned int p_mask = i->pipe->ring_size - 1; size_t size = i->count; - if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask])) + if (size && i->iov_offset && allocated(pipe_buf(i->pipe, i->head))) return size | i->iov_offset; return size; } -- cgit v1.2.3 From 47b7fcae419dc940e3fb8e58088a5b80ad813bbf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Jun 2022 14:30:15 -0400 Subject: ITER_PIPE: helpers for adding pipe buffers There are only two kinds of pipe_buffer in the area used by ITER_PIPE. 1) anonymous - copy_to_iter() et al. end up creating those and copying data there. They have zero ->offset, and their ->ops points to default_pipe_buf_ops. 2) zero-copy ones - those come from copy_page_to_iter(), and page comes from caller. ->offset is also caller-supplied - it might be non-zero. ->ops points to page_cache_pipe_buf_ops. Move creation and insertion of those into helpers - push_anon(pipe, size) and push_page(pipe, page, offset, size) resp., separating them from the "could we avoid creating a new buffer by merging with the current head?" logic. 
Acked-by: Jeff Layton Signed-off-by: Al Viro --- lib/iov_iter.c | 88 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 42 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 048026d5aa0d..a5c436e564e8 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -231,15 +231,39 @@ Bad: #define sanity(i) true #endif +static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size) +{ + struct page *page = alloc_page(GFP_USER); + if (page) { + struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); + *buf = (struct pipe_buffer) { + .ops = &default_pipe_buf_ops, + .page = page, + .offset = 0, + .len = size + }; + } + return page; +} + +static void push_page(struct pipe_inode_info *pipe, struct page *page, + unsigned int offset, unsigned int size) +{ + struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); + *buf = (struct pipe_buffer) { + .ops = &page_cache_pipe_buf_ops, + .page = page, + .offset = offset, + .len = size + }; + get_page(page); +} + static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { struct pipe_inode_info *pipe = i->pipe; - struct pipe_buffer *buf; - unsigned int p_tail = pipe->tail; - unsigned int p_mask = pipe->ring_size - 1; - unsigned int i_head = i->head; - size_t off; + unsigned int head = pipe->head; if (unlikely(bytes > i->count)) bytes = i->count; @@ -250,32 +274,21 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by if (!sanity(i)) return 0; - off = i->iov_offset; - buf = &pipe->bufs[i_head & p_mask]; - if (off) { - if (offset == off && buf->page == page) { - /* merge with the last one */ + if (offset && i->iov_offset == offset) { // could we merge it? 
+ struct pipe_buffer *buf = pipe_buf(pipe, head - 1); + if (buf->page == page) { buf->len += bytes; i->iov_offset += bytes; - goto out; + i->count -= bytes; + return bytes; } - i_head++; - buf = &pipe->bufs[i_head & p_mask]; } - if (pipe_full(i_head, p_tail, pipe->max_usage)) + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; - buf->ops = &page_cache_pipe_buf_ops; - buf->flags = 0; - get_page(page); - buf->page = page; - buf->offset = offset; - buf->len = bytes; - - pipe->head = i_head + 1; + push_page(pipe, page, offset, bytes); i->iov_offset = offset + bytes; - i->head = i_head; -out: + i->head = head; i->count -= bytes; return bytes; } @@ -407,8 +420,6 @@ static size_t push_pipe(struct iov_iter *i, size_t size, int *iter_headp, size_t *offp) { struct pipe_inode_info *pipe = i->pipe; - unsigned int p_tail = pipe->tail; - unsigned int p_mask = pipe->ring_size - 1; unsigned int iter_head; size_t off; ssize_t left; @@ -423,30 +434,23 @@ static size_t push_pipe(struct iov_iter *i, size_t size, *iter_headp = iter_head; *offp = off; if (off) { + struct pipe_buffer *buf = pipe_buf(pipe, iter_head); + left -= PAGE_SIZE - off; if (left <= 0) { - pipe->bufs[iter_head & p_mask].len += size; + buf->len += size; return size; } - pipe->bufs[iter_head & p_mask].len = PAGE_SIZE; - iter_head++; + buf->len = PAGE_SIZE; } - while (!pipe_full(iter_head, p_tail, pipe->max_usage)) { - struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask]; - struct page *page = alloc_page(GFP_USER); + while (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + struct page *page = push_anon(pipe, + min_t(ssize_t, left, PAGE_SIZE)); if (!page) break; - buf->ops = &default_pipe_buf_ops; - buf->flags = 0; - buf->page = page; - buf->offset = 0; - buf->len = min_t(ssize_t, left, PAGE_SIZE); - left -= buf->len; - iter_head++; - pipe->head = iter_head; - - if (left == 0) + left -= PAGE_SIZE; + if (left <= 0) return size; } return size - left; -- cgit v1.2.3 From 
8fad7767edcfd3f93e0d9985cb2dc1db270b8719 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jun 2022 13:53:53 -0400 Subject: ITER_PIPE: allocate buffers as we go in copy-to-pipe primitives New helper: append_pipe(). Extends the last buffer if possible, allocates a new one otherwise. Returns page and offset in it on success, NULL on failure. iov_iter is advanced past the data we've got. Use that instead of push_pipe() in copy-to-pipe primitives; they get simpler that way. Handling of short copy (in "mc" one) is done simply by iov_iter_revert() - iov_iter is in consistent state after that one, so we can use that. [Fix for braino caught by Liu Xinpeng folded in] [another braino fix, this time in copy_pipe_to_iter() and pipe_zero(); caught by testcase from Hugh Dickins] Signed-off-by: Al Viro --- lib/iov_iter.c | 171 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 98 insertions(+), 73 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index a5c436e564e8..e22c272cb420 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -259,6 +259,45 @@ static void push_page(struct pipe_inode_info *pipe, struct page *page, get_page(page); } +static inline bool allocated(struct pipe_buffer *buf) +{ + return buf->ops == &default_pipe_buf_ops; +} + +static struct page *append_pipe(struct iov_iter *i, size_t size, + unsigned int *off) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t offset = i->iov_offset; + struct pipe_buffer *buf; + struct page *page; + + if (offset && offset < PAGE_SIZE) { + // some space in the last buffer; can we add to it? 
+ buf = pipe_buf(pipe, pipe->head - 1); + if (allocated(buf)) { + size = min_t(size_t, size, PAGE_SIZE - offset); + buf->len += size; + i->iov_offset += size; + i->count -= size; + *off = offset; + return buf->page; + } + } + // OK, we need a new buffer + *off = 0; + size = min_t(size_t, size, PAGE_SIZE); + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + return NULL; + page = push_anon(pipe, size); + if (!page) + return NULL; + i->head = pipe->head - 1; + i->iov_offset = size; + i->count -= size; + return page; +} + static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -396,11 +435,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_init); -static inline bool allocated(struct pipe_buffer *buf) -{ - return buf->ops == &default_pipe_buf_ops; -} - static inline void data_start(const struct iov_iter *i, unsigned int *iter_headp, size_t *offp) { @@ -459,28 +493,24 @@ static size_t push_pipe(struct iov_iter *i, size_t size, static size_t copy_pipe_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { - struct pipe_inode_info *pipe = i->pipe; - unsigned int p_mask = pipe->ring_size - 1; - unsigned int i_head; - size_t n, off; + unsigned int off, chunk; - if (!sanity(i)) + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) return 0; - bytes = n = push_pipe(i, bytes, &i_head, &off); - if (unlikely(!n)) + if (!sanity(i)) return 0; - do { - size_t chunk = min_t(size_t, n, PAGE_SIZE - off); - memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk); - i->head = i_head; - i->iov_offset = off + chunk; - n -= chunk; + + for (size_t n = bytes; n; n -= chunk) { + struct page *page = append_pipe(i, n, &off); + chunk = min_t(size_t, n, PAGE_SIZE - off); + if (!page) + return bytes - n; + memcpy_to_page(page, off, addr, chunk); addr += chunk; - off = 0; - i_head++; - } while (n); - i->count -= bytes; + } return bytes; } @@ -494,31 
+524,32 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len, static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, struct iov_iter *i, __wsum *sump) { - struct pipe_inode_info *pipe = i->pipe; - unsigned int p_mask = pipe->ring_size - 1; __wsum sum = *sump; size_t off = 0; - unsigned int i_head; - size_t r; + unsigned int chunk, r; + + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) + return 0; if (!sanity(i)) return 0; - bytes = push_pipe(i, bytes, &i_head, &r); while (bytes) { - size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r); - char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page); + struct page *page = append_pipe(i, bytes, &r); + char *p; + + if (!page) + break; + chunk = min_t(size_t, bytes, PAGE_SIZE - r); + p = kmap_local_page(page); sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off); kunmap_local(p); - i->head = i_head; - i->iov_offset = r + chunk; - bytes -= chunk; off += chunk; - r = 0; - i_head++; + bytes -= chunk; } *sump = sum; - i->count -= off; return off; } @@ -550,39 +581,36 @@ static int copyout_mc(void __user *to, const void *from, size_t n) static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { - struct pipe_inode_info *pipe = i->pipe; - unsigned int p_mask = pipe->ring_size - 1; - unsigned int i_head; - unsigned int valid = pipe->head; - size_t n, off, xfer = 0; + size_t xfer = 0; + unsigned int off, chunk; + + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) + return 0; if (!sanity(i)) return 0; - n = push_pipe(i, bytes, &i_head, &off); - while (n) { - size_t chunk = min_t(size_t, n, PAGE_SIZE - off); - char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page); + while (bytes) { + struct page *page = append_pipe(i, bytes, &off); unsigned long rem; + char *p; + + if (!page) + break; + chunk = min_t(size_t, bytes, PAGE_SIZE - off); + p = kmap_local_page(page); rem = copy_mc_to_kernel(p + off, addr + 
xfer, chunk); chunk -= rem; kunmap_local(p); - if (chunk) { - i->head = i_head; - i->iov_offset = off + chunk; - xfer += chunk; - valid = i_head + 1; - } + xfer += chunk; + bytes -= chunk; if (rem) { - pipe->bufs[i_head & p_mask].len -= rem; - pipe_discard_from(pipe, valid); + iov_iter_revert(i, rem); break; } - n -= chunk; - off = 0; - i_head++; } - i->count -= xfer; return xfer; } @@ -769,30 +797,27 @@ EXPORT_SYMBOL(copy_page_from_iter); static size_t pipe_zero(size_t bytes, struct iov_iter *i) { - struct pipe_inode_info *pipe = i->pipe; - unsigned int p_mask = pipe->ring_size - 1; - unsigned int i_head; - size_t n, off; + unsigned int chunk, off; - if (!sanity(i)) + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) return 0; - bytes = n = push_pipe(i, bytes, &i_head, &off); - if (unlikely(!n)) + if (!sanity(i)) return 0; - do { - size_t chunk = min_t(size_t, n, PAGE_SIZE - off); - char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page); + for (size_t n = bytes; n; n -= chunk) { + struct page *page = append_pipe(i, n, &off); + char *p; + + if (!page) + return bytes - n; + chunk = min_t(size_t, n, PAGE_SIZE - off); + p = kmap_local_page(page); memset(p + off, 0, chunk); kunmap_local(p); - i->head = i_head; - i->iov_offset = off + chunk; - n -= chunk; - off = 0; - i_head++; - } while (n); - i->count -= bytes; + } return bytes; } -- cgit v1.2.3 From e3b42964f84c028f352c11269661d47f6ca4ab2e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Jun 2022 02:52:03 -0400 Subject: ITER_PIPE: fold push_pipe() into __pipe_get_pages() Expand the only remaining call of push_pipe() (in __pipe_get_pages()), combine it with the page-collecting loop there. Note that the only reason it's not a loop doing append_pipe() is that append_pipe() is advancing, while iov_iter_get_pages() is not. As soon as it switches to saner semantics, this thing will switch to using append_pipe(). 
Signed-off-by: Al Viro --- lib/iov_iter.c | 80 ++++++++++++++++++---------------------------------------- 1 file changed, 25 insertions(+), 55 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e22c272cb420..bf600b4fe980 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -450,46 +450,6 @@ static inline void data_start(const struct iov_iter *i, *offp = off; } -static size_t push_pipe(struct iov_iter *i, size_t size, - int *iter_headp, size_t *offp) -{ - struct pipe_inode_info *pipe = i->pipe; - unsigned int iter_head; - size_t off; - ssize_t left; - - if (unlikely(size > i->count)) - size = i->count; - if (unlikely(!size)) - return 0; - - left = size; - data_start(i, &iter_head, &off); - *iter_headp = iter_head; - *offp = off; - if (off) { - struct pipe_buffer *buf = pipe_buf(pipe, iter_head); - - left -= PAGE_SIZE - off; - if (left <= 0) { - buf->len += size; - return size; - } - buf->len = PAGE_SIZE; - } - while (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { - struct page *page = push_anon(pipe, - min_t(ssize_t, left, PAGE_SIZE)); - if (!page) - break; - - left -= PAGE_SIZE; - if (left <= 0) - return size; - } - return size - left; -} - static size_t copy_pipe_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { @@ -1359,23 +1319,33 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i, size_t maxsize, struct page **pages, int iter_head, - size_t *start) + size_t off) { struct pipe_inode_info *pipe = i->pipe; - unsigned int p_mask = pipe->ring_size - 1; - ssize_t n = push_pipe(i, maxsize, &iter_head, start); - if (!n) - return -EFAULT; + ssize_t left = maxsize; - maxsize = n; - n += *start; - while (n > 0) { - get_page(*pages++ = pipe->bufs[iter_head & p_mask].page); - iter_head++; - n -= PAGE_SIZE; - } + if (off) { + struct pipe_buffer *buf = pipe_buf(pipe, iter_head); - return maxsize; + get_page(*pages++ = buf->page); + left -= PAGE_SIZE - off; + if (left <= 0) { + buf->len += maxsize; + 
return maxsize; + } + buf->len = PAGE_SIZE; + } + while (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + struct page *page = push_anon(pipe, + min_t(ssize_t, left, PAGE_SIZE)); + if (!page) + break; + get_page(*pages++ = page); + left -= PAGE_SIZE; + if (left <= 0) + return maxsize; + } + return maxsize - left ? : -EFAULT; } static ssize_t pipe_get_pages(struct iov_iter *i, @@ -1393,7 +1363,7 @@ static ssize_t pipe_get_pages(struct iov_iter *i, npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); capacity = min(npages, maxpages) * PAGE_SIZE - *start; - return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start); + return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, *start); } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, @@ -1575,7 +1545,7 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i, p = get_pages_array(npages); if (!p) return -ENOMEM; - n = __pipe_get_pages(i, maxsize, p, iter_head, start); + n = __pipe_get_pages(i, maxsize, p, iter_head, *start); if (n > 0) *pages = p; else -- cgit v1.2.3 From ca591967543ab1af7e6e68bd505ef7869d3f2175 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Jun 2022 14:26:23 -0400 Subject: ITER_PIPE: lose iter_head argument of __pipe_get_pages() it's only used to get to the partial buffer we can add to, and that's always the last one, i.e. pipe->head - 1. 
Signed-off-by: Al Viro --- lib/iov_iter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index bf600b4fe980..95c56d42505b 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1318,14 +1318,13 @@ EXPORT_SYMBOL(iov_iter_gap_alignment); static inline ssize_t __pipe_get_pages(struct iov_iter *i, size_t maxsize, struct page **pages, - int iter_head, size_t off) { struct pipe_inode_info *pipe = i->pipe; ssize_t left = maxsize; if (off) { - struct pipe_buffer *buf = pipe_buf(pipe, iter_head); + struct pipe_buffer *buf = pipe_buf(pipe, pipe->head - 1); get_page(*pages++ = buf->page); left -= PAGE_SIZE - off; @@ -1363,7 +1362,7 @@ static ssize_t pipe_get_pages(struct iov_iter *i, npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); capacity = min(npages, maxpages) * PAGE_SIZE - *start; - return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, *start); + return __pipe_get_pages(i, min(maxsize, capacity), pages, *start); } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, @@ -1545,7 +1544,7 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i, p = get_pages_array(npages); if (!p) return -ENOMEM; - n = __pipe_get_pages(i, maxsize, p, iter_head, *start); + n = __pipe_get_pages(i, maxsize, p, *start); if (n > 0) *pages = p; else -- cgit v1.2.3 From 2c855de93314e9573f31044976ffd89cb70a2dbd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 15 Jun 2022 16:03:25 -0400 Subject: ITER_PIPE: clean pipe_advance() up instead of setting ->iov_offset for new position and calling pipe_truncate() to adjust ->len of the last buffer and discard everything after it, adjust ->len at the same time we set ->iov_offset and use pipe_discard_from() to deal with buffers past that. 
Signed-off-by: Al Viro --- lib/iov_iter.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 95c56d42505b..402d49688a16 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -845,27 +845,27 @@ static inline void pipe_truncate(struct iov_iter *i) static void pipe_advance(struct iov_iter *i, size_t size) { struct pipe_inode_info *pipe = i->pipe; - if (size) { - struct pipe_buffer *buf; - unsigned int p_mask = pipe->ring_size - 1; - unsigned int i_head = i->head; - size_t off = i->iov_offset, left = size; + unsigned int off = i->iov_offset; + if (!off && !size) { + pipe_discard_from(pipe, i->start_head); // discard everything + return; + } + i->count -= size; + while (1) { + struct pipe_buffer *buf = pipe_buf(pipe, i->head); if (off) /* make it relative to the beginning of buffer */ - left += off - pipe->bufs[i_head & p_mask].offset; - while (1) { - buf = &pipe->bufs[i_head & p_mask]; - if (left <= buf->len) - break; - left -= buf->len; - i_head++; + size += off - buf->offset; + if (size <= buf->len) { + buf->len = size; + i->iov_offset = buf->offset + size; + break; } - i->head = i_head; - i->iov_offset = buf->offset + left; + size -= buf->len; + i->head++; + off = 0; } - i->count -= size; - /* ... and discard everything past that point */ - pipe_truncate(i); + pipe_discard_from(pipe, i->head + 1); // discard everything past this one } static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) -- cgit v1.2.3 From 92acdc4f37207c556baee0ea28ce0823d22b9812 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2022 17:54:35 -0400 Subject: ITER_PIPE: clean iov_iter_revert() Fold pipe_truncate() into it, clean up. We can release buffers in the same loop where we walk backwards to the iterator beginning looking for the place where the new position will be. 
Signed-off-by: Al Viro --- lib/iov_iter.c | 60 ++++++++++++++-------------------------------------------- 1 file changed, 14 insertions(+), 46 deletions(-) (limited to 'lib/iov_iter.c') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 402d49688a16..c2e08004a1eb 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -816,32 +816,6 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt } EXPORT_SYMBOL(copy_page_from_iter_atomic); -static inline void pipe_truncate(struct iov_iter *i) -{ - struct pipe_inode_info *pipe = i->pipe; - unsigned int p_tail = pipe->tail; - unsigned int p_head = pipe->head; - unsigned int p_mask = pipe->ring_size - 1; - - if (!pipe_empty(p_head, p_tail)) { - struct pipe_buffer *buf; - unsigned int i_head = i->head; - size_t off = i->iov_offset; - - if (off) { - buf = &pipe->bufs[i_head & p_mask]; - buf->len = off - buf->offset; - i_head++; - } - while (p_head != i_head) { - p_head--; - pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]); - } - - pipe->head = p_head; - } -} - static void pipe_advance(struct iov_iter *i, size_t size) { struct pipe_inode_info *pipe = i->pipe; @@ -936,28 +910,22 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) i->count += unroll; if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; - unsigned int p_mask = pipe->ring_size - 1; - unsigned int i_head = i->head; - size_t off = i->iov_offset; - while (1) { - struct pipe_buffer *b = &pipe->bufs[i_head & p_mask]; - size_t n = off - b->offset; - if (unroll < n) { - off -= unroll; - break; - } - unroll -= n; - if (!unroll && i_head == i->start_head) { - off = 0; - break; + unsigned int head = pipe->head; + + while (head > i->start_head) { + struct pipe_buffer *b = pipe_buf(pipe, --head); + if (unroll < b->len) { + b->len -= unroll; + i->iov_offset = b->offset + b->len; + i->head = head; + return; } - i_head--; - b = &pipe->bufs[i_head & p_mask]; - off = b->offset + b->len; + unroll -= b->len; + 
pipe_buf_release(pipe, b); + pipe->head--; } - i->iov_offset = off; - i->head = i_head; - pipe_truncate(i); + i->iov_offset = 0; + i->head = head; return; } if (unlikely(iov_iter_is_discard(i))) -- cgit v1.2.3 From 10f525a8cd7a525e9fc73288bb35428c9cad5e63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 15 Jun 2022 02:02:51 -0400 Subject: ITER_PIPE: cache the type of last buffer We often need to find whether the last buffer is anon or not, and currently it's rather clumsy: check if ->iov_offset is non-zero (i.e. that pipe is not empty) if so, get the corresponding pipe_buffer and check its ->ops if it's &default_pipe_buf_ops, we have an anon buffer. Let's replace the use of ->iov_offset (which is nowhere near similar to its role for other flavours) with signed field (->last_offset), with the following rules: empty, no buffers occupied: 0 anon, with bytes up to N-1 filled: N zero-copy, with bytes up to N-1 filled: -N That way abs(i->last_offset) is equal to what used to be in i->iov_offset and empty vs. anon vs. zero-copy can be distinguished by the sign of i->last_offset. Checks for "should we extend the last buffer or should we start a new one?" become easier to follow that way. Note that most of the operations can only be done in a sane state - i.e. when the pipe has nothing past the current position of iterator. About the only thing that could be done outside of that state is iov_iter_advance(), which transitions to the sane state by truncating the pipe. There are only two cases where we leave the sane state: 1) iov_iter_get_pages()/iov_iter_get_pages_alloc(). Will be dealt with later, when we make get_pages advancing - the callers are actually happier that way. 2) iov_iter copied, then something is put into the copy. Since they share the underlying pipe, the original gets behind. When we decide that we are done with the copy (original is not usable until then) we advance the original. 
direct_io used to be done that way; nowadays it operates on the original and we do iov_iter_revert() to discard the excessive data. At the moment there's nothing in the kernel that could do that to ITER_PIPE iterators, so this reason for insane state is theoretical right now. Signed-off-by: Al Viro --- lib/iov_iter.c | 77 +++++++++++++++++++++++++++++----------------------------- 1 file chan