diff options
author | Enzo Matsumiya <ematsumiya@suse.de> | 2025-03-26 17:48:27 -0300 |
---|---|---|
committer | Enzo Matsumiya <ematsumiya@suse.de> | 2025-03-26 17:48:27 -0300 |
commit | 8d4c40e084f3d132434d5d3d068175c8db59ce65 (patch) | |
tree | 34d7b026eaed6ab0d5958389afd31a2bf01813db | |
parent | 843e64492a7ed11436cc5c9bbfba46835939071a (diff) | |
download | linux-data_corruption_v6.x.tar.gz linux-data_corruption_v6.x.tar.bz2 linux-data_corruption_v6.x.zip |
smb: client: fix corruption in cifs_extend_writebackdata_corruption_v6.x
cifs.ko writepages implementation will try to extend the write buffer size in order to issue less,
but bigger write requests over the wire.
The function responsible for doing so, cifs_extend_writeback, however, did not account for some
important factors, and not handling some of those factors correctly lead to data corruption on
writes coming through writepages.
Such corrupt writes are very subtle and show no errors whatsoever on dmesg -- they can only be
observed by comparing expected vs actual outputs. Easy reproducer:
done | dd ibs=4194304 iflag=fullblock count=10240000 of=remotefile
8999946
<corrupt lines shows here>
'wc -l' is not really reliable as we've seen files with corrupt lines, but no missing ones.
Of course, the corruption doesn't happen with cache=none mount option.
Bug explanation:
- Pointer arguments are updated before bound checking (actual root cause)
@_len and @_count are updated with the current folio values before actually checking if the current
values fit in their boundaries, so by the time the function exits, the caller (only
cifs_write_back_from_locked_folio(), that BTW doesn't do any further checks) those arguments might
have crossed bounds and extra data (zeroes) are added as padding.
Later, with those offsets marked as 'done', the real actual data that should've been written into
those offsets are skipped, making the final file corrupt.
- Sync calls with ongoing writeback aren't sync
Folios are tested for ongoing writeback (folio_test_writeback), but not handled directly for
data-integrity sync syscalls (e.g. fsync() or msync()). When being called from those, and folio
*is* under writeback, we MUST wait for the writeback to complete because those calls must guarantee
the write went through.
By simply bailing out of the function, the implementation relies on the timing/luck that no further
errors happens later, and that the writeback indeed finished before returning.
- Any failed checks to the folios in @xas would call xas_reset
This means that whenever some/any folios were added to batch and processed, they are so again
in further write calls because @xas, making upper layers do double work on it.
This patch fixes the cases above, and also lessen the 'hard stop' conditions for cases where only a
single folio is affected, but others in @xas can still be processed (more of a performance
improvement).
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
-rw-r--r-- | fs/smb/client/file.c | 147 |
1 files changed, 104 insertions, 43 deletions
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index cb75b95efb70..eddd0dab44ed 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2719,15 +2719,17 @@ static void cifs_extend_writeback(struct address_space *mapping, loff_t start, int max_pages, loff_t max_len, - size_t *_len) + size_t *_len, + int sync_mode) { struct folio_batch batch; struct folio *folio; - unsigned int nr_pages; - pgoff_t index = (start + *_len) / PAGE_SIZE; - size_t len; - bool stop = true; - unsigned int i; + unsigned int nr_pages, i; + pgoff_t idx, index = (start + *_len) / PAGE_SIZE; + size_t len = *_len, flen; + bool stop = true, sync = (sync_mode != WB_SYNC_NONE); + long count = *_count; + int npages = max_pages; folio_batch_init(&batch); @@ -2742,59 +2744,110 @@ static void cifs_extend_writeback(struct address_space *mapping, stop = true; if (xas_retry(xas, folio)) continue; - if (xa_is_value(folio)) - break; - if (folio->index != index) { - xas_reset(xas); + if (xa_is_value(folio)) { + stop = false; break; } + if (folio_index(folio) != index) + goto xareset_next; - if (!folio_try_get(folio)) { - xas_reset(xas); - continue; - } - nr_pages = folio_nr_pages(folio); - if (nr_pages > max_pages) { - xas_reset(xas); - break; - } + if (!folio_try_get(folio)) + goto xareset_next; /* Has the page moved or been split? */ - if (unlikely(folio != xas_reload(xas))) { - folio_put(folio); - xas_reset(xas); - break; + if (unlikely(folio != xas_reload(xas) || folio->mapping != mapping)) { + stop = false; + goto put_next; } - if (!folio_trylock(folio)) { - folio_put(folio); - xas_reset(xas); - break; - } - if (!folio_test_dirty(folio) || - folio_test_writeback(folio)) { - folio_unlock(folio); - folio_put(folio); - xas_reset(xas); - break; + if (!folio_trylock(folio)) + goto put_next; + + nr_pages = folio_nr_pages(folio); + if (nr_pages > npages || nr_pages > count) + goto unlock_next; + + if (folio_test_writeback(folio)) { + /* + * For data-integrity syscalls (fsync(), msync()) we must wait for + * the I/O to complete on the page. + * For other cases (!sync), we can just skip this page, even if + * it's dirty. + */ + if (!sync) { + stop = false; + goto unlock_next; + } else { + folio_wait_writeback(folio); + + /* + * More I/O started meanwhile, bail out and write on the + * next call. + */ + if (WARN_ON_ONCE(folio_test_writeback(folio))) + goto unlock_next; + } } - max_pages -= nr_pages; - len = folio_size(folio); - stop = false; + /* + * We don't really have a boundary for index, so just check for overflow. + */ + if (check_add_overflow(index, nr_pages, &idx)) + goto unlock_next; + + flen = folio_size(folio); - index += nr_pages; - *_count -= nr_pages; - *_len += len; - if (max_pages <= 0 || *_len >= max_len || *_count <= 0) - stop = true; + /* Store sum in @flen so we don't have to undo it in case of failure. */ + if (check_add_overflow(len, flen, &flen) || flen > max_len) + goto unlock_next; + + index = idx; + len = flen; + + /* + * @npages and @count have been checked earlier (and are signed), so we + * can just subtract them here. + */ + npages -= nr_pages; + count -= nr_pages; + /* + * This is not an error; it just means we _did_ add this current folio, but + * can't add any more to this batch, so break out of this loop to start + * processing this batch, but don't stop the outer loop in case there are + * more folios to be processed in @xas. + */ + stop = false; if (!folio_batch_add(&batch, folio)) break; + + /* + * Folios added to the batch must be left locked for the loop below. They + * will be unlocked right away and also folio_batch_release() will take + * care of putting them. + */ + continue; +unlock_next: + folio_unlock(folio); +put_next: + folio_put(folio); +xareset_next: if (stop) break; } + /* + * Only reset @xas if we get here because of one of the stopping conditions above, + * namely: + * - couldn't lock/get a folio (someone else was processing the same folio) + * - folio was in writeback for too long (sync call was writing the same folio) + * - out of bounds index, len, or count (the last processed folio was partial and + * we can't fit it in this write request, so it shall be processed in the next + * write) + */ + if (stop) + xas_reset(xas); + xas_pause(xas); rcu_read_unlock(); @@ -2815,6 +2868,13 @@ static void cifs_extend_writeback(struct address_space *mapping, folio_unlock(folio); } + /* + * By now, data has been updated/written out to @mapping, so from this point of + * view we're done and we can safely update @_len and @_count. + */ + *_len = len; + *_count = count; + folio_batch_release(&batch); cond_resched(); } while (!stop); @@ -2902,7 +2962,8 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, if (max_pages > 0) cifs_extend_writeback(mapping, xas, &count, start, - max_pages, max_len, &len); + max_pages, max_len, &len, + wbc->sync_mode); } } len = min_t(unsigned long long, len, i_size - start); |