/*
* Partial Parity Log for closing the RAID5 write hole
* Copyright (c) 2017, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"
/*
* PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
* partial parity data. The header contains an array of entries
* (struct ppl_header_entry) which describe the logged write requests.
* Partial parity for the entries comes after the header, written in the same
* sequence as the entries:
*
* Header
* entry0
* ...
* entryN
* PP data
* PP for entry0
* ...
* PP for entryN
*
* An entry describes one or more consecutive stripe_heads, up to a full
* stripe. The modifed raid data chunks form an m-by-n matrix, where m is the
* number of stripe_heads in the entry and n is the number of modified data
* disks. Every stripe_head in the entry must write to the same data disks.
* An example of a valid case described by a single entry (writes to the first
* stripe of a 4 disk array, 16k chunk size):
*
* sh->sector dd0 dd1 dd2 ppl
* +-----+-----+-----+
* 0 | --- | --- | --- | +----+
* 8 | -W- | -W- | --- | | pp | data_sector = 8
* 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k
* 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k
* +-----+-----+-----+ +----+
*
* data_sector is the first raid sector of the modified data, data_size is the
* total size of modified data and pp_size is the size of partial parity for
* this entry. Entries for full stripe writes contain no partial parity
* (pp_size = 0), they only mark the stripes for which parity should be
* recalculated after an unclean shutdown. Every entry holds a checksum of its
* partial parity, the header also has a checksum of the header itself.
*
* A write request is always logged to the PPL instance stored on the parity
* disk of the corresponding stripe. For each member disk there is one ppl_log
* used to handle logging for this disk, independently from others. They are
* grouped in child_logs array in struct ppl_conf, which is assigned to
* r5conf->log_private.
*
* ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
* PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
* can be appended to the last entry if it meets the conditions for a valid
* entry described above, otherwise a new entry is added. Checksums of entries
* are calculated incrementally as stripes containing partial parity are being
* added. ppl_submit_iounit() calculates the checksum of the header and submits
* a bio containing the header page and partial parity pages (sh->ppl_page) for
* all stripes of the io_unit. When the PPL write completes, the stripes
* associated with the io_unit are released and raid5d starts writing their data
* and parity. When all stripes are written, the io_unit is freed and the next
* can be submitted.
*
* An io_unit is used to gather stripes until it is submitted or becomes full
* (if the maximum number of entries or size of PPL is reached). Another io_unit
* can't be submitted until the previous has completed (PPL and stripe
* data+parity is written). The log->io_list tracks all io_units of a log
* (for a single member disk). New io_units are added to the end of the list
* and the first io_unit is submitted, if it is not submitted already.
* The current io_unit accepting new stripes is always at the end of the list.
*/
struct ppl_conf {
struct mddev *mddev;
/* array of child logs, one for each raid disk */
struct ppl_log *child_logs;
int count;
int block_size; /* the logical block size used for data_sector
* in ppl_header_entry */
u32 signature; /* raid array identifier */
atomic64_t seq; /* current log write sequence number */
struct kmem_cache *io_kc;
mempool_t *io_pool;
struct bio_set *bs;
mempool_t *meta_pool;
/* used only for recovery */
int recovered_entries;
int mismatch_count;
/* stripes to retry if failed to allocate io_unit */
struct list_head no_mem_stripes;
spinlock_t no_mem_stripes_lock;
};
struct ppl_log {
struct ppl_conf *ppl_conf; /* shared between all log instances */
struct md_rdev *rdev; /* array member disk associated with
* this log instance */
struct mutex io_mutex;
struct ppl_io_unit *current_io; /* current io_unit accepting new data
* always at the end of io_list */
spinlock_t io_list_lock;
struct list_head io_list; /* all io_units of this log */
};
#define PPL_IO_INLINE_BVECS 32
struct ppl_io_unit {
struct ppl_log *log;
struct page *header_page; /* for ppl_header */
unsigned int entries_count; /* number of entries in ppl_header */
unsigned int pp_size; /* total size current of partial parity */
u64 seq; /* sequence number of this log write */
struct list_head log_sibling; /* log->io_list */
struct list_head stripe_list; /* stripes added to the io_unit */
atomic_t pending_stripes; /* how many stripes not written to raid */
bool submitted; /* true if write to log started */
/* inline bio and its biovec for submitting the iounit */
struct bio bio;
struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};
struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
int count = 0, pd_idx = sh->