// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2025, Christoph Hellwig.
* Copyright (c) 2025, Western Digital Corporation or its affiliates.
*
* Zoned Loop Device driver - exports a zoned block device using one file per
* zone as backing storage.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/blkzoned.h>
#include <linux/pagemap.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/mutex.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
/*
* Options for adding (and removing) a device.
*/
/*
 * Option flags: each is a bit recorded in zloop_options->mask to track
 * which options were present in the user-supplied option string.
 */
enum {
	ZLOOP_OPT_ERR			= 0,	/* no/unknown token matched */
	ZLOOP_OPT_ID			= (1 << 0),
	ZLOOP_OPT_CAPACITY		= (1 << 1),
	ZLOOP_OPT_ZONE_SIZE		= (1 << 2),
	ZLOOP_OPT_ZONE_CAPACITY		= (1 << 3),
	ZLOOP_OPT_NR_CONV_ZONES		= (1 << 4),
	ZLOOP_OPT_BASE_DIR		= (1 << 5),
	ZLOOP_OPT_NR_QUEUES		= (1 << 6),
	ZLOOP_OPT_QUEUE_DEPTH		= (1 << 7),
	ZLOOP_OPT_BUFFERED_IO		= (1 << 8),
	ZLOOP_OPT_ZONE_APPEND		= (1 << 9),
	ZLOOP_OPT_ORDERED_ZONE_APPEND	= (1 << 10),
};
/*
 * Token table for match_token(): maps each recognized option string to
 * its ZLOOP_OPT_* flag. The table must be NULL-terminated.
 */
static const match_table_t zloop_opt_tokens = {
	{ ZLOOP_OPT_ID,			"id=%d" },
	{ ZLOOP_OPT_CAPACITY,		"capacity_mb=%u" },
	{ ZLOOP_OPT_ZONE_SIZE,		"zone_size_mb=%u" },
	{ ZLOOP_OPT_ZONE_CAPACITY,	"zone_capacity_mb=%u" },
	{ ZLOOP_OPT_NR_CONV_ZONES,	"conv_zones=%u" },
	{ ZLOOP_OPT_BASE_DIR,		"base_dir=%s" },
	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u" },
	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u" },
	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io" },
	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u" },
	{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" },
	{ ZLOOP_OPT_ERR,		NULL }	/* sentinel */
};
/* Default values for the "add" operation. */
/* -1 means no explicit id requested — presumably one is allocated from the idr; confirm in the add path. */
#define ZLOOP_DEF_ID		-1
/* 256 MB zone size, expressed in 512-byte sectors. */
#define ZLOOP_DEF_ZONE_SIZE	((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES	64
#define ZLOOP_DEF_NR_CONV_ZONES	8
/* Directory under which per-device backing directories live. */
#define ZLOOP_DEF_BASE_DIR	"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES	1
#define ZLOOP_DEF_QUEUE_DEPTH	128
#define ZLOOP_DEF_BUFFERED_IO	false
#define ZLOOP_DEF_ZONE_APPEND	true
#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false
/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB	16384
/*
 * Parsed representation of the option string passed to the control
 * interface for the "add" (and "remove") operations.
 */
struct zloop_options {
	unsigned int	mask;		/* bitmask of ZLOOP_OPT_* flags seen */
	int		id;		/* device id, or ZLOOP_DEF_ID */
	sector_t	capacity;	/* total capacity, in 512B sectors */
	sector_t	zone_size;	/* zone size, in 512B sectors */
	sector_t	zone_capacity;	/* usable zone capacity, in 512B sectors */
	unsigned int	nr_conv_zones;	/* number of conventional zones */
	char		*base_dir;	/* backing directory path (allocated) */
	unsigned int	nr_queues;	/* number of blk-mq hardware queues */
	unsigned int	queue_depth;	/* per-queue depth */
	bool		buffered_io;	/* use buffered instead of direct I/O */
	bool		zone_append;	/* native zone-append support */
	bool		ordered_zone_append;
};
/*
* Device states.
*/
enum {
	Zlo_creating = 0,	/* device being set up, not yet visible */
	Zlo_live,		/* device fully initialized and usable */
	Zlo_deleting,		/* device teardown in progress */
};
/* Bits for zloop_zone->flags. */
enum zloop_zone_flags {
	ZLOOP_ZONE_CONV = 0,	/* conventional (non-sequential) zone */
	ZLOOP_ZONE_SEQ_ERROR,	/* zone state out of sync with backing file */
};
/*
 * Per-zone state. Each zone is backed by one file whose size reflects
 * the zone write pointer position.
 */
struct zloop_zone {
	struct file		*file;		/* backing file for this zone */
	unsigned long		flags;		/* ZLOOP_ZONE_* bits */
	struct mutex		lock;		/* serializes zone management operations */
	spinlock_t		wp_lock;	/* protects cond and wp updates */
	enum blk_zone_cond	cond;		/* current zone condition */
	sector_t		start;		/* first sector of the zone */
	sector_t		wp;		/* write pointer; ULLONG_MAX when full */
	gfp_t			old_gfp_mask;	/* saved mapping gfp mask — presumably restored on teardown; confirm */
};
/*
 * Per-device state. Conventional zones (if any) occupy zone numbers
 * [0, nr_conv_zones) and sequential zones [nr_conv_zones, nr_zones).
 */
struct zloop_device {
	unsigned int		id;		/* device id (idr index) */
	unsigned int		state;		/* Zlo_* device state */
	struct blk_mq_tag_set	tag_set;
	struct gendisk		*disk;
	struct workqueue_struct	*workqueue;	/* executes deferred commands */
	bool			buffered_io;	/* buffered instead of direct I/O */
	bool			zone_append;
	bool			ordered_zone_append;
	const char		*base_dir;	/* backing directory path */
	struct file		*data_dir;	/* open handle on the device's data directory */
	unsigned int		zone_shift;	/* log2 of zone size: sector >> zone_shift == zone number */
	sector_t		zone_size;	/* zone size, in 512B sectors */
	sector_t		zone_capacity;	/* usable zone capacity, in 512B sectors */
	unsigned int		nr_zones;	/* total number of zones */
	unsigned int		nr_conv_zones;	/* leading conventional zones */
	unsigned int		block_size;	/* logical block size, in bytes */
	struct zloop_zone	zones[] __counted_by(nr_zones);
};
/*
 * Per-request command context — presumably embedded in the blk-mq request
 * PDU; confirm against the tag_set setup outside this chunk.
 */
struct zloop_cmd {
	struct work_struct	work;	/* deferred execution on zlo->workqueue */
	atomic_t		ref;	/* completion refcount */
	sector_t		sector;	/* start sector of the request */
	sector_t		nr_sectors;
	long			ret;	/* I/O result */
	struct kiocb		iocb;	/* kiocb for async file I/O */
	struct bio_vec		*bvec;	/* request data segments */
};
/* Map of device ids to live zloop devices. */
static DEFINE_IDR(zloop_index_idr);
/* Serializes control-plane (add/remove) operations. */
static DEFINE_MUTEX(zloop_ctl_mutex);
static unsigned int rq_zone_no(struct request *rq)
{
struct zloop_device *zlo = rq->q->queuedata;
return blk_rq_pos(rq) >> zlo->zone_shift;
}
/*
 * Re-derive a sequential zone's condition and write pointer from the size
 * of its backing file. Used to recover a consistent zone state after an
 * error (ZLOOP_ZONE_SEQ_ERROR). Caller must hold the zone mutex.
 *
 * Returns 0 on success or a negative error code. On stat failure the
 * SEQ_ERROR flag is set again so a later operation retries the recovery.
 */
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	struct kstat stat;
	sector_t file_sectors;
	unsigned long flags;
	int ret;

	lockdep_assert_held(&zone->lock);

	ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
	if (ret < 0) {
		pr_err("Failed to get zone %u file stat (err=%d)\n",
		       zone_no, ret);
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		return ret;
	}

	/* The backing file size encodes the write pointer position. */
	file_sectors = stat.size >> SECTOR_SHIFT;
	if (file_sectors > zlo->zone_capacity) {
		pr_err("Zone %u file too large (%llu sectors > %llu)\n",
		       zone_no, file_sectors, zlo->zone_capacity);
		return -EINVAL;
	}

	if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
		pr_err("Zone %u file size not aligned to block size %u\n",
		       zone_no, zlo->block_size);
		return -EINVAL;
	}

	/* Publish the recovered condition and wp atomically vs. readers. */
	spin_lock_irqsave(&zone->wp_lock, flags);
	if (!file_sectors) {
		zone->cond = BLK_ZONE_COND_EMPTY;
		zone->wp = zone->start;
	} else if (file_sectors == zlo->zone_capacity) {
		zone->cond = BLK_ZONE_COND_FULL;
		zone->wp = ULLONG_MAX;	/* full zones have no valid wp */
	} else {
		zone->cond = BLK_ZONE_COND_CLOSED;
		zone->wp = zone->start + file_sectors;
	}
	spin_unlock_irqrestore(&zone->wp_lock, flags);

	return 0;
}
/*
 * Explicitly open a sequential zone (REQ_OP_ZONE_OPEN).
 * Opening an already explicitly-open zone is a no-op; opening a full
 * zone fails with -EIO, as does any operation on a conventional zone.
 */
static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones have no open/close state machine. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the zone state from the backing file after an error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	if (zone->cond == BLK_ZONE_COND_EMPTY ||
	    zone->cond == BLK_ZONE_COND_CLOSED ||
	    zone->cond == BLK_ZONE_COND_IMP_OPEN) {
		zone->cond = BLK_ZONE_COND_EXP_OPEN;
	} else if (zone->cond != BLK_ZONE_COND_EXP_OPEN) {
		/* Full or unexpected condition: the open is invalid. */
		ret = -EIO;
	}

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
/*
 * Close a sequential zone (REQ_OP_ZONE_CLOSE). Closing an open zone whose
 * write pointer is still at the zone start transitions it back to empty;
 * otherwise it becomes closed. Closing an already closed zone is a no-op;
 * closing an empty or full zone fails, as does any operation on a
 * conventional zone.
 */
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the zone state from the backing file after an error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		/* Read wp and update cond under wp_lock so readers see a
		 * consistent (cond, wp) pair.
		 */
		spin_lock_irqsave(&zone->wp_lock, flags);
		if (zone->wp == zone->start)
			zone->cond = BLK_ZONE_COND_EMPTY;
		else
			zone->cond = BLK_ZONE_COND_CLOSED;
		spin_unlock_irqrestore(&zone->wp_lock, flags);
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		ret = -EIO;
		break;
	}

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
/*
 * Reset a sequential zone (REQ_OP_ZONE_RESET): truncate the backing file
 * to zero and mark the zone empty with the write pointer at the zone
 * start. Resetting an already-empty, error-free zone is a no-op.
 * Conventional zones cannot be reset.
 */
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/*
	 * An errored zone is reset unconditionally: the truncate below both
	 * clears the data and brings the zone back to a known state.
	 */
	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_EMPTY)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, 0)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	spin_lock_irqsave(&zone->wp_lock, flags);
	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;
	/* The truncate succeeded, so the zone state is consistent again. */
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock_irqrestore(&zone->wp_lock, flags);

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
/*
 * Reset every sequential zone (REQ_OP_ZONE_RESET_ALL). Sequential zones
 * follow the conventional zones, so start at nr_conv_zones. Stops and
 * returns the error of the first zone that fails to reset.
 */
static int zloop_reset_all_zones(struct zloop_device *zlo)
{
	unsigned int zone_no;

	for (zone_no = zlo->nr_conv_zones; zone_no < zlo->nr_zones; zone_no++) {
		int ret = zloop_reset_zone(zlo, zone_no);

		if (ret)
			return ret;
	}

	return 0;
}
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
unsigned long flags;
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone-&g
|