author     Gao Xiang <hsiangkao@aol.com>                    2019-08-23 05:36:59 +0800
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2019-08-24 14:20:10 +0200
commit     47e4937a4a7ca4184fd282791dfee76c6799966a (patch)
tree       fc68338c13a00ac74ac9f1a838491bd3f7649c28 /fs
parent     f401441deda68326852560bf70d59e95f585bbb3 (diff)
download   linux-47e4937a4a7ca4184fd282791dfee76c6799966a.tar.gz
           linux-47e4937a4a7ca4184fd282791dfee76c6799966a.tar.bz2
           linux-47e4937a4a7ca4184fd282791dfee76c6799966a.zip
erofs: move erofs out of staging
EROFS filesystem has been merged into linux-staging for a year.

EROFS is designed as a better solution for saving storage space with
guaranteed end-to-end performance for read-only files, with the help of
reduced metadata, fixed-sized output compression, and decompression
in-place technologies.

In the past year, EROFS was greatly improved by many people as a
staging driver, self-tested, beta-tested by a large number of our
internal users, successfully applied to almost all in-service HUAWEI
smartphones as part of EMUI 9.1, and proven to be stable enough to be
moved out of staging.

EROFS is a self-contained filesystem driver. Although there are still
some TODOs to make it more generic, we have a dedicated team actively
working on EROFS to keep improving it as the Linux kernel evolves, like
the other in-kernel filesystems.

As Pavel suggested, it's better to do this as one commit, since git can
track moves and all history will be preserved this way.

Let's promote it from staging and enhance it more actively as a "real"
part of the kernel for wider scenarios!

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Pavel Machek <pavel@denx.de>
Cc: David Sterba <dsterba@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Richard Weinberger <richard@nod.at>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Li Guifu <bluce.liguifu@huawei.com>
Cc: Fang Wei <fangwei1@huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
Link: https://lore.kernel.org/r/20190822213659.5501-1-hsiangkao@aol.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
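[Editor's note] The "fixed-sized output compression" mentioned above packs a variable amount of input into fixed-sized compressed blocks, instead of compressing fixed-sized input into variable-sized output. A minimal userspace sketch of that idea, assuming liblz4's LZ4_compress_destSize() is available; this is an illustration only, not part of the patch, and pack_one_block() is a hypothetical helper:

#include <lz4.h>

/*
 * Fill one fixed-sized output block (e.g. one 4KiB EROFS block) with as
 * much compressed input as fits, reporting how many input bytes were
 * consumed.  Names here are hypothetical; erofs-utils does the real
 * work when building images.
 */
static int pack_one_block(const char *src, int srclen,
                          char *dst, int blocksize, int *consumed)
{
        int insize = srclen;
        int outsize;

        /* compresses as much of 'src' as fits into 'blocksize' bytes
         * and updates 'insize' to the number of bytes consumed */
        outsize = LZ4_compress_destSize(src, dst, &insize, blocksize);
        if (outsize <= 0)
                return -1;

        *consumed = insize;
        return outsize; /* <= blocksize by construction */
}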
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig              |    1
-rw-r--r--  fs/Makefile             |    1
-rw-r--r--  fs/erofs/Kconfig        |   98
-rw-r--r--  fs/erofs/Makefile       |   11
-rw-r--r--  fs/erofs/compress.h     |   60
-rw-r--r--  fs/erofs/data.c         |  423
-rw-r--r--  fs/erofs/decompressor.c |  358
-rw-r--r--  fs/erofs/dir.c          |  139
-rw-r--r--  fs/erofs/erofs_fs.h     |  307
-rw-r--r--  fs/erofs/inode.c        |  332
-rw-r--r--  fs/erofs/internal.h     |  553
-rw-r--r--  fs/erofs/namei.c        |  251
-rw-r--r--  fs/erofs/super.c        |  669
-rw-r--r--  fs/erofs/tagptr.h       |  110
-rw-r--r--  fs/erofs/utils.c        |  333
-rw-r--r--  fs/erofs/xattr.c        |  703
-rw-r--r--  fs/erofs/xattr.h        |   92
-rw-r--r--  fs/erofs/zdata.c        | 1432
-rw-r--r--  fs/erofs/zdata.h        |  193
-rw-r--r--  fs/erofs/zmap.c         |  466
-rw-r--r--  fs/erofs/zpvec.h        |  157
21 files changed, 6689 insertions, 0 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index bfb1c6095c7a..669d46550e6d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -261,6 +261,7 @@ source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
+source "fs/erofs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index d60089fd689b..b2e4973a0bea 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -130,3 +130,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
+obj-$(CONFIG_EROFS_FS) += erofs/
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
new file mode 100644
index 000000000000..16316d1adca3
--- /dev/null
+++ b/fs/erofs/Kconfig
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config EROFS_FS
+ tristate "EROFS filesystem support"
+ depends on BLOCK
+ help
+ EROFS (Enhanced Read-Only File System) is a lightweight
+ read-only file system with modern designs (e.g. page-sized
+ blocks, inline xattrs/data, etc.) for scenarios that require
+ high-performance read-only solutions, e.g. Android OS
+ for mobile phones and live CDs.
+
+ It also provides fixed-sized output compression support,
+ which improves storage density and maintains relatively high
+ compression ratios, and is especially useful for achieving
+ high performance on embedded devices with limited memory.
+
+ If unsure, say N.
+
+config EROFS_FS_DEBUG
+ bool "EROFS debugging feature"
+ depends on EROFS_FS
+ help
+ Print debugging messages and enable more BUG_ONs which check
+ filesystem consistency and find potential issues aggressively;
+ this can be used for Android eng builds, for example.
+
+ For daily use, say N.
+
+config EROFS_FAULT_INJECTION
+ bool "EROFS fault injection facility"
+ depends on EROFS_FS
+ help
+ Test EROFS's behavior by injecting faults such as ENOMEM, EIO, and so on.
+ If unsure, say N.
+
+config EROFS_FS_XATTR
+ bool "EROFS extended attributes"
+ depends on EROFS_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config EROFS_FS_POSIX_ACL
+ bool "EROFS Access Control Lists"
+ depends on EROFS_FS_XATTR
+ select FS_POSIX_ACL
+ default y
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EROFS_FS_SECURITY
+ bool "EROFS Security Labels"
+ depends on EROFS_FS_XATTR
+ default y
+ help
+ Security labels provide an access control facility to support Linux
+ Security Modules (LSMs) such as AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the erofs filesystem, so it requires extended
+ attribute support to be enabled in advance.
+
+ If you are not using a security module, say N.
+
+config EROFS_FS_ZIP
+ bool "EROFS Data Compression Support"
+ depends on EROFS_FS
+ select LZ4_DECOMPRESS
+ default y
+ help
+ Enable fixed-sized output compression for EROFS.
+
+ If you don't want to enable compression feature, say N.
+
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+ int "EROFS Cluster Pages Hard Limit"
+ depends on EROFS_FS_ZIP
+ range 1 256
+ default "1"
+ help
+ Indicates the maximum number of pages of a compressed
+ physical cluster.
+
+ For example, if files in an image were compressed into
+ 8k units, the hard limit should not be set to less
+ than 2. Otherwise, the kernel will refuse to mount
+ the image.
+
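[Editor's note] The cluster-page-limit option above boils down to simple arithmetic: the configured limit must cover the largest compressed physical cluster in the image, rounded up to whole pages. A tiny standalone sketch of that relation (illustration only, not part of the patch; names are hypothetical):

#define EX_PAGE_SIZE 4096u      /* assume a 4KiB-page kernel */

/* minimum EROFS_FS_CLUSTER_PAGE_LIMIT required for a given
 * compressed physical cluster size, rounded up to whole pages */
static unsigned int min_cluster_page_limit(unsigned int pclustersize)
{
        return (pclustersize + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE;
}

/* e.g. min_cluster_page_limit(8192) == 2, matching the 8k example above */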
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
new file mode 100644
index 000000000000..46f2aa4ba46c
--- /dev/null
+++ b/fs/erofs/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+EROFS_VERSION = "1.0"
+
+ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+
+obj-$(CONFIG_EROFS_FS) += erofs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
new file mode 100644
index 000000000000..07d279fd5d67
--- /dev/null
+++ b/fs/erofs/compress.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_COMPRESS_H
+#define __EROFS_FS_COMPRESS_H
+
+#include "internal.h"
+
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
+
+struct z_erofs_decompress_req {
+ struct super_block *sb;
+ struct page **in, **out;
+
+ unsigned short pageofs_out;
+ unsigned int inputsize, outputsize;
+
+ /* indicate the algorithm will be used for decompression */
+ unsigned int alg;
+ bool inplace_io, partial_decoding;
+};
+
+/*
+ * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
+ * used to mark temporarily allocated pages and to distinguish
+ * them from other file/cached pages and NULL-mapping pages.
+ */
+#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D)
+
+/* check if a page is marked as staging */
+static inline bool z_erofs_page_is_staging(struct page *page)
+{
+ return page->mapping == Z_EROFS_MAPPING_STAGING;
+}
+
+static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
+ struct page *page)
+{
+ if (!z_erofs_page_is_staging(page))
+ return false;
+
+ /* staging pages should not be used by others at the same time */
+ if (page_ref_count(page) > 1)
+ put_page(page);
+ else
+ list_add(&page->lru, pagepool);
+ return true;
+}
+
+int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool);
+
+#endif
+
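[Editor's note] The request structure declared above is the interface between the compressed-data frontend and the decompressors. A simplified, hypothetical sketch of how a caller could fill such a request for one compressed page expanded into a few output pages; this is not the actual zdata.c call site, just an illustration of the fields:

/* hypothetical caller; error handling and page management omitted */
static int decompress_one_pcluster(struct super_block *sb,
                                   struct page *in_page,
                                   struct page **out_pages,
                                   unsigned int outputsize,
                                   struct list_head *pagepool)
{
        struct z_erofs_decompress_req rq = {
                .sb = sb,
                .in = &in_page,
                .out = out_pages,
                .pageofs_out = 0,
                .inputsize = PAGE_SIZE,
                .outputsize = outputsize,
                .alg = Z_EROFS_COMPRESSION_LZ4,
                .inplace_io = false,
                .partial_decoding = false,
        };

        return z_erofs_decompress(&rq, pagepool);
}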
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
new file mode 100644
index 000000000000..fda16ec8863e
--- /dev/null
+++ b/fs/erofs/data.c
@@ -0,0 +1,423 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <linux/prefetch.h>
+
+#include <trace/events/erofs.h>
+
+static inline void read_endio(struct bio *bio)
+{
+ struct super_block *const sb = bio->bi_private;
+ struct bio_vec *bvec;
+ blk_status_t err = bio->bi_status;
+ struct bvec_iter_all iter_all;
+
+ if (time_to_inject(EROFS_SB(sb), FAULT_READ_IO)) {
+ erofs_show_injection_info(FAULT_READ_IO);
+ err = BLK_STS_IOERR;
+ }
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ /* page is already locked */
+ DBG_BUGON(PageUptodate(page));
+
+ if (unlikely(err))
+ SetPageError(page);
+ else
+ SetPageUptodate(page);
+
+ unlock_page(page);
+ /* page could be reclaimed now */
+ }
+ bio_put(bio);
+}
+
+/* prio -- true is used for dir */
+struct page *__erofs_get_meta_page(struct super_block *sb,
+ erofs_blk_t blkaddr, bool prio, bool nofail)
+{
+ struct inode *const bd_inode = sb->s_bdev->bd_inode;
+ struct address_space *const mapping = bd_inode->i_mapping;
+ /* prefer retrying in the allocator to blindly looping below */
+ const gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_FS) |
+ (nofail ? __GFP_NOFAIL : 0);
+ unsigned int io_retries = nofail ? EROFS_IO_MAX_RETRIES_NOFAIL : 0;
+ struct page *page;
+ int err;
+
+repeat:
+ page = find_or_create_page(mapping, blkaddr, gfp);
+ if (unlikely(!page)) {
+ DBG_BUGON(nofail);
+ return ERR_PTR(-ENOMEM);
+ }
+ DBG_BUGON(!PageLocked(page));
+
+ if (!PageUptodate(page)) {
+ struct bio *bio;
+
+ bio = erofs_grab_bio(sb, blkaddr, 1, sb, read_endio, nofail);
+ if (IS_ERR(bio)) {
+ DBG_BUGON(nofail);
+ err = PTR_ERR(bio);
+ goto err_out;
+ }
+
+ err = bio_add_page(bio, page, PAGE_SIZE, 0);
+ if (unlikely(err != PAGE_SIZE)) {
+ err = -EFAULT;
+ goto err_out;
+ }
+
+ __submit_bio(bio, REQ_OP_READ,
+ REQ_META | (prio ? REQ_PRIO : 0));
+
+ lock_page(page);
+
+ /* this page has been truncated by others */
+ if (unlikely(page->mapping != mapping)) {
+unlock_repeat:
+ unlock_page(page);
+ put_page(page);
+ goto repeat;
+ }
+
+ /* more likely a read error */
+ if (unlikely(!PageUptodate(page))) {
+ if (io_retries) {
+ --io_retries;
+ goto unlock_repeat;
+ }
+ err = -EIO;
+ goto err_out;
+ }
+ }
+ return page;
+
+err_out:
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(err);
+}
+
+static int erofs_map_blocks_flatmode(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags)
+{
+ int err = 0;
+ erofs_blk_t nblocks, lastblk;
+ u64 offset = map->m_la;
+ struct erofs_vnode *vi = EROFS_V(inode);
+
+ trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
+
+ nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+ lastblk = nblocks - is_inode_flat_inline(inode);
+
+ if (unlikely(offset >= inode->i_size)) {
+ /* leave out-of-bound access unmapped */
+ map->m_flags = 0;
+ map->m_plen = 0;
+ goto out;
+ }
+
+ /* there is no hole in flatmode */
+ map->m_flags = EROFS_MAP_MAPPED;
+
+ if (offset < blknr_to_addr(lastblk)) {
+ map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
+ map->m_plen = blknr_to_addr(lastblk) - offset;
+ } else if (is_inode_flat_inline(inode)) {
+ /* 2 - inode inline B: inode, [xattrs], inline last blk... */
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+
+ map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize +
+ vi->xattr_isize + erofs_blkoff(map->m_la);
+ map->m_plen = inode->i_size - offset;
+
+ /* inline data should be located in one meta block */
+ if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
+ errln("inline data cross block boundary @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+
+ map->m_flags |= EROFS_MAP_META;
+ } else {
+ errln("internal error @ nid: %llu (size %llu), m_la 0x%llx",
+ vi->nid, inode->i_size, map->m_la);
+ DBG_BUGON(1);
+ err = -EIO;
+ goto err_out;
+ }
+
+out:
+ map->m_llen = map->m_plen;
+
+err_out:
+ trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
+ return err;
+}
+
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
+{
+ if (unlikely(is_inode_layout_compression(inode))) {
+ int err = z_erofs_map_blocks_iter(inode, map, flags);
+
+ if (map->mpage) {
+ put_page(map->mpage);
+ map->mpage = NULL;
+ }
+ return err;
+ }
+ return erofs_map_blocks_flatmode(inode, map, flags);
+}
+
+static inline struct bio *erofs_read_raw_page(struct bio *bio,
+ struct address_space *mapping,
+ struct page *page,
+ erofs_off_t *last_block,
+ unsigned int nblocks,
+ bool ra)
+{
+ struct inode *const inode = mapping->host;
+ struct super_block *const sb = inode->i_sb;
+ erofs_off_t current_block = (erofs_off_t)page->index;
+ int err;
+
+ DBG_BUGON(!nblocks);
+
+ if (PageUptodate(page)) {
+ err = 0;
+ goto has_updated;
+ }
+
+ /* note that for the readpage case, bio also equals NULL */
+ if (bio &&
+ /* not continuous */
+ *last_block + 1 != current_block) {
+submit_bio_retry:
+ __submit_bio(bio, REQ_OP_READ, 0);
+ bio = NULL;
+ }
+
+ if (!bio) {
+ struct erofs_map_blocks map = {
+ .m_la = blknr_to_addr(current_block),
+ };
+ erofs_blk_t blknr;
+ unsigned int blkoff;
+
+ err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (unlikely(err))
+ goto err_out;
+
+ /* zero out the holed page */
+ if (unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+
+ /* imply err = 0, see erofs_map_blocks */
+ goto has_updated;
+ }
+
+ /* for RAW access mode, m_plen must be equal to m_llen */
+ DBG_BUGON(map.m_plen != map.m_llen);
+
+ blknr = erofs_blknr(map.m_pa);
+ blkoff = erofs_blkoff(map.m_pa);
+
+ /* deal with inline page */
+ if (map.m_flags & EROFS_MAP_META) {
+ void *vsrc, *vto;
+ struct page *ipage;
+
+ DBG_BUGON(map.m_plen > PAGE_SIZE);
+
+ ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
+
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto err_out;
+ }
+
+ vsrc = kmap_atomic(ipage);
+ vto = kmap_atomic(page);
+ memcpy(vto, vsrc + blkoff, map.m_plen);
+ memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
+ kunmap_atomic(vto);
+ kunmap_atomic(vsrc);
+ flush_dcache_page(page);
+
+ SetPageUptodate(page);
+ /* TODO: could we unlock the page earlier? */
+ unlock_page(ipage);
+ put_page(ipage);
+
+ /* imply err = 0, see erofs_map_blocks */
+ goto has_updated;
+ }
+
+ /* pa must be block-aligned for raw reading */
+ DBG_BUGON(erofs_blkoff(map.m_pa));
+
+ /* max # of continuous pages */
+ if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
+ nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
+ if (nblocks > BIO_MAX_PAGES)
+ nblocks = BIO_MAX_PAGES;
+
+ bio = erofs_grab_bio(sb, blknr, nblocks, sb,
+ read_endio, false);
+ if (IS_ERR(bio)) {
+ err = PTR_ERR(bio);
+ bio = NULL;
+ goto err_out;
+ }
+ }
+
+ err = bio_add_page(bio, page, PAGE_SIZE, 0);
+ /* out of the extent or bio is full */
+ if (err < PAGE_SIZE)
+ goto submit_bio_retry;
+
+ *last_block = current_block;
+
+ /* shift in advance in case of it followed by too many gaps */
+ if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) {
+ /* err should reassign to 0 after submitting */
+ err = 0;
+ goto submit_bio_out;
+ }
+
+ return bio;
+
+err_out:
+ /* for sync reading, set page error immediately */
+ if (!ra) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ }
+has_updated:
+ unlock_page(page);
+
+ /* if updated manually, continuous pages have a gap */
+ if (bio)
+submit_bio_out:
+ __submit_bio(bio, REQ_OP_READ, 0);
+
+ return unlikely(err) ? ERR_PTR(err) : NULL;
+}
+
+/*
+ * since we don't have write or truncate flows, no inode
+ * locking needs to be held at the moment.
+ */
+static int erofs_raw_access_readpage(struct file *file, struct page *page)
+{
+ erofs_off_t last_block;
+ struct bio *bio;
+
+ trace_erofs_readpage(page, true);
+
+ bio = erofs_read_raw_page(NULL, page->mapping,
+ page, &last_block, 1, false);
+
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ DBG_BUGON(bio); /* since we have only one bio -- must be NULL */
+ return 0;
+}
+
+static int erofs_raw_access_readpages(struct file *filp,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned int nr_pages)
+{
+ erofs_off_t last_block;
+ struct bio *bio = NULL;
+ gfp_t gfp = readahead_gfp_mask(mapping);
+ struct page *page = list_last_entry(pages, struct page, lru);
+
+ trace_erofs_readpages(mapping->host, page, nr_pages, true);
+
+ for (; nr_pages; --nr_pages) {
+ page = list_entry(pages->prev, struct page, lru);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+
+ if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+ bio = erofs_read_raw_page(bio, mapping, page,
+ &last_block, nr_pages, true);
+
+ /* all the page errors are ignored when readahead */
+ if (IS_ERR(bio)) {
+ pr_err("%s, readahead error at page %lu of nid %llu\n",
+ __func__, page->index,
+ EROFS_V(mapping->host)->nid);
+
+ bio = NULL;
+ }
+ }
+
+ /* pages could still be locked */
+ put_page(page);
+ }
+ DBG_BUGON(!list_empty(pages));
+
+ /* the rare case (end in gaps) */
+ if (unlikely(bio))
+ __submit_bio(bio, REQ_OP_READ, 0);
+ return 0;
+}
+
+static int erofs_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ struct erofs_map_blocks map = {
+ .m_la = iblock << 9,
+ };
+ int err;
+
+ err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (err)
+ return err;
+
+ if (map.m_flags & EROFS_MAP_MAPPED)
+ bh->b_blocknr = erofs_blknr(map.m_pa);
+
+ return err;
+}
+
+static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+{
+ struct inode *inode = mapping->host;
+
+ if (is_inode_flat_inline(inode)) {
+ erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
+
+ if (block >> LOG_SECTORS_PER_BLOCK >= blks)
+ return 0;
+ }
+
+ return generic_block_bmap(mapping, block, erofs_get_block);
+}
+
+/* for uncompressed (aligned) files and raw access for other files */
+const struct address_space_operations erofs_raw_access_aops = {
+ .readpage = erofs_raw_access_readpage,
+ .readpages = erofs_raw_access_readpages,
+ .bmap = erofs_bmap,
+};
+
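[Editor's note] The .bmap address-space operation registered above backs the legacy FIBMAP ioctl. A small userspace sketch that asks for the physical block behind logical block 0 of a file on a mounted EROFS image (illustration only, not part of the patch; FIBMAP typically requires CAP_SYS_RAWIO):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>   /* FIBMAP */
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd, block = 0;      /* logical block 0, in filesystem block units */

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;
        /* the kernel routes this through the filesystem's .bmap hook,
         * i.e. erofs_bmap() shown above */
        if (ioctl(fd, FIBMAP, &block) < 0) {
                perror("FIBMAP");
                close(fd);
                return 1;
        }
        printf("physical block: %d\n", block);
        close(fd);
        return 0;
}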
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
new file mode 100644
index 000000000000..5f4b7f302863
--- /dev/null
+++ b/fs/erofs/decompressor.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "compress.h"
+#include <linux/module.h>
+#include <linux/lz4.h>
+
+#ifndef LZ4_DISTANCE_MAX /* history window size */
+#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
+#endif
+
+#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
+#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
+#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
+#endif
+
+struct z_erofs_decompressor {
+ /*
+ * if destpages contain missing (sparse) pages, fill them with bounce pages.
+ * it also checks whether destpages form continuous physical memory.
+ */
+ int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool);
+ int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
+ char *name;
+};
+
+static bool use_vmap;
+module_param(use_vmap, bool, 0444);
+MODULE_PARM_DESC(use_vmap, "Use vmap() instead of vm_map_ram() (default 0)");
+
+static int lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+{
+ const unsigned int nr =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
+ unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
+ BITS_PER_LONG)] = { 0 };
+ void *kaddr = NULL;
+ unsigned int i, j, top;
+
+ top = 0;
+ for (i = j = 0; i < nr; ++i, ++j) {
+ struct page *const page = rq->out[i];
+ struct page *victim;
+
+ if (j >= LZ4_MAX_DISTANCE_PAGES)
+ j = 0;
+
+ /* 'valid' bounced can only be tested after a complete round */
+ if (test_bit(j, bounced)) {
+ DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
+ DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
+ availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+ }
+
+ if (page) {
+ __clear_bit(j, bounced);
+ if (kaddr) {
+ if (kaddr + PAGE_SIZE == page_address(page))
+ kaddr += PAGE_SIZE;
+ else
+ kaddr = NULL;
+ } else if (!i) {
+ kaddr = page_address(page);
+ }
+ continue;
+ }
+ kaddr = NULL;
+ __set_bit(j, bounced);
+
+ if (top) {
+ victim = availables[--top];
+ get_page(victim);
+ } else {
+ victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
+ if (unlikely(!victim))
+ return -ENOMEM;
+ victim->mapping = Z_EROFS_MAPPING_STAGING;
+ }
+ rq->out[i] = victim;
+ }
+ return kaddr ? 1 : 0;
+}
+
+static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
+ u8 *src, unsigned int pageofs_in)
+{
+ /*
+ * if in-place decompression is ongoing, those decompressed
+ * pages should be copied in order to avoid being overlapped.
+ */
+ struct page **in = rq->in;
+ u8 *const tmp = erofs_get_pcpubuf(0);
+ u8 *tmpp = tmp;
+ unsigned int inlen = rq->inputsize - pageofs_in;
+ unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
+
+ while (tmpp < tmp + inlen) {
+ if (!src)
+ src = kmap_atomic(*in);
+ memcpy(tmpp, src + pageofs_in, count);
+ kunmap_atomic(src);
+ src = NULL;
+ tmpp += count;
+ pageofs_in = 0;
+ count = PAGE_SIZE;
+ ++in;
+ }
+ return tmp;
+}
+
+static int lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+{
+ unsigned int inputmargin, inlen;
+ u8 *src;
+ bool copied, support_0padding;
+ int ret;
+
+ if (rq->inputsize > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ src = kmap_atomic(*rq->in);
+ inputmargin = 0;
+ support_0padding = false;
+
+ /* decompression inplace is only safe when 0padding is enabled */
+ if (EROFS_SB(rq->sb)->requirements & EROFS_REQUIREMENT_LZ4_0PADDING) {
+ support_0padding = true;
+
+ while (!src[inputmargin & ~PAGE_MASK])
+ if (!(++inputmargin & ~PAGE_MASK))
+ break;
+
+ if (inputmargin >= rq->inputsize) {
+ kunmap_atomic(src);
+ return -EIO;
+ }
+ }
+
+ copied = false;
+ inlen = rq->inputsize - inputmargin;
+ if (rq->inplace_io) {
+ const uint oend = (rq->pageofs_out +
+ rq->outputsize) & ~PAGE_MASK;
+ const uint nr = PAGE_ALIGN(rq->pageofs_out +
+ rq->outputsize) >> PAGE_SHIFT;
+
+ if (rq->partial_decoding || !support_0padding ||
+ rq->out[nr - 1] != rq->in[0] ||
+ rq->inputsize - oend <
+ LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
+ src = generic_copy_inplace_data(rq, src, inputmargin);
+ inputmargin = 0;
+ copied = true;
+ }
+ }
+
+ ret = LZ4_decompress_safe_partial(src + inputmargin, out,
+ inlen, rq->outputsize,
+ rq->outputsize);
+ if (ret < 0) {
+ errln("%s, failed to decompress, in[%p, %u, %u] out[%p, %u]",
+ __func__, src + inputmargin, inlen, inputmargin,
+ out, rq->outputsize);
+ WARN_ON(1);
+ print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
+ 16, 1, src + inputmargin, inlen, true);
+ print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
+ 16, 1, out, rq->outputsize, true);
+ ret = -EIO;
+ }
+
+ if (copied)
+ erofs_put_pcpubuf(src);
+ else
+ kunmap_atomic(src);
+ return ret;
+}
+
+static struct z_erofs_decompressor decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .prepare_destpages = lz4_prepare_destpages,
+ .decompress = lz4_decompress,
+ .name = "lz4"
+ },
+};
+
+static void copy_from_pcpubuf(struct page **out, const char *dst,
+ unsigned short pageofs_out,
+ unsigned int outputsize)
+{
+ const char *end = dst + outputsize;
+ const unsigned int righthalf = PAGE_SIZE - pageofs_out;
+ const char *cur = dst - pageofs_out;
+
+ while (cur < end) {
+ struct page *const page = *out++;
+
+ if (page) {
+ char *buf = kmap_atomic(page);
+
+ if (cur >= dst) {
+ memcpy(buf, cur, min_t(uint, PAGE_SIZE,
+ end - cur));
+ } else {
+ memcpy(buf + pageofs_out, cur + pageofs_out,
+ min_t(uint, righthalf, end - cur));
+ }
+ kunmap_atomic(buf);
+ }
+ cur += PAGE_SIZE;
+ }
+}
+
+static void *erofs_vmap(struct page **pages, unsigned int count)
+{
+ int i = 0;
+
+ if (use_vmap)
+ return vmap(pages, count, VM_MAP, PAGE_KERNEL);
+
+ while (1) {
+ void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);
+
+ /* retry two more times (3 times in total) */
+ if (addr || ++i >= 3)
+ return addr;
+ vm_unmap_aliases();
+ }
+ return NULL;
+}
+
+static void erofs_vunmap(const void *mem, unsigned int count)
+{
+ if (!use_vmap)
+ vm_unmap_ram(mem, count);
+ else
+ vunmap(mem);
+}
+
+static int decompress_generic(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const struct z_erofs_decompressor *alg = decompressors + rq->alg;
+ unsigned int dst_maptype;
+ void *dst;
+ int ret;
+
+ if (nrpages_out == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_atomic(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
+ }
+
+ /*
+ * For the case of small output size (especially much less
+ * than PAGE_SIZE), it's preferable to memcpy the decompressed
+ * data rather than the compressed data.
+ */
+ if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+ dst = erofs_get_pcpubuf(0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ rq->inplace_io = false;
+ ret = alg->decompress(rq, dst);
+ if (!ret)
+ copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+ rq->outputsize);
+
+ erofs_put_pcpubuf(dst);
+ return ret;
+ }
+
+ ret = alg->prepare_destpages(rq, pagepool);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ dst = page_address(*rq->out);
+ dst_maptype = 1;
+ goto dstmap_out;
+ }
+
+ dst = erofs_vmap(rq->out, nrpages_out);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
+
+dstmap_out:
+ ret = alg->decompress(rq, dst + rq->pageofs_out);
+
+ if (!dst_maptype)
+ kunmap_atomic(dst);
+ else if (dst_maptype == 2)
+ erofs_vunmap(dst, nrpages_out);
+ return ret;
+}
+
+static int shifted_decompress(const struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
+ unsigned char *src, *dst;
+
+ if (nrpages_out > 2) {
+ DBG_BUGON(1);
+ return -EIO;
+ }
+
+ if (rq->out[0] == *rq->in) {
+ DBG_BUGON(nrpages_out != 1);
+ return 0;
+ }
+
+ src = kmap_atomic(*rq->in);
+ if (!rq->out[0]) {
+ dst = NULL;
+ } else {
+ dst = kmap_atomic(rq->out[0]);
+ memcpy(dst + rq->pageofs_out, src,