diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-21 13:37:39 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-21 13:37:39 -0700 |
commit | 70cb0d02b58128db07fc39b5e87a2873e2c16bde (patch) | |
tree | 43c0a4eb00f192ceb306b9c52503b2d54bc59660 | |
parent | 104c0d6bc43e10ba84931c45b67e2c76c9c67f68 (diff) | |
parent | 040823b5372b445d1d9483811e85a24d71314d33 (diff) | |
download | linux-70cb0d02b58128db07fc39b5e87a2873e2c16bde.tar.gz linux-70cb0d02b58128db07fc39b5e87a2873e2c16bde.tar.bz2 linux-70cb0d02b58128db07fc39b5e87a2873e2c16bde.zip |
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
"Added new ext4 debugging ioctls to allow userspace to get information
about the state of the extent status cache.
Dropped workaround for pre-1970 dates which were encoded incorrectly
in pre-4.4 kernels. Since both the kernel correctly generates, and
e2fsck detects and fixes this issue for the past four years, it'e time
to drop the workaround. (Also, it's not like files with dates in the
distant past were all that common in the first place.)
A lot of miscellaneous bug fixes and cleanups, including some ext4
Documentation fixes. Also included are two minor bug fixes in
fs/unicode"
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits)
unicode: make array 'token' static const, makes object smaller
unicode: Move static keyword to the front of declarations
ext4: add missing bigalloc documentation.
ext4: fix kernel oops caused by spurious casefold flag
ext4: fix integer overflow when calculating commit interval
ext4: use percpu_counters for extent_status cache hits/misses
ext4: fix potential use after free after remounting with noblock_validity
jbd2: add missing tracepoint for reserved handle
ext4: fix punch hole for inline_data file systems
ext4: rework reserved cluster accounting when invalidating pages
ext4: documentation fixes
ext4: treat buffers with write errors as containing valid data
ext4: fix warning inside ext4_convert_unwritten_extents_endio
ext4: set error return correctly when ext4_htree_store_dirent fails
ext4: drop legacy pre-1970 encoding workaround
ext4: add new ioctl EXT4_IOC_GET_ES_CACHE
ext4: add a new ioctl EXT4_IOC_GETSTATE
ext4: add a new ioctl EXT4_IOC_CLEAR_ES_CACHE
jbd2: flush_descriptor(): Do not decrease buffer head's ref count
ext4: remove unnecessary error check
...
-rw-r--r-- | Documentation/filesystems/ext4/bigalloc.rst | 32 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blockgroup.rst | 10 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blocks.rst | 4 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/directory.rst | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/group_descr.rst | 9 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/inodes.rst | 4 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/super.rst | 20 | ||||
-rw-r--r-- | fs/ext4/block_validity.c | 189 | ||||
-rw-r--r-- | fs/ext4/dir.c | 7 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 64 | ||||
-rw-r--r-- | fs/ext4/extents.c | 98 | ||||
-rw-r--r-- | fs/ext4/extents_status.c | 521 | ||||
-rw-r--r-- | fs/ext4/extents_status.h | 8 | ||||
-rw-r--r-- | fs/ext4/file.c | 2 | ||||
-rw-r--r-- | fs/ext4/hash.c | 2 | ||||
-rw-r--r-- | fs/ext4/inline.c | 2 | ||||
-rw-r--r-- | fs/ext4/inode.c | 103 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 98 | ||||
-rw-r--r-- | fs/ext4/namei.c | 4 | ||||
-rw-r--r-- | fs/ext4/super.c | 7 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 4 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 3 | ||||
-rw-r--r-- | fs/unicode/utf8-core.c | 2 | ||||
-rw-r--r-- | fs/unicode/utf8-selftest.c | 4 |
24 files changed, 890 insertions, 309 deletions
diff --git a/Documentation/filesystems/ext4/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst index c6d88557553c..72075aa608e4 100644 --- a/Documentation/filesystems/ext4/bigalloc.rst +++ b/Documentation/filesystems/ext4/bigalloc.rst @@ -9,14 +9,26 @@ ext4 code is not prepared to handle the case where the block size exceeds the page size. However, for a filesystem of mostly huge files, it is desirable to be able to allocate disk blocks in units of multiple blocks to reduce both fragmentation and metadata overhead. The -`bigalloc <Bigalloc>`__ feature provides exactly this ability. The -administrator can set a block cluster size at mkfs time (which is stored -in the s\_log\_cluster\_size field in the superblock); from then on, the -block bitmaps track clusters, not individual blocks. This means that -block groups can be several gigabytes in size (instead of just 128MiB); -however, the minimum allocation unit becomes a cluster, not a block, -even for directories. TaoBao had a patchset to extend the “use units of -clusters instead of blocks” to the extent tree, though it is not clear -where those patches went-- they eventually morphed into “extent tree v2” -but that code has not landed as of May 2015. +bigalloc feature provides exactly this ability. + +The bigalloc feature (EXT4_FEATURE_RO_COMPAT_BIGALLOC) changes ext4 to +use clustered allocation, so that each bit in the ext4 block allocation +bitmap addresses a power of two number of blocks. For example, if the +file system is mainly going to be storing large files in the 4-32 +megabyte range, it might make sense to set a cluster size of 1 megabyte. +This means that each bit in the block allocation bitmap now addresses +256 4k blocks. This shrinks the total size of the block allocation +bitmaps for a 2T file system from 64 megabytes to 256 kilobytes. It also +means that a block group addresses 32 gigabytes instead of 128 megabytes, +also shrinking the amount of file system overhead for metadata. + +The administrator can set a block cluster size at mkfs time (which is +stored in the s\_log\_cluster\_size field in the superblock); from then +on, the block bitmaps track clusters, not individual blocks. This means +that block groups can be several gigabytes in size (instead of just +128MiB); however, the minimum allocation unit becomes a cluster, not a +block, even for directories. TaoBao had a patchset to extend the “use +units of clusters instead of blocks” to the extent tree, though it is +not clear where those patches went-- they eventually morphed into +“extent tree v2” but that code has not landed as of May 2015. diff --git a/Documentation/filesystems/ext4/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst index baf888e4c06a..3da156633339 100644 --- a/Documentation/filesystems/ext4/blockgroup.rst +++ b/Documentation/filesystems/ext4/blockgroup.rst @@ -71,11 +71,11 @@ if the flex\_bg size is 4, then group 0 will contain (in order) the superblock, group descriptors, data block bitmaps for groups 0-3, inode bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining space in group 0 is for file data. The effect of this is to group the -block metadata close together for faster loading, and to enable large -files to be continuous on disk. Backup copies of the superblock and -group descriptors are always at the beginning of block groups, even if -flex\_bg is enabled. The number of block groups that make up a flex\_bg -is given by 2 ^ ``sb.s_log_groups_per_flex``. +block group metadata close together for faster loading, and to enable +large files to be continuous on disk. Backup copies of the superblock +and group descriptors are always at the beginning of block groups, even +if flex\_bg is enabled. The number of block groups that make up a +flex\_bg is given by 2 ^ ``sb.s_log_groups_per_flex``. Meta Block Groups ----------------- diff --git a/Documentation/filesystems/ext4/blocks.rst b/Documentation/filesystems/ext4/blocks.rst index 73d4dc0f7bda..bd722ecd92d6 100644 --- a/Documentation/filesystems/ext4/blocks.rst +++ b/Documentation/filesystems/ext4/blocks.rst @@ -10,7 +10,9 @@ block groups. Block size is specified at mkfs time and typically is 4KiB. You may experience mounting problems if block size is greater than page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory pages). By default a filesystem can contain 2^32 blocks; if the '64bit' -feature is enabled, then a filesystem can have 2^64 blocks. +feature is enabled, then a filesystem can have 2^64 blocks. The location +of structures is stored in terms of the block number the structure lives +in and not the absolute offset on disk. For 32-bit filesystems, limits are as follows: diff --git a/Documentation/filesystems/ext4/directory.rst b/Documentation/filesystems/ext4/directory.rst index 614034e24669..073940cc64ed 100644 --- a/Documentation/filesystems/ext4/directory.rst +++ b/Documentation/filesystems/ext4/directory.rst @@ -59,7 +59,7 @@ is at most 263 bytes long, though on disk you'll need to reference - File name. Since file names cannot be longer than 255 bytes, the new directory -entry format shortens the rec\_len field and uses the space for a file +entry format shortens the name\_len field and uses the space for a file type flag, probably to avoid having to load every inode during directory tree traversal. This format is ``ext4_dir_entry_2``, which is at most 263 bytes long, though on disk you'll need to reference diff --git a/Documentation/filesystems/ext4/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst index 0f783ed88592..7ba6114e7f5c 100644 --- a/Documentation/filesystems/ext4/group_descr.rst +++ b/Documentation/filesystems/ext4/group_descr.rst @@ -99,9 +99,12 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. * - 0x1E - \_\_le16 - bg\_checksum - - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the - RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) & - 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. + - Group descriptor checksum; crc16(sb\_uuid+group\_num+bg\_desc) if the + RO\_COMPAT\_GDT\_CSUM feature is set, or + crc32c(sb\_uuid+group\_num+bg\_desc) & 0xFFFF if the + RO\_COMPAT\_METADATA\_CSUM feature is set. The bg\_checksum + field in bg\_desc is skipped when calculating crc16 checksum, + and set to zero if crc32c checksum is used. * - - - diff --git a/Documentation/filesystems/ext4/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index e851e6ca31fa..a65baffb4ebf 100644 --- a/Documentation/filesystems/ext4/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst @@ -472,8 +472,8 @@ inode, which allows struct ext4\_inode to grow for a new kernel without having to upgrade all of the on-disk inodes. Access to fields beyond EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within ``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as -of October 2013) the inode structure is 156 bytes -(``i_extra_isize = 28``). The extra space between the end of the inode +of August 2019) the inode structure is 160 bytes +(``i_extra_isize = 32``). The extra space between the end of the inode structure and the end of the inode record can be used to store extended attributes. Each inode record can be as large as the filesystem block size, though this is not terribly efficient. diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst index 6eae92054827..93e55d7c1d40 100644 --- a/Documentation/filesystems/ext4/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -58,7 +58,7 @@ The ext4 superblock is laid out as follows in * - 0x1C - \_\_le32 - s\_log\_cluster\_size - - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is + - Cluster size is 2 ^ (10 + s\_log\_cluster\_size) blocks if bigalloc is enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. * - 0x20 - \_\_le32 @@ -447,7 +447,7 @@ The ext4 superblock is laid out as follows in - Upper 8 bits of the s_wtime field. * - 0x275 - \_\_u8 - - s\_wtime_hi + - s\_mtime_hi - Upper 8 bits of the s_mtime field. * - 0x276 - \_\_u8 @@ -466,12 +466,20 @@ The ext4 superblock is laid out as follows in - s\_last_error_time_hi - Upper 8 bits of the s_last_error_time_hi field. * - 0x27A - - \_\_u8[2] - - s\_pad + - \_\_u8 + - s\_pad[2] - Zero padding. * - 0x27C + - \_\_le16 + - s\_encoding + - Filename charset encoding. + * - 0x27E + - \_\_le16 + - s\_encoding_flags + - Filename charset encoding flags. + * - 0x280 - \_\_le32 - - s\_reserved[96] + - s\_reserved[95] - Padding to the end of the block. * - 0x3FC - \_\_le32 @@ -617,7 +625,7 @@ following: * - 0x80 - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). * - 0x100 - - Multiple mount protection. Not implemented (INCOMPAT\_MMP). + - Multiple mount protection (INCOMPAT\_MMP). * - 0x200 - Flexible block groups. See the earlier discussion of this feature (INCOMPAT\_FLEX\_BG). diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8e83741b02e0..d4d4fdfac1a6 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void) void ext4_exit_system_zone(void) { + rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } @@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1, return 0; } +static void release_system_zone(struct ext4_system_blocks *system_blks) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &system_blks->root, node) + kmem_cache_free(ext4_system_zone_cachep, entry); +} + /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ -static int add_system_zone(struct ext4_sb_info *sbi, +static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_system_zone *new_entry = NULL, *entry; - struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; while (*n) { @@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, new_node = &new_entry->node; rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &sbi->system_blks); + rb_insert_color(new_node, &system_blks->root); } /* Can we merge to the left? */ @@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -126,7 +136,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks); + node = rb_first(&sbi->system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", @@ -137,7 +147,47 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_CONT "\n"); } -static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, + struct ext4_system_blocks *system_blks, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + + if (system_blks == NULL) + return 1; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + +static int ext4_protect_reserved_inode(struct super_block *sb, + struct ext4_system_blocks *system_blks, + u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -163,14 +213,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) if (n == 0) { i++; } else { - if (!ext4_data_block_valid(sbi, map.m_pblk, n)) { + if (!ext4_data_block_valid_rcu(sbi, system_blks, + map.m_pblk, n)) { ext4_error(sb, "blocks %llu-%llu from inode %u " "overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1, ino); err = -EFSCORRUPTED; break; } - err = add_system_zone(sbi, map.m_pblk, n); + err = add_system_zone(system_blks, map.m_pblk, n); if (err < 0) break; i += n; @@ -180,94 +231,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) return err; } +static void ext4_destroy_system_zone(struct rcu_head *rcu) +{ + struct ext4_system_blocks *system_blks; + + system_blks = container_of(rcu, struct ext4_system_blocks, rcu); + release_system_zone(system_blks); + kfree(system_blks); +} + +/* + * Build system zone rbtree which is used for block validity checking. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. That's why we first build the rbtree and then + * swap it in place. + */ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int flex_size = ext4_flex_bg_size(sbi); int ret; if (!test_opt(sb, BLOCK_VALIDITY)) { - if (sbi->system_blks.rb_node) + if (sbi->system_blks) ext4_release_system_zone(sb); return 0; } - if (sbi->system_blks.rb_node) + if (sbi->system_blks) return 0; + system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); + if (!system_blks) + return -ENOMEM; + for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(sbi, ext4_group_first_block_no(sb, i), + add_system_zone(system_blks, + ext4_group_first_block_no(sb, i), ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); - ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + ret = add_system_zone(system_blks, + ext4_block_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + goto err; + ret = add_system_zone(system_blks, + ext4_inode_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + goto err; + ret = add_system_zone(system_blks, + ext4_inode_table(sb, gdp), sbi->s_itb_per_group); if (ret) - return ret; + goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { - ret = ext4_protect_reserved_inode(sb, + ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) - return ret; + goto err; } + /* + * System blks rbtree complete, announce it once to prevent racing + * with ext4_data_block_valid() accessing the rbtree at the same + * time. + */ + rcu_assign_pointer(sbi->system_blks, system_blks); + if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; +err: + release_system_zone(system_blks); + kfree(system_blks); + return ret; } -/* Called when the filesystem is unmounted */ +/* + * Called when the filesystem is unmounted or when remounting it with + * noblock_validity specified. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. So we first clear the system_blks pointer and + * then free the rbtree only after RCU grace period expires. + */ void ext4_release_system_zone(struct super_block *sb) { - struct ext4_system_zone *entry, *n; + struct ext4_system_blocks *system_blks; - rbtree_postorder_for_each_entry_safe(entry, n, - &EXT4_SB(sb)->system_blks, node) - kmem_cache_free(ext4_system_zone_cachep, entry); + system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks, + lockdep_is_held(&sb->s_umount)); + rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL); - EXT4_SB(sb)->system_blks = RB_ROOT; + if (system_blks) + call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } -/* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. - */ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count) { - struct ext4_system_zone *entry; - struct rb_node *n = sbi->system_blks.rb_node; + struct ext4_system_blocks *system_blks; + int ret; - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - } - return 1; + /* + * Lock the system zone to prevent it being released concurrently + * when doing a remount which inverse current "[no]block_validity" + * mount option. + */ + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, + count); + rcu_read_unlock(); + return ret; } int ext4_check_blockref(const char *function, unsigned int line, diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 86054f31fe4d..9fdd2b269d61 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -668,14 +668,15 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; + struct inode *inode = dentry->d_parent->d_inode; - if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { + if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { if (len != name->len) return -1; return memcmp(str, name->name, len); } - return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); + return ext4_ci_compare(inode, name, &qstr, false); } static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) @@ -685,7 +686,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode)) + if (!IS_CASEFOLDED(dentry->d_inode) || !um) return 0; norm = kmalloc(PATH_MAX, GFP_ATOMIC); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 42c6e4a5e673..03db3e71676c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -186,6 +186,14 @@ struct ext4_map_blocks { }; /* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 @@ -285,6 +293,9 @@ struct ext4_io_submit { ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) @@ -653,6 +664,10 @@ enum { #define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -666,6 +681,16 @@ enum { #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -683,6 +708,12 @@ enum { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -812,21 +843,8 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time) static inline void ext4_decode_extra_time(struct timespec64 *time, __le32 extra) { - if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { - -#if 1 - /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. (This backwards compatibility may be removed - * at the discretion of the ext4 developers.) - */ - u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; - if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) - extra_bits = 0; - time->tv_sec += extra_bits << 32; -#else + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; -#endif - } time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } @@ -1427,7 +1445,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct rb_root system_blks; + struct ext4_system_blocks __rcu *system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ @@ -3267,6 +3285,9 @@ extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); @@ -3359,6 +3380,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) extern const struct iomap_ops ext4_iomap_ops; +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92266a2da7d6..fb0f99dc8c22 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2315,6 +2315,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode, return err; } +static int ext4_fill_es_cache_info(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) +{ + ext4_lblk_t next, end = block + num - 1; + struct extent_status es; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned int flags; + int err; + + while (block <= end) { + next = 0; + flags = 0; + if (!ext4_es_lookup_extent(inode, block, &next, &es)) + break; + if (ext4_es_is_unwritten(&es)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + if (ext4_es_is_delayed(&es)) + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ext4_es_is_hole(&es)) + flags |= EXT4_FIEMAP_EXTENT_HOLE; + if (next == 0) + flags |= FIEMAP_EXTENT_LAST; + if (flags & (FIEMAP_EXTENT_DELALLOC| + EXT4_FIEMAP_EXTENT_HOLE)) + es.es_pblk = 0; + else + es.es_pblk = ext4_es_pblock(&es); + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (next == 0) + break; + block = next; + if (err < 0) + return err; + if (err == 1) + return 0; + } + return 0; +} + + /* * ext4_ext_determine_hole - determine hole around given block * @inode: inode we lookup in @@ -3813,8 +3859,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { -#ifdef EXT4_DEBUG - ext4_warning("Inode (%ld) finished: extent logical block %llu," +#ifdef CONFIG_EXT4_DEBUG + ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); @@ -5017,8 +5063,6 @@ static int ext4_find_delayed_extent(struct inode *inode, return next_del; } -/* fiemap flags we can handle specified here */ -#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) static int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) @@ -5055,10 +5099,16 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) +static int _ext4_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, |