// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/file.c
*
* Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
*
* Manage the dynamic fd arrays in the process files_struct.
*/
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <linux/file_ref.h>
#include <net/sock.h>
#include <linux/init_task.h>
#include "internal.h"
static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
{
/*
* If the reference count was already in the dead zone, then this
* put() operation is imbalanced. Warn, put the reference count back to
* DEAD and tell the caller to not deconstruct the object.
*/
if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
return false;
}
/*
* This is a put() operation on a saturated refcount. Restore the
* mean saturation value and tell the caller to not deconstruct the
* object.
*/
if (cnt > FILE_REF_MAXREF)
atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
return false;
}
/**
* __file_ref_put - Slowpath of file_ref_put()
* @ref: Pointer to the reference count
* @cnt: Current reference count
*
* Invoked when the reference count is outside of the valid zone.
*
* Return:
* True if this was the last reference with no future references
* possible. This signals the caller that it can safely schedule the
* object, which is protected by the reference counter, for
* deconstruction.
*
* False if there are still active references or the put() raced
* with a concurrent get()/put() pair. Caller is not allowed to
* deconstruct the protected object.
*/
bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
{
/* Did this drop the last reference? */
if (likely(cnt == FILE_REF_NOREF)) {
/*
* Carefully try to set the reference count to FILE_REF_DEAD.
*
* This can fail if a concurrent get() operation has
* elevated it again or the corresponding put() even marked
* it dead already. Both are valid situations and do not
* require a retry. If this fails the caller is not
* allowed to deconstruct the object.
*/
if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
return false;
/*
* The caller can safely schedule the object for
* deconstruction. Provide acquire ordering.
*/
smp_acquire__after_ctrl_dep();
return true;
}
return __file_ref_put_badval(ref, cnt);
}
EXPORT_SYMBOL_GPL(__file_ref_put);
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
unsigned int sysctl_nr_open_max =
__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
static void __free_fdtable(struct fdtable *fdt)
{
kvfree(fdt->fd);
kvfree(fdt->open_fds);
kfree(fdt);
}
static void free_fdtable_rcu(struct rcu_head *rcu)
{
__free_fdtable(container_of(rcu, struct fdtable, rcu));
}
#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
* Copy 'count' fd bits from the old table to the new table and clear the extra
* space if any. This does not copy the file pointers. Called with the files
* spinlock held for write.
*/
static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
unsigned int copy_words)
{
unsigned int nwords = fdt_words(nfdt);
bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
copy_words, nwords);
}
/*
* Copy all file descriptors from the old table to the new, expanded table and
* clear the extra space. Called with the files spinlock held for write.
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
size_t cpy, set;
BUG_ON(nfdt->max_fds < ofdt->max_fds);
cpy = ofdt->max_fds * sizeof(struct file *);
set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
memcpy(nfdt->fd, ofdt->fd, cpy);
memset((char *)nfdt->fd + cpy, 0, set);
copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}
/*
* Note how the fdtable bitmap allocations very much have to be a multiple of
* BITS_PER_LONG. This is not only because we walk those things in chunks of
* 'unsigned long' in some places, but simply because that is how the Linux
* kernel bitmaps are defined to work: they are not "bits in an array of bytes",
* they are very much "bits in an array of unsigned long".
*/
static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
struct fdtable *fdt;
unsigned int nr;
void *data;
/*
* Figure out how many fds we actually want to support in this fdtable.
* Allocation steps are keyed to the size of the fdarray, since it
* grows far faster than any of the other dynamic data. We try to fit
* the fdarray into comfortable page-tuned chunks: starting at 1024B
* and growing in powers of two from there on. Since we called only
* with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
* already gives BITS_PER_LONG slots), the above boils down to
* 1. use the smallest power of two large enough to give us that many
* slots.
* 2. on 32bit skip 64 and 128 - the minimal capacity we want there is
* 256 slots (i.e. 1Kb fd array).
* 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there
* and we are never going to be asked for 64 or less.
*/
if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
nr = 256;
else
nr = roundup_pow_of_two(slots_wanted);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here.
*
* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
* bitmaps handling below becomes unpleasant, to put it mildly...
*/
if (unlikely(nr > sysctl_nr_open)) {
nr = round_down(sysctl_nr_open, BITS_PER_LONG);
if (nr < slots_wanted)
return ERR_PTR(-EMFILE);
}
/*
* Check if the allocation size would exceed INT_MAX. kvmalloc_array()
* and kvmalloc() will warn if the allocation size is greater than
* INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
*
* This can happen when sysctl_nr_open is set to a very high value and
* a process tries to use a file descriptor near that limit. For example,
* if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
* systemd typically sets it to - then trying to use a file descriptor
* close to that value will require allocating a file descriptor table
* that exceeds 8GB in size.
*/
if (unlikely(nr > INT_MAX / sizeof(struct file *)))
return ERR_PTR(-EMFILE);
fdt = kmalloc_obj(struct fdtable, GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
fdt->max_fds = nr;
data = kvmalloc_objs(struct file *, nr, GFP_KERNEL_ACCOUNT);
if (!data)
goto out_fdt;
fdt->fd = data;
data = kvmalloc(max_t(size_t,
2 * nr / BITS_PER_BYTE + BITBIT_SIZE
|