mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/
synced 2025-04-19 20:58:31 +09:00

-----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZ90r2wAKCRCRxhvAZXjc ouC6AQCk3MoqskN0WeNcaZT23dB7dHbEhf/7YXOFC9MFRMKXqQD9Fbn95+GuIe3U nBVPbVyQfDtfXE08ml6gbDJrCsbkkQI= =Xm1C -----END PGP SIGNATURE----- Merge tag 'vfs-6.15-rc1.mount.namespace' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs Pull vfs mount namespace updates from Christian Brauner: "This expands the ability of anonymous mount namespaces: - Creating detached mounts from detached mounts Currently, detached mounts can only be created from attached mounts. This limitation prevents various use-cases. For example, the ability to mount a subdirectory without ever having to make the whole filesystem visible first. The current permission model is: (1) Check that the caller is privileged over the owning user namespace of its current mount namespace. (2) Check that the caller is located in the mount namespace of the mount it wants to create a detached copy of. While it is not strictly necessary to do it this way it is consistently applied in the new mount api. This model will also be used when allowing the creation of a detached mount from another detached mount. The (1) requirement can simply be met by performing the same check as for the non-detached case, i.e., verify that the caller is privileged over its current mount namespace. To meet the (2) requirement it must be possible to infer the origin mount namespace that the anonymous mount namespace of the detached mount was created from. The origin mount namespace of an anonymous mount is the mount namespace that the mounts that were copied into the anonymous mount namespace originate from. In order to check the origin mount namespace of an anonymous mount namespace the sequence number of the original mount namespace is recorded in the anonymous mount namespace. With this in place it is possible to perform an equivalent check (2') to (2). 
The origin mount namespace of the anonymous mount namespace must be the same as the caller's mount namespace. To establish this the sequence number of the caller's mount namespace and the origin sequence number of the anonymous mount namespace are compared. The caller is always located in a non-anonymous mount namespace since anonymous mount namespaces cannot be setns()ed into. The caller's mount namespace will thus always have a valid sequence number. The owning namespace of any mount namespace, anonymous or non-anonymous, can never change. A mount attached to a non-anonymous mount namespace can never change mount namespace. If the sequence number of the non-anonymous mount namespace and the origin sequence number of the anonymous mount namespace match, the owning namespaces must match as well. Hence, the capability check on the owning namespace of the caller's mount namespace ensures that the caller has the ability to copy the mount tree. - Allow mounting detached mounts on detached mounts Currently, detached mounts can only be mounted onto attached mounts. This limitation makes it impossible to assemble a new private rootfs and move it into place. Instead, a detached tree must be created, attached, then mounted open and then either moved or detached again. Lift this restriction. In order to allow mounting detached mounts onto other detached mounts the same permission model used for creating detached mounts from detached mounts can be used (cf. above). Allowing to mount detached mounts onto detached mounts leaves three cases to consider: (1) The source mount is an attached mount and the target mount is a detached mount. This would be equivalent to moving a mount between different mount namespaces. A caller could move an attached mount to a detached mount. The detached mount can now be freely attached to any mount namespace. This changes the current delegation model significantly for no good reason. So this will fail. 
(2) Anonymous mount namespaces are always attached fully, i.e., it is not possible to only attach a subtree of an anonymous mount namespace. This simplifies the implementation and reasoning. Consequently, if the anonymous mount namespace of the source detached mount and the target detached mount are identical the mount request will fail. (3) The source mount's anonymous mount namespace is different from the target mount's anonymous mount namespace. In this case the source anonymous mount namespace of the source mount tree must be freed after its mounts have been moved to the target anonymous mount namespace. The source anonymous mount namespace must be empty afterwards. By allowing to mount detached mounts onto detached mounts a caller may do the following: fd_tree1 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE) fd_tree2 = open_tree(-EBADF, "/tmp", OPEN_TREE_CLONE) fd_tree1 and fd_tree2 refer to two different detached mount trees that belong to two different anonymous mount namespaces. It is important to note that fd_tree1 and fd_tree2 both refer to the root of their respective anonymous mount namespaces. By allowing to mount detached mounts onto detached mounts the caller may now do: move_mount(fd_tree1, "", fd_tree2, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH) This will cause the detached mount referred to by fd_tree1 to be mounted on top of the detached mount referred to by fd_tree2. Thus, the detached mount fd_tree1 is moved from its separate anonymous mount namespace into fd_tree2's anonymous mount namespace. It also means that while fd_tree2 continues to refer to the root of its respective anonymous mount namespace fd_tree1 doesn't anymore. This has the consequence that only fd_tree2 can be moved to another anonymous or non-anonymous mount namespace. Moving fd_tree1 will now fail as fd_tree1 doesn't refer to the root of an anonymous mount namespace anymore. 
Now fd_tree1 and fd_tree2 refer to separate detached mount trees referring to the same anonymous mount namespace. This is conceptually fine. The new mount api does allow for this to happen already via: mount -t tmpfs tmpfs /mnt mkdir -p /mnt/A mount -t tmpfs tmpfs /mnt/A fd_tree3 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE | AT_RECURSIVE) fd_tree4 = open_tree(-EBADF, "/mnt/A", 0) Both fd_tree3 and fd_tree4 refer to two different detached mount trees but both detached mount trees refer to the same anonymous mount namespace. And as with fd_tree1 and fd_tree2, only fd_tree3 may be moved to another mount namespace as fd_tree3 refers to the root of the anonymous mount namespace while fd_tree4 doesn't. However, there's an important difference between the fd_tree3/fd_tree4 and the fd_tree1/fd_tree2 example. Closing fd_tree4 and releasing the respective struct file will have no further effect on fd_tree3's detached mount tree. However, closing fd_tree3 will cause the mount tree and the respective anonymous mount namespace to be destroyed causing the detached mount tree of fd_tree4 to be invalid for further mounting. By allowing to mount detached mounts on detached mounts as in the fd_tree1/fd_tree2 example both struct files will affect each other. Both fd_tree1 and fd_tree2 refer to struct files that have FMODE_NEED_UNMOUNT set. To handle this we use the fact that @fd_tree1 will have a parent mount once it has been attached to @fd_tree2. When dissolve_on_fput() is called the mount that has been passed in will refer to the root of the anonymous mount namespace. If it doesn't it would mean that mounts are leaked. So before allowing to mount detached mounts onto detached mounts this would be a bug. 
Now that detached mounts can be mounted onto detached mounts it just means that the mount has been attached to another anonymous mount namespace and thus dissolve_on_fput() must not unmount the mount tree or free the anonymous mount namespace as the file referring to the root of the namespace hasn't been closed yet. If it had been closed yet it would be obvious because the mount namespace would be NULL, i.e., the @fd_tree1 would have already been unmounted. If @fd_tree1 hasn't been unmounted yet and has a parent mount it is safe to skip any cleanup as closing @fd_tree2 will take care of all cleanup operations. - Allow mount propagation for detached mount trees In commit ee2e3f50629f ("mount: fix mounting of detached mounts onto targets that reside on shared mounts") I fixed a bug where propagating the source mount tree of an anonymous mount namespace into a target mount tree of a non-anonymous mount namespace could be used to trigger an integer overflow in the non-anonymous mount namespace causing any new mounts to fail. The cause of this was that the propagation algorithm was unable to recognize mounts from the source mount tree that were already propagated into the target mount tree and then reappeared as propagation targets when walking the destination propagation mount tree. When fixing this I disabled mount propagation into anonymous mount namespaces. Make it possible for anonymous mount namespace to receive mount propagation events correctly. This is now also a correctness issue now that we allow mounting detached mount trees onto detached mount trees. 
Mark the source anonymous mount namespace with MNTNS_PROPAGATING indicating that all mounts belonging to this mount namespace are currently in the process of being propagated and make the propagation algorithm discard those if they appear as propagation targets" * tag 'vfs-6.15-rc1.mount.namespace' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (21 commits) selftests: test subdirectory mounting selftests: add test for detached mount tree propagation fs: namespace: fix uninitialized variable use mount: handle mount propagation for detached mount trees fs: allow creating detached mounts from fsmount() file descriptors selftests: seventh test for mounting detached mounts onto detached mounts selftests: sixth test for mounting detached mounts onto detached mounts selftests: fifth test for mounting detached mounts onto detached mounts selftests: fourth test for mounting detached mounts onto detached mounts selftests: third test for mounting detached mounts onto detached mounts selftests: second test for mounting detached mounts onto detached mounts selftests: first test for mounting detached mounts onto detached mounts fs: mount detached mounts onto detached mounts fs: support getname_maybe_null() in move_mount() selftests: create detached mounts from detached mounts fs: create detached mounts from detached mounts fs: add may_copy_tree() fs: add fastpath for dissolve_on_fput() fs: add assert for move_mount() fs: add mnt_ns_empty() helper ...
217 lines
6.1 KiB
C
217 lines
6.1 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#include <linux/mount.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/poll.h>
|
|
#include <linux/ns_common.h>
|
|
#include <linux/fs_pin.h>
|
|
|
|
/*
 * List head defined elsewhere.  mnt_notify_add() appends mounts to it
 * through their to_notify entry.
 */
extern struct list_head notify_list;
|
|
|
|
/* Bitwise-annotated 32-bit flag type; used for mnt_namespace::mntns_flags. */
typedef __u32 __bitwise mntns_flags_t;
|
|
|
|
/*
 * Bit 0 of mntns_flags_t.  Marks a source anonymous mount namespace whose
 * mounts are currently being propagated, so that the propagation
 * algorithm can discard them if they show up as propagation targets.
 */
#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0))
|
|
|
|
struct mnt_namespace {
|
|
struct ns_common ns;
|
|
struct mount * root;
|
|
struct {
|
|
struct rb_root mounts; /* Protected by namespace_sem */
|
|
struct rb_node *mnt_last_node; /* last (rightmost) mount in the rbtree */
|
|
struct rb_node *mnt_first_node; /* first (leftmost) mount in the rbtree */
|
|
};
|
|
struct user_namespace *user_ns;
|
|
struct ucounts *ucounts;
|
|
u64 seq; /* Sequence number to prevent loops */
|
|
union {
|
|
wait_queue_head_t poll;
|
|
struct rcu_head mnt_ns_rcu;
|
|
};
|
|
u64 seq_origin; /* Sequence number of origin mount namespace */
|
|
u64 event;
|
|
#ifdef CONFIG_FSNOTIFY
|
|
__u32 n_fsnotify_mask;
|
|
struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
|
|
#endif
|
|
unsigned int nr_mounts; /* # of mounts in the namespace */
|
|
unsigned int pending_mounts;
|
|
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
|
|
struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
|
|
refcount_t passive; /* number references not pinning @mounts */
|
|
mntns_flags_t mntns_flags;
|
|
} __randomize_layout;
|
|
|
|
/*
 * Pair of counters that struct mount reaches through a __percpu pointer
 * when CONFIG_SMP is set; without CONFIG_SMP struct mount carries the
 * same two ints directly.
 */
struct mnt_pcp {
	int	mnt_count;
	int	mnt_writers;
};
|
|
|
|
struct mountpoint {
|
|
struct hlist_node m_hash;
|
|
struct dentry *m_dentry;
|
|
struct hlist_head m_list;
|
|
int m_count;
|
|
};
|
|
|
|
struct mount {
|
|
struct hlist_node mnt_hash;
|
|
struct mount *mnt_parent;
|
|
struct dentry *mnt_mountpoint;
|
|
struct vfsmount mnt;
|
|
union {
|
|
struct rb_node mnt_node; /* node in the ns->mounts rbtree */
|
|
struct rcu_head mnt_rcu;
|
|
struct llist_node mnt_llist;
|
|
};
|
|
#ifdef CONFIG_SMP
|
|
struct mnt_pcp __percpu *mnt_pcp;
|
|
#else
|
|
int mnt_count;
|
|
int mnt_writers;
|
|
#endif
|
|
struct list_head mnt_mounts; /* list of children, anchored here */
|
|
struct list_head mnt_child; /* and going through their mnt_child */
|
|
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
|
|
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
|
|
struct list_head mnt_list;
|
|
struct list_head mnt_expire; /* link in fs-specific expiry list */
|
|
struct list_head mnt_share; /* circular list of shared mounts */
|
|
struct list_head mnt_slave_list;/* list of slave mounts */
|
|
struct list_head mnt_slave; /* slave list entry */
|
|
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
|
|
struct mnt_namespace *mnt_ns; /* containing namespace */
|
|
struct mountpoint *mnt_mp; /* where is it mounted */
|
|
union {
|
|
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
|
|
struct hlist_node mnt_umount;
|
|
};
|
|
struct list_head mnt_umounting; /* list entry for umount propagation */
|
|
#ifdef CONFIG_FSNOTIFY
|
|
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
|
|
__u32 mnt_fsnotify_mask;
|
|
struct list_head to_notify; /* need to queue notification */
|
|
struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */
|
|
#endif
|
|
int mnt_id; /* mount identifier, reused */
|
|
u64 mnt_id_unique; /* mount ID unique until reboot */
|
|
int mnt_group_id; /* peer group identifier */
|
|
int mnt_expiry_mark; /* true if marked for expiry */
|
|
struct hlist_head mnt_pins;
|
|
struct hlist_head mnt_stuck_children;
|
|
} __randomize_layout;
|
|
|
|
/*
 * Error-pointer sentinel.  Because it is an ERR_PTR, the
 * IS_ERR_OR_NULL() test in is_mounted() rejects it.
 * NOTE(review): presumably stored in mount::mnt_ns for internal mounts -
 * confirm against the users of this macro.
 */
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
|
|
|
|
static inline struct mount *real_mount(struct vfsmount *mnt)
|
|
{
|
|
return container_of(mnt, struct mount, mnt);
|
|
}
|
|
|
|
static inline int mnt_has_parent(struct mount *mnt)
|
|
{
|
|
return mnt != mnt->mnt_parent;
|
|
}
|
|
|
|
static inline int is_mounted(struct vfsmount *mnt)
|
|
{
|
|
/* neither detached nor internal? */
|
|
return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
|
|
}
|
|
|
|
/*
 * Defined elsewhere.  __path_is_mountpoint() calls it with a path's
 * (vfsmount, dentry) pair and treats a NULL result as "no mount found".
 */
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
|
|
|
|
/*
 * Defined elsewhere.
 * NOTE(review): the return convention and the meaning of the unsigned
 * argument are not visible in this header - check the definition.
 */
extern int __legitimize_mnt(struct vfsmount *, unsigned);
|
|
|
|
static inline bool __path_is_mountpoint(const struct path *path)
|
|
{
|
|
struct mount *m = __lookup_mnt(path->mnt, path->dentry);
|
|
return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT));
|
|
}
|
|
|
|
/*
 * Out-of-line part of detach_mounts(), which calls this only when
 * d_mountpoint(dentry) is true.
 */
extern void __detach_mounts(struct dentry *dentry);
|
|
|
|
/*
 * Inline fast path: do nothing unless @dentry is flagged as a
 * mountpoint, otherwise hand over to __detach_mounts().
 */
static inline void detach_mounts(struct dentry *dentry)
{
	if (d_mountpoint(dentry))
		__detach_mounts(dentry);
}
|
|
|
|
static inline void get_mnt_ns(struct mnt_namespace *ns)
|
|
{
|
|
refcount_inc(&ns->ns.count);
|
|
}
|
|
|
|
/*
 * Seqlock defined elsewhere.
 * NOTE(review): what it protects is not stated in this header - see the
 * definition.
 */
extern seqlock_t mount_lock;
|
|
|
|
struct proc_mounts {
|
|
struct mnt_namespace *ns;
|
|
struct path root;
|
|
int (*show)(struct seq_file *, struct vfsmount *);
|
|
};
|
|
|
|
/*
 * seq_file operations table defined elsewhere.
 * NOTE(review): presumably the iterator used together with
 * struct proc_mounts - confirm at the definition.
 */
extern const struct seq_operations mounts_op;
|
|
|
|
/*
 * Out-of-line part of is_local_mountpoint(), which calls this only when
 * d_mountpoint(dentry) is true.
 */
extern bool __is_local_mountpoint(struct dentry *dentry);
|
|
/*
 * Inline fast path: a dentry that is not flagged as a mountpoint is
 * never a local mountpoint; otherwise __is_local_mountpoint() decides.
 */
static inline bool is_local_mountpoint(struct dentry *dentry)
{
	return d_mountpoint(dentry) && __is_local_mountpoint(dentry);
}
|
|
|
|
static inline bool is_anon_ns(struct mnt_namespace *ns)
|
|
{
|
|
return ns->seq == 0;
|
|
}
|
|
|
|
static inline bool mnt_ns_attached(const struct mount *mnt)
|
|
{
|
|
return !RB_EMPTY_NODE(&mnt->mnt_node);
|
|
}
|
|
|
|
static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
|
|
{
|
|
return RB_EMPTY_ROOT(&ns->mounts);
|
|
}
|
|
|
|
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
|
|
{
|
|
struct mnt_namespace *ns = mnt->mnt_ns;
|
|
WARN_ON(!mnt_ns_attached(mnt));
|
|
if (ns->mnt_last_node == &mnt->mnt_node)
|
|
ns->mnt_last_node = rb_prev(&mnt->mnt_node);
|
|
if (ns->mnt_first_node == &mnt->mnt_node)
|
|
ns->mnt_first_node = rb_next(&mnt->mnt_node);
|
|
rb_erase(&mnt->mnt_node, &ns->mounts);
|
|
RB_CLEAR_NODE(&mnt->mnt_node);
|
|
list_add_tail(&mnt->mnt_list, dt_list);
|
|
}
|
|
|
|
/*
 * Defined elsewhere.
 * NOTE(review): the exact semantics are not visible in this header; by
 * name it reports whether @mnt has locked child mounts under @dentry -
 * confirm at the definition.
 */
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
|
|
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
|
|
bool previous);
|
|
|
|
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
|
|
{
|
|
return container_of(ns, struct mnt_namespace, ns);
|
|
}
|
|
|
|
#ifdef CONFIG_FSNOTIFY
/*
 * Queue @m on notify_list (through its to_notify entry) when either its
 * current namespace or its previous namespace has fsnotify marks.
 * When neither has marks, nothing is queued and the current namespace
 * is simply recorded as the previous one.
 */
static inline void mnt_notify_add(struct mount *m)
{
	/* Optimize the case where there are no watches */
	if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) ||
	    (m->prev_ns && m->prev_ns->n_fsnotify_marks))
		list_add_tail(&m->to_notify, &notify_list);
	else
		m->prev_ns = m->mnt_ns;
}
#else
/* Without CONFIG_FSNOTIFY there is nothing to queue. */
static inline void mnt_notify_add(struct mount *m)
{
}
#endif
|
|
|
|
/*
 * Defined elsewhere.
 * NOTE(review): by name, maps @dentry to a mount namespace; the result
 * for a dentry with no associated namespace is not visible here -
 * confirm at the definition.
 */
struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
|