1/*
2 * Copyright (c) 2021 Klara Systems, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/types.h>
28#include <sys/sysmacros.h>
29#include <sys/kmem.h>
30#include <linux/file.h>
31#include <linux/magic.h>
32#include <sys/zone.h>
33#include <sys/string.h>
34
35#if defined(CONFIG_USER_NS)
36#include <linux/statfs.h>
37#include <linux/proc_ns.h>
38#endif
39
40#include <sys/mutex.h>
41
42static kmutex_t zone_datasets_lock;
43static struct list_head zone_datasets;
44
45typedef struct zone_datasets {
46	struct list_head zds_list;	/* zone_datasets linkage */
47	struct user_namespace *zds_userns; /* namespace reference */
48	struct list_head zds_datasets;	/* datasets for the namespace */
49} zone_datasets_t;
50
51typedef struct zone_dataset {
52	struct list_head zd_list;	/* zone_dataset linkage */
53	size_t zd_dsnamelen;		/* length of name */
54	char zd_dsname[];		/* name of the member dataset */
55} zone_dataset_t;
56
57#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
58/*
59 * Returns:
60 * - 0 on success
61 * - EBADF if it cannot open the provided file descriptor
62 * - ENOTTY if the file itself is a not a user namespace file. We want to
63 *   intercept this error in the ZFS layer. We cannot just return one of the
64 *   ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS
65 *   and the SPL layers.
66 */
67static int
68user_ns_get(int fd, struct user_namespace **userns)
69{
70	struct kstatfs st;
71	struct file *nsfile;
72	struct ns_common *ns;
73	int error;
74
75	if ((nsfile = fget(fd)) == NULL)
76		return (EBADF);
77	if (vfs_statfs(&nsfile->f_path, &st) != 0) {
78		error = ENOTTY;
79		goto done;
80	}
81	if (st.f_type != NSFS_MAGIC) {
82		error = ENOTTY;
83		goto done;
84	}
85	ns = get_proc_ns(file_inode(nsfile));
86	if (ns->ops->type != CLONE_NEWUSER) {
87		error = ENOTTY;
88		goto done;
89	}
90	*userns = container_of(ns, struct user_namespace, ns);
91
92	error = 0;
93done:
94	fput(nsfile);
95
96	return (error);
97}
98#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
99
100static unsigned int
101user_ns_zoneid(struct user_namespace *user_ns)
102{
103	unsigned int r;
104
105#if defined(HAVE_USER_NS_COMMON_INUM)
106	r = user_ns->ns.inum;
107#else
108	r = user_ns->proc_inum;
109#endif
110
111	return (r);
112}
113
114static struct zone_datasets *
115zone_datasets_lookup(unsigned int nsinum)
116{
117	zone_datasets_t *zds;
118
119	list_for_each_entry(zds, &zone_datasets, zds_list) {
120		if (user_ns_zoneid(zds->zds_userns) == nsinum)
121			return (zds);
122	}
123	return (NULL);
124}
125
126#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
127static struct zone_dataset *
128zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
129{
130	zone_dataset_t *zd;
131
132	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
133		if (zd->zd_dsnamelen != dsnamelen)
134			continue;
135		if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
136			return (zd);
137	}
138
139	return (NULL);
140}
141
142static int
143zone_dataset_cred_check(cred_t *cred)
144{
145
146	if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
147		return (EPERM);
148
149	return (0);
150}
151#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
152
/*
 * Validate a dataset name and compute its effective length.
 *
 * dataset	NUL-terminated dataset name. A NULL pointer is rejected
 *		rather than dereferenced.
 * dsnamelen	Set on success to the length of the name, not counting a
 *		single trailing '/' if one was supplied. Left untouched
 *		on failure.
 *
 * Returns 0 on success, or ENOENT if the name is NULL, empty, or starts
 * with '/'.
 */
static int
zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
{
	size_t len;

	if (dataset == NULL || dataset[0] == '\0' || dataset[0] == '/')
		return (ENOENT);

	/* The name is non-empty here, so len >= 1 and len - 1 is in range. */
	len = strlen(dataset);
	/* Ignore trailing slash, if supplied. */
	if (dataset[len - 1] == '/')
		len--;

	*dsnamelen = len;
	return (0);
}
167
168int
169zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
170{
171#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
172	struct user_namespace *userns;
173	zone_datasets_t *zds;
174	zone_dataset_t *zd;
175	int error;
176	size_t dsnamelen;
177
178	if ((error = zone_dataset_cred_check(cred)) != 0)
179		return (error);
180	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
181		return (error);
182	if ((error = user_ns_get(userns_fd, &userns)) != 0)
183		return (error);
184
185	mutex_enter(&zone_datasets_lock);
186	zds = zone_datasets_lookup(user_ns_zoneid(userns));
187	if (zds == NULL) {
188		zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
189		INIT_LIST_HEAD(&zds->zds_list);
190		INIT_LIST_HEAD(&zds->zds_datasets);
191		zds->zds_userns = userns;
192		/*
193		 * Lock the namespace by incresing its refcount to prevent
194		 * the namespace ID from being reused.
195		 */
196		get_user_ns(userns);
197		list_add_tail(&zds->zds_list, &zone_datasets);
198	} else {
199		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
200		if (zd != NULL) {
201			mutex_exit(&zone_datasets_lock);
202			return (EEXIST);
203		}
204	}
205
206	zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
207	zd->zd_dsnamelen = dsnamelen;
208	strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
209	INIT_LIST_HEAD(&zd->zd_list);
210	list_add_tail(&zd->zd_list, &zds->zds_datasets);
211
212	mutex_exit(&zone_datasets_lock);
213	return (0);
214#else
215	return (ENXIO);
216#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
217}
218EXPORT_SYMBOL(zone_dataset_attach);
219
220int
221zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
222{
223#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
224	struct user_namespace *userns;
225	zone_datasets_t *zds;
226	zone_dataset_t *zd;
227	int error;
228	size_t dsnamelen;
229
230	if ((error = zone_dataset_cred_check(cred)) != 0)
231		return (error);
232	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
233		return (error);
234	if ((error = user_ns_get(userns_fd, &userns)) != 0)
235		return (error);
236
237	mutex_enter(&zone_datasets_lock);
238	zds = zone_datasets_lookup(user_ns_zoneid(userns));
239	if (zds != NULL)
240		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
241	if (zds == NULL || zd == NULL) {
242		mutex_exit(&zone_datasets_lock);
243		return (ENOENT);
244	}
245
246	list_del(&zd->zd_list);
247	kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
248
249	/* Prune the namespace entry if it has no more delegations. */
250	if (list_empty(&zds->zds_datasets)) {
251		/*
252		 * Decrease the refcount now that the namespace is no longer
253		 * used. It is no longer necessary to prevent the namespace ID
254		 * from being reused.
255		 */
256		put_user_ns(userns);
257		list_del(&zds->zds_list);
258		kmem_free(zds, sizeof (*zds));
259	}
260
261	mutex_exit(&zone_datasets_lock);
262	return (0);
263#else
264	return (ENXIO);
265#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
266}
267EXPORT_SYMBOL(zone_dataset_detach);
268
269/*
270 * A dataset is visible if:
271 * - It is a parent of a namespace entry.
272 * - It is one of the namespace entries.
273 * - It is a child of a namespace entry.
274 *
275 * A dataset is writable if:
276 * - It is one of the namespace entries.
277 * - It is a child of a namespace entry.
278 *
279 * The parent datasets of namespace entries are visible and
280 * read-only to provide a path back to the root of the pool.
281 */
282int
283zone_dataset_visible(const char *dataset, int *write)
284{
285	zone_datasets_t *zds;
286	zone_dataset_t *zd;
287	size_t dsnamelen, zd_len;
288	int visible;
289
290	/* Default to read-only, in case visible is returned. */
291	if (write != NULL)
292		*write = 0;
293	if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
294		return (0);
295	if (INGLOBALZONE(curproc)) {
296		if (write != NULL)
297			*write = 1;
298		return (1);
299	}
300
301	mutex_enter(&zone_datasets_lock);
302	zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
303	if (zds == NULL) {
304		mutex_exit(&zone_datasets_lock);
305		return (0);
306	}
307
308	visible = 0;
309	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
310		zd_len = strlen(zd->zd_dsname);
311		if (zd_len > dsnamelen) {
312			/*
313			 * The name of the namespace entry is longer than that
314			 * of the dataset, so it could be that the dataset is a
315			 * parent of the namespace entry.
316			 */
317			visible = memcmp(zd->zd_dsname, dataset,
318			    dsnamelen) == 0 &&
319			    zd->zd_dsname[dsnamelen] == '/';
320			if (visible)
321				break;
322		} else if (zd_len == dsnamelen) {
323			/*
324			 * The name of the namespace entry is as long as that
325			 * of the dataset, so perhaps the dataset itself is the
326			 * namespace entry.
327			 */
328			visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
329			if (visible) {
330				if (write != NULL)
331					*write = 1;
332				break;
333			}
334		} else {
335			/*
336			 * The name of the namespace entry is shorter than that
337			 * of the dataset, so perhaps the dataset is a child of
338			 * the namespace entry.
339			 */
340			visible = memcmp(zd->zd_dsname, dataset,
341			    zd_len) == 0 && dataset[zd_len] == '/';
342			if (visible) {
343				if (write != NULL)
344					*write = 1;
345				break;
346			}
347		}
348	}
349
350	mutex_exit(&zone_datasets_lock);
351	return (visible);
352}
353EXPORT_SYMBOL(zone_dataset_visible);
354
/*
 * Return the zone ID of the global zone: the zone ID of init_user_ns when
 * CONFIG_USER_NS is defined, 0 otherwise.
 */
unsigned int
global_zoneid(void)
{
#if defined(CONFIG_USER_NS)
	return (user_ns_zoneid(&init_user_ns));
#else
	return (0);
#endif
}
EXPORT_SYMBOL(global_zoneid);
367
368unsigned int
369crgetzoneid(const cred_t *cr)
370{
371	unsigned int r = 0;
372
373#if defined(CONFIG_USER_NS)
374	r = user_ns_zoneid(cr->user_ns);
375#endif
376
377	return (r);
378}
379EXPORT_SYMBOL(crgetzoneid);
380
381boolean_t
382inglobalzone(proc_t *proc)
383{
384#if defined(CONFIG_USER_NS)
385	return (proc->cred->user_ns == &init_user_ns);
386#else
387	return (B_TRUE);
388#endif
389}
390EXPORT_SYMBOL(inglobalzone);
391
392int
393spl_zone_init(void)
394{
395	mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
396	INIT_LIST_HEAD(&zone_datasets);
397	return (0);
398}
399
400void
401spl_zone_fini(void)
402{
403	zone_datasets_t *zds;
404	zone_dataset_t *zd;
405
406	/*
407	 * It would be better to assert an empty zone_datasets, but since
408	 * there's no automatic mechanism for cleaning them up if the user
409	 * namespace is destroyed, just do it here, since spl is about to go
410	 * out of context.
411	 */
412	while (!list_empty(&zone_datasets)) {
413		zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
414		while (!list_empty(&zds->zds_datasets)) {
415			zd = list_entry(zds->zds_datasets.next,
416			    zone_dataset_t, zd_list);
417			list_del(&zd->zd_list);
418			kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
419		}
420		put_user_ns(zds->zds_userns);
421		list_del(&zds->zds_list);
422		kmem_free(zds, sizeof (*zds));
423	}
424	mutex_destroy(&zone_datasets_lock);
425}
426