/*-
 * Copyright (c) 2006, 2011 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2) and shm_unlink(2).  While most of the implementation is
 * here, vm_mmap.c contains mapping logic changes.
 *
 * TODO:
 *
 * (1) Need to export data to a userland tool via a sysctl.  Should ipcs(1)
 *     and ipcrm(1) be expanded or should new tools to manage both POSIX
 *     kernel semaphores and POSIX shared memory be written?
 *
 * (2) Add support for this file type to fstat(1).
 *
 * (3) Resource limits?  Does this need its own resource limits or are the
 *     existing limits in mmap(2) sufficient?
 */
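
/*
 * Example (editorial sketch, not compiled into the kernel): a typical
 * userland consumer of these interfaces creates or opens a named
 * object, sizes it, and maps it.  The name and size here are made up:
 *
 *	int fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *	if (fd != -1 && ftruncate(fd, 4096) == 0) {
 *		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);
 *		...
 *	}
 *	shm_unlink("/example");
 */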

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_shm.c 270205 2014-08-20 08:24:37Z kib $");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/unistd.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;

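/*
 * Map an FNV-1 hash of a path to a bucket in shm_dictionary.  shm_hash
 * holds the mask returned by hashinit() (1023 for the default
 * 1024-bucket table), so the low bits of the hash select the bucket.
 */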
#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static int	shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
static void	shm_dict_init(void *arg);
static void	shm_drop(struct shmfd *shmfd);
static struct shmfd *shm_hold(struct shmfd *shmfd);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int	shm_dotruncate(struct shmfd *shmfd, off_t length);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_ioctl_t	shm_ioctl;
static fo_poll_t	shm_poll;
static fo_kqfilter_t	shm_kqfilter;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;

/* File descriptor operations. */
static struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = shm_ioctl,
	.fo_poll = shm_poll,
	.fo_kqfilter = shm_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

FEATURE(posix_shm, "POSIX shared memory");

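/*
 * Transfer at most one page's worth of data between the backing VM
 * object and the uio.  The page is grabbed (and paged in from swap if
 * necessary), held across the copy, and dirtied on a successful write.
 */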
static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	VM_OBJECT_WLOCK(obj);

	/*
	 * Parallel reads of the page content from disk are prevented
	 * by exclusive busy.
	 *
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out obj's pages because obj is an OBJT_SWAP
	 * type object.
	 */
	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
	if (m->valid != VM_PAGE_BITS_ALL) {
		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
			rv = vm_pager_get_pages(obj, &m, 1, 0);
			m = vm_page_lookup(obj, idx);
			if (m == NULL) {
				printf(
		    "uiomove_object: vm_obj %p idx %ju null lookup rv %d\n",
				    obj, (uintmax_t)idx, rv);
				VM_OBJECT_WUNLOCK(obj);
				return (EIO);
			}
			if (rv != VM_PAGER_OK) {
				printf(
	    "uiomove_object: vm_obj %p idx %ju valid %x pager error %d\n",
				    obj, (uintmax_t)idx, m->valid, rv);
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				VM_OBJECT_WUNLOCK(obj);
				return (EIO);
			}
		} else
			vm_page_zero_invalid(m, TRUE);
	}
	vm_page_xunbusy(m);
	vm_page_lock(m);
	vm_page_hold(m);
	if (m->queue == PQ_NONE) {
		vm_page_deactivate(m);
	} else {
		/* Requeue to maintain LRU ordering. */
		vm_page_requeue(m);
	}
	vm_page_unlock(m);
	VM_OBJECT_WUNLOCK(obj);
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0) {
		VM_OBJECT_WLOCK(obj);
		vm_page_dirty(m);
		vm_pager_page_unswapped(m);
		VM_OBJECT_WUNLOCK(obj);
	}
	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);

	return (error);
}

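/*
 * Copy data between a VM object and a uio, at most obj_size bytes in
 * total, one page per iteration.  Not static: other swap-backed
 * consumers use it as well; the tmpfs vnode lock note above refers to
 * one such caller.
 */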
int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}

static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			*(off_t *)(td->td_retval) = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	/*
	 * Check MAC before taking the offset and range locks so that an
	 * early return cannot leak them; this matches shm_write().
	 */
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	}

	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

static int
shm_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (EOPNOTSUPP);
}

static int
shm_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}

static int
shm_kqfilter(struct file *fp, struct knote *kn)
{

	return (EOPNOTSUPP);
}

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif

	/*
	 * Attempt to return reasonable values for fstat() on a memory
	 * file descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

static int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	vm_object_t object;
	vm_page_t m, ma[1];
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	object = shmfd->shm_object;
	VM_OBJECT_WLOCK(object);
	if (length == shmfd->shm_size) {
		VM_OBJECT_WUNLOCK(object);
		return (0);
	}
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0) {
			VM_OBJECT_WUNLOCK(object);
			return (EBUSY);
		}

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_lookup(object, idx);
			if (m != NULL) {
				if (vm_page_sleep_if_busy(m, "shmtrc"))
					goto retry;
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
				if (m == NULL) {
					VM_OBJECT_WUNLOCK(object);
					VM_WAIT;
					VM_OBJECT_WLOCK(object);
					goto retry;
				} else if (m->valid != VM_PAGE_BITS_ALL) {
					ma[0] = m;
					rv = vm_pager_get_pages(object, ma, 1,
					    0);
					m = vm_page_lookup(object, idx);
				} else
					/* A cached page was reactivated. */
					rv = VM_PAGER_OK;
				vm_page_lock(m);
				if (rv == VM_PAGER_OK) {
					vm_page_deactivate(m);
					vm_page_unlock(m);
					vm_page_xunbusy(m);
				} else {
					vm_page_free(m);
					vm_page_unlock(m);
					VM_OBJECT_WUNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_dirty(m);
				vm_pager_page_unswapped(m);
			}
		}
		delta = ptoa(object->size - nobjsize);

		/* Toss in-memory pages. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize, delta);

		/* Release the swap accounted for the object. */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		/* Attempt to reserve additional swap. */
		delta = ptoa(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred)) {
			VM_OBJECT_WUNLOCK(object);
			return (ENOMEM);
		}
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	VM_OBJECT_WUNLOCK(object);
	return (0);
}

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
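/*
 * Lifetime sketch (editorial): shm_lookup() returns a borrowed pointer
 * that is only stable while shm_dict_lock is held; a caller that keeps
 * the object past the lock takes its own reference and releases it when
 * done:
 *
 *	shmfd = shm_hold(shmfd);
 *	...
 *	shm_drop(shmfd);
 */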
static struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
	struct shmfd *shmfd;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	KASSERT(shmfd->shm_object != NULL, ("shm_alloc: vm_pager_allocate"));
	VM_OBJECT_WLOCK(shmfd->shm_object);
	vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
	vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

static struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

static void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
static int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred, NULL);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
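/*
 * Lookup sketch (editorial): callers hash the path once and consult the
 * dictionary while holding shm_dict_lock, as sys_shm_open() does below:
 *
 *	fnv = fnv_32_str(path, FNV1_32_INIT);
 *	sx_xlock(&shm_dict_lock);
 *	shmfd = shm_lookup(path, fnv);
 *	...
 *	sx_xunlock(&shm_dict_lock);
 */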
static void
shm_dict_init(void *arg)
{

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
}
SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL);

static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

/*
 * shm_insert() takes ownership of the malloc'ed path, publishing it
 * through both the dictionary entry and shmfd->shm_path.
 */
static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	shmfd->shm_path = path;
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
	struct shm_mapping *map;
	int error;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
			if (error)
				return (error);
#endif
			error = shm_access(map->sm_shmfd, ucred,
			    FREAD | FWRITE);
			if (error)
				return (error);
			map->sm_shmfd->shm_path = NULL;
			LIST_REMOVE(map, sm_link);
			shm_drop(map->sm_shmfd);
			free(map->sm_path, M_SHMFD);
			free(map, M_SHMFD);
			return (0);
		}
	}

	return (ENOENT);
}

/* System calls. */
int
sys_shm_open(struct thread *td, struct shm_open_args *uap)
{
	struct filedesc *fdp;
	struct shmfd *shmfd;
	struct file *fp;
	char *path;
	Fnv32_t fnv;
	mode_t cmode;
	int fd, error;

#ifdef CAPABILITY_MODE
	/*
	 * shm_open(2) is only allowed for anonymous objects.
	 */
	if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
		return (ECAPMODE);
#endif

	if ((uap->flags & O_ACCMODE) != O_RDONLY &&
	    (uap->flags & O_ACCMODE) != O_RDWR)
		return (EINVAL);

	if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC |
	    O_CLOEXEC)) != 0)
		return (EINVAL);

	fdp = td->td_proc->p_fd;
	cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;

	error = falloc(td, &fp, &fd, O_CLOEXEC);
	if (error)
		return (error);

	/* A SHM_ANON path pointer creates an anonymous object. */
	if (uap->path == SHM_ANON) {
		/* A read-only anonymous object is pointless. */
		if ((uap->flags & O_ACCMODE) == O_RDONLY) {
			fdclose(fdp, fp, fd, td);
			fdrop(fp, td);
			return (EINVAL);
		}
		shmfd = shm_alloc(td->td_ucred, cmode);
	} else {
		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
		error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(path);
#endif
		/* Require paths to start with a '/' character. */
		if (error == 0 && path[0] != '/')
			error = EINVAL;
		if (error) {
			fdclose(fdp, fp, fd, td);
			fdrop(fp, td);
			free(path, M_SHMFD);
			return (error);
		}

		fnv = fnv_32_str(path, FNV1_32_INIT);
		sx_xlock(&shm_dict_lock);
		shmfd = shm_lookup(path, fnv);
		if (shmfd == NULL) {
			/* Object does not yet exist, create it if requested. */
			if (uap->flags & O_CREAT) {
#ifdef MAC
				error = mac_posixshm_check_create(td->td_ucred,
				    path);
				if (error == 0) {
#endif
					shmfd = shm_alloc(td->td_ucred, cmode);
					shm_insert(path, fnv, shmfd);
#ifdef MAC
				} else {
					/*
					 * shm_insert() was not reached, so
					 * path was not consumed; free it.
					 */
					free(path, M_SHMFD);
				}
#endif
			} else {
				free(path, M_SHMFD);
				error = ENOENT;
			}
		} else {
			/*
			 * Object already exists, obtain a new
			 * reference if requested and permitted.
			 */
			free(path, M_SHMFD);
			if ((uap->flags & (O_CREAT | O_EXCL)) ==
			    (O_CREAT | O_EXCL))
				error = EEXIST;
			else {
#ifdef MAC
				error = mac_posixshm_check_open(td->td_ucred,
				    shmfd, FFLAGS(uap->flags & O_ACCMODE));
				if (error == 0)
#endif
				error = shm_access(shmfd, td->td_ucred,
				    FFLAGS(uap->flags & O_ACCMODE));
			}

			/*
			 * Truncate the file back to zero length if
			 * O_TRUNC was specified and the object was
			 * opened with read/write.
			 */
			if (error == 0 &&
			    (uap->flags & (O_ACCMODE | O_TRUNC)) ==
			    (O_RDWR | O_TRUNC)) {
#ifdef MAC
				error = mac_posixshm_check_truncate(
				    td->td_ucred, fp->f_cred, shmfd);
				if (error == 0)
#endif
					shm_dotruncate(shmfd, 0);
			}
			if (error == 0)
				shm_hold(shmfd);
		}
		sx_xunlock(&shm_dict_lock);

		if (error) {
			fdclose(fdp, fp, fd, td);
			fdrop(fp, td);
			return (error);
		}
	}

	finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);

	td->td_retval[0] = fd;
	fdrop(fp, td);

	return (0);
}

int
sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
	char *path;
	Fnv32_t fnv;
	int error;

	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	if (error) {
		free(path, M_TEMP);
		return (error);
	}
#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif
	fnv = fnv_32_str(path, FNV1_32_INIT);
	sx_xlock(&shm_dict_lock);
	error = shm_remove(path, fnv, td->td_ucred);
	sx_xunlock(&shm_dict_lock);
	free(path, M_TEMP);

	return (error);
}

/*
 * mmap() helper to validate mmap() requests against shm object state
 * and give mmap() the vm_object to use for the mapping.
 */
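/*
 * Caller sketch (editorial): the mmap() path in vm_mmap.c asks this
 * routine for a referenced backing object and maps that object instead
 * of a vnode:
 *
 *	error = shm_mmap(shmfd, objsize, foff, &obj);
 *	if (error == 0)
 *		... map obj, dropping the reference on failure ...
 */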
int
shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
    vm_object_t *obj)
{

	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (foff >= shmfd->shm_size ||
	    foff + objsize > round_page(shmfd->shm_size))
		return (EINVAL);

	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_atime);
	mtx_unlock(&shm_timestamp_lock);
	vm_object_reference(shmfd->shm_object);
	*obj = shmfd->shm_object;
	return (0);
}

static int
shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
	/*
	 * SUSv4 says that x bits of permission need not be affected.
	 * Be consistent with our shm_open there.
	 */
#ifdef MAC
	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
	if (error != 0)
		goto out;
#endif
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
	    shmfd->shm_gid, VADMIN, active_cred, NULL);
	if (error != 0)
		goto out;
	shmfd->shm_mode = mode & ACCESSPERMS;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}


static int
shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
#ifdef MAC
	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
	if (error != 0)
		goto out;
#endif
	if (uid == (uid_t)-1)
		uid = shmfd->shm_uid;
	if (gid == (gid_t)-1)
		gid = shmfd->shm_gid;
	if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
		goto out;
	shmfd->shm_uid = uid;
	shmfd->shm_gid = gid;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Helper routines to allow the backing object of a shared memory file
 * descriptor to be mapped in the kernel.
 */
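/*
 * Usage sketch (editorial): a kernel consumer maps a range, uses it,
 * and must later unmap exactly the range it mapped:
 *
 *	void *mem;
 *
 *	error = shm_map(fp, len, off, &mem);
 *	if (error == 0) {
 *		...
 *		error = shm_unmap(fp, mem, len);
 *	}
 */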
int
shm_map(struct file *fp, size_t size, off_t offset, void **memp)
{
	struct shmfd *shmfd;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	obj = shmfd->shm_object;
	VM_OBJECT_WLOCK(obj);
	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (offset >= shmfd->shm_size ||
	    offset + size > round_page(shmfd->shm_size)) {
		VM_OBJECT_WUNLOCK(obj);
		return (EINVAL);
	}

	shmfd->shm_kmappings++;
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);

	/* Map the object into the kernel_map and wire it. */
	kva = vm_map_min(kernel_map);
	ofs = offset & PAGE_MASK;
	offset = trunc_page(offset);
	size = round_page(size + ofs);
	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE, 0);
	if (rv == KERN_SUCCESS) {
		rv = vm_map_wire(kernel_map, kva, kva + size,
		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
		if (rv == KERN_SUCCESS) {
			*memp = (void *)(kva + ofs);
			return (0);
		}
		vm_map_remove(kernel_map, kva, kva + size);
	} else
		vm_object_deallocate(obj);

	/* On failure, drop our mapping reference. */
	VM_OBJECT_WLOCK(obj);
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);

	return (vm_mmap_to_errno(rv));
}

/*
 * We require the caller to unmap the entire entry.  This allows us to
 * safely decrement shm_kmappings when a mapping is removed.
 */
int
shm_unmap(struct file *fp, void *mem, size_t size)
{
	struct shmfd *shmfd;
	vm_map_entry_t entry;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_t map;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	kva = (vm_offset_t)mem;
	ofs = kva & PAGE_MASK;
	kva = trunc_page(kva);
	size = round_page(size + ofs);
	map = kernel_map;
	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
	    &obj, &pindex, &prot, &wired);
	if (rv != KERN_SUCCESS)
		return (EINVAL);
	if (entry->start != kva || entry->end != kva + size) {
		vm_map_lookup_done(map, entry);
		return (EINVAL);
	}
	vm_map_lookup_done(map, entry);
	if (obj != shmfd->shm_object)
		return (EINVAL);
	vm_map_remove(map, kva, kva + size);
	VM_OBJECT_WLOCK(obj);
	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);
	return (0);
}

void
shm_path(struct shmfd *shmfd, char *path, size_t size)
{

	if (shmfd->shm_path == NULL)
		return;
	sx_slock(&shm_dict_lock);
	if (shmfd->shm_path != NULL)
		strlcpy(path, shmfd->shm_path, size);
	sx_sunlock(&shm_dict_lock);
}