/*	$OpenBSD: uvm_mmap.c,v 1.191 2024/04/05 14:16:05 deraadt Exp $	*/
/*	$NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/specdev.h>
#include <sys/stdint.h>
#include <sys/pledge.h>
#include <sys/unistd.h>		/* for KBIND* */
#include <sys/user.h>

#include <machine/exec.h>	/* for __LDPGSZ */

#include <sys/syscall.h>
#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>
#include <uvm/uvm_vnode.h>

int uvm_mmapanon(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
    vsize_t, struct proc *);
int uvm_mmapfile(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
    struct vnode *, voff_t, vsize_t, struct proc *);


/*
 * Page align addr and size, returning EINVAL on wraparound.
 */
#define ALIGN_ADDR(addr, size, pageoff)	do {				\
	pageoff = (addr & PAGE_MASK);					\
	if (pageoff != 0) {						\
		if (size > SIZE_MAX - pageoff)				\
			return EINVAL;	/* wraparound */		\
		addr -= pageoff;					\
		size += pageoff;					\
	}								\
	if (size != 0) {						\
		size = (vsize_t)round_page(size);			\
		if (size == 0)						\
			return EINVAL;	/* wraparound */		\
	}								\
} while (0)
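
/*
 * Worked example (illustrative, not part of the original source): with a
 * 4096-byte PAGE_SIZE, ALIGN_ADDR(addr, size, pageoff) applied to
 * addr = 0x1003, size = 0x20 sets pageoff = 3, addr = 0x1000 and
 * size = round_page(0x23) = 0x1000, pushing both edges of the request
 * out to page boundaries.  A size so large that adding pageoff or
 * rounding would wrap past SIZE_MAX makes the macro return EINVAL instead.
 */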

/*
 * sys_mquery: provide mapping hints to applications that do fixed mappings
 *
 * flags: 0 or MAP_FIXED (MAP_FIXED means that we insist on this addr and
 *	don't care about PMAP_PREFER or such)
 * addr: hint where we'd like to place the mapping.
 * size: size of the mapping
 * fd: fd of the file we want to map
 * off: offset within the file
 */
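/*
 * Illustrative sketch (not part of the original source): a userland
 * caller, with <sys/mman.h> and a regular file open as fd, might probe
 * for a usable address before committing to a fixed mapping:
 *
 *	void *hint = mquery(want, len, PROT_READ, 0, fd, 0);
 *	if (hint == MAP_FAILED)
 *		err(1, "mquery");
 *	p = mmap(hint, len, PROT_READ, MAP_PRIVATE | MAP_FIXED |
 *	    __MAP_NOREPLACE, fd, 0);
 *
 * "want", "len", "fd" and "p" are assumed to be set up by the caller.
 */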
int
sys_mquery(struct proc *p, void *v, register_t *retval)
{
	struct sys_mquery_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(off_t) pos;
	} */ *uap = v;
	struct file *fp;
	voff_t uoff;
	int error;
	vaddr_t vaddr;
	int flags = 0;
	vsize_t size;
	vm_prot_t prot;
	int fd;

	vaddr = (vaddr_t) SCARG(uap, addr);
	prot = SCARG(uap, prot);
	size = (vsize_t) SCARG(uap, len);
	fd = SCARG(uap, fd);

	if ((prot & PROT_MASK) != prot)
		return EINVAL;

	if (SCARG(uap, flags) & MAP_FIXED)
		flags |= UVM_FLAG_FIXED;

	if (fd >= 0) {
		if ((error = getvnode(p, fd, &fp)) != 0)
			return error;
		uoff = SCARG(uap, pos);
	} else {
		fp = NULL;
		uoff = UVM_UNKNOWN_OFFSET;
	}

	if (vaddr == 0)
		vaddr = uvm_map_hint(p->p_vmspace, prot, VM_MIN_ADDRESS,
		    VM_MAXUSER_ADDRESS);

	error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff,
	    flags);
	if (error == 0)
		*retval = (register_t)(vaddr);

	if (fp != NULL)
		FRELE(fp, p);
	return error;
}

int	uvm_wxabort;

/*
 * W^X violations are only allowed on permitted filesystems.
 */
static inline int
uvm_wxcheck(struct proc *p, char *call)
{
	struct process *pr = p->p_p;
	int wxallowed = (pr->ps_textvp->v_mount &&
	    (pr->ps_textvp->v_mount->mnt_flag & MNT_WXALLOWED));

	if (wxallowed && (pr->ps_flags & PS_WXNEEDED))
		return 0;

	if (uvm_wxabort) {
		KERNEL_LOCK();
		/* Report W^X failures */
		if (pr->ps_wxcounter++ == 0)
			log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
			    pr->ps_comm, pr->ps_pid, call);
		/* Send uncatchable SIGABRT for coredump */
		sigexit(p, SIGABRT);
		KERNEL_UNLOCK();
	}

	return ENOTSUP;
}

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      mod PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */
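/*
 * Illustrative sketch (not part of the original source): a typical
 * userland call that exercises the rounding described above, assuming
 * fd refers to a regular file at least one page long:
 *
 *	void *p = mmap(NULL, 5, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (p == MAP_FAILED)
 *		err(1, "mmap");
 *
 * The 5-byte request is rounded up to a full page by ALIGN_ADDR before
 * the mapping is established.
 */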
int
sys_mmap(struct proc *p, void *v, register_t *retval)
{
	struct sys_mmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(off_t) pos;
	} */ *uap = v;
	vaddr_t addr;
	struct vattr va;
	off_t pos;
	vsize_t limit, pageoff, size;
	vm_prot_t prot, maxprot;
	int flags, fd;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct filedesc *fdp = p->p_fd;
	struct file *fp = NULL;
	struct vnode *vp;
	int error;

	/* first, extract syscall args from the uap. */
	addr = (vaddr_t) SCARG(uap, addr);
	size = (vsize_t) SCARG(uap, len);
	prot = SCARG(uap, prot);
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

	/*
	 * Validate the flags.
	 */
	if ((prot & PROT_MASK) != prot)
		return EINVAL;
	if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
	    (error = uvm_wxcheck(p, "mmap")))
		return error;

	if ((flags & MAP_FLAGMASK) != flags)
		return EINVAL;
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return EINVAL;
	if ((flags & (MAP_FIXED|__MAP_NOREPLACE)) == __MAP_NOREPLACE)
		return EINVAL;
	if (flags & MAP_STACK) {
		if ((flags & (MAP_ANON|MAP_PRIVATE)) != (MAP_ANON|MAP_PRIVATE))
			return EINVAL;
		if (flags & ~(MAP_STACK|MAP_FIXED|MAP_ANON|MAP_PRIVATE))
			return EINVAL;
		if (pos != 0)
			return EINVAL;
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return EINVAL;
	}
	if (size == 0)
		return EINVAL;

	error = pledge_protexec(p, prot);
	if (error)
		return error;

	/* align file position and save offset.  adjust size. */
	ALIGN_ADDR(pos, size, pageoff);

	/* now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" */
	if (flags & MAP_FIXED) {
		/* adjust address by the same amount as we did the offset */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return EINVAL;		/* not page aligned */

		if (addr > SIZE_MAX - size)
			return EINVAL;		/* no wrapping! */
		if (VM_MAXUSER_ADDRESS > 0 &&
		    (addr + size) > VM_MAXUSER_ADDRESS)
			return EINVAL;
		if (vm_min_address > 0 && addr < vm_min_address)
			return EINVAL;
	}

	/* check for file mappings (i.e. not anonymous) and verify file. */
	if ((flags & MAP_ANON) == 0) {
		KERNEL_LOCK();
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto out;
		}

		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;		/* only mmap vnodes! */
			goto out;
		}
		vp = (struct vnode *)fp->f_data;	/* convert to vnode */

		if (vp->v_type != VREG && vp->v_type != VCHR &&
		    vp->v_type != VBLK) {
			error = ENODEV; /* only REG/CHR/BLK support mmap */
			goto out;
		}

		if (vp->v_type == VREG && (pos + size) < pos) {
			error = EINVAL;		/* no offset wrapping */
			goto out;
		}

		/* special case: catch SunOS style /dev/zero */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			flags |= MAP_ANON;
			FRELE(fp, p);
			fp = NULL;
			KERNEL_UNLOCK();
			goto is_anon;
		}

		/*
		 * Old programs may not select a specific sharing type, so
		 * default to an appropriate one.
		 */
		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
			printf("WARNING: defaulted mmap() share type to"
			    " %s (pid %d comm %s)\n",
			    vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
			    p->p_p->ps_pid, p->p_p->ps_comm);
#endif
			if (vp->v_type == VCHR)
				flags |= MAP_SHARED;	/* for a device */
			else
				flags |= MAP_PRIVATE;	/* for a file */
		}

		/*
		 * MAP_PRIVATE device mappings don't make sense (and aren't
		 * supported anyway).  However, some programs rely on this,
		 * so just change it to MAP_SHARED.
		 */
		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
		}

		/* now check protection */
		maxprot = PROT_EXEC;

		/* check read access */
		if (fp->f_flag & FREAD)
			maxprot |= PROT_READ;
		else if (prot & PROT_READ) {
			error = EACCES;
			goto out;
		}

		/* check write access, shared case first */
		if (flags & MAP_SHARED) {
			/*
			 * If the file is writable, only add PROT_WRITE to
			 * maxprot when the file is neither immutable nor
			 * append-only.  Otherwise, if PROT_WRITE was
			 * requested, return EPERM.
			 */
			if (fp->f_flag & FWRITE) {
				error = VOP_GETATTR(vp, &va, p->p_ucred, p);
				if (error)
					goto out;
				if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
					maxprot |= PROT_WRITE;
				else if (prot & PROT_WRITE) {
					error = EPERM;
					goto out;
				}
			} else if (prot & PROT_WRITE) {
				error = EACCES;
				goto out;
			}
		} else {
			/* MAP_PRIVATE mappings can always be written to */
			maxprot |= PROT_WRITE;
		}
		if ((flags & __MAP_NOFAULT) != 0 ||
		    ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
			limit = lim_cur(RLIMIT_DATA);
			if (limit < size ||
			    limit - size < ptoa(p->p_vmspace->vm_dused)) {
				error = ENOMEM;
				goto out;
			}
		}
		error = uvm_mmapfile(&p->p_vmspace->vm_map, &addr, size, prot,
		    maxprot, flags, vp, pos, lim_cur(RLIMIT_MEMLOCK), p);
		FRELE(fp, p);
		KERNEL_UNLOCK();
	} else {		/* MAP_ANON case */
		if (fd != -1)
			return EINVAL;

is_anon:	/* label for SunOS style /dev/zero */

		/* __MAP_NOFAULT only makes sense with a backing object */
		if ((flags & __MAP_NOFAULT) != 0)
			return EINVAL;

		if (prot != PROT_NONE || (flags & MAP_SHARED)) {
			limit = lim_cur(RLIMIT_DATA);
			if (limit < size ||
			    limit - size < ptoa(p->p_vmspace->vm_dused)) {
				return ENOMEM;
			}
		}

		/*
		 * We've been treating (MAP_SHARED|MAP_PRIVATE) == 0 as
		 * MAP_PRIVATE, so make that clear.
		 */
		if ((flags & MAP_SHARED) == 0)
			flags |= MAP_PRIVATE;

		maxprot = PROT_MASK;
		error = uvm_mmapanon(&p->p_vmspace->vm_map, &addr, size, prot,
		    maxprot, flags, lim_cur(RLIMIT_MEMLOCK), p);
	}

	if (error == 0)
		/* remember to add offset */
		*retval = (register_t)(addr + pageoff);

	return error;

out:
	KERNEL_UNLOCK();
	if (fp)
		FRELE(fp, p);
	return error;
}

/*
 * sys_msync: the msync system call (a front-end for flush)
 */
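/*
 * Illustrative sketch (not part of the original source): flushing a
 * shared file mapping of "len" bytes at "p" back to its vnode and
 * waiting for the I/O to finish:
 *
 *	if (msync(p, len, MS_SYNC) == -1)
 *		err(1, "msync");
 *
 * As the code below shows, MS_ASYNC is currently forced to behave like
 * MS_SYNC, and MS_INVALIDATE additionally frees the cached pages.
 */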

int
sys_msync(struct proc *p, void *v, register_t *retval)
{
	struct sys_msync_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int flags, uvmflags;

	/* extract syscall args from the uap */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
			(flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
			(flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return EINVAL;
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/* align the address to a page boundary, and adjust the size accordingly */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	/* translate MS_ flags into PGO_ flags */
	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;
	else
		uvmflags |= PGO_SYNCIO;	 /* XXXCDC: force sync for now! */

	return uvm_map_clean(&p->p_vmspace->vm_map, addr, addr+size, uvmflags);
}

/*
 * sys_munmap: unmap a user's memory
 */
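/*
 * Illustrative sketch (not part of the original source): releasing a
 * mapping of "len" bytes at "p".  As the checkprot call below shows,
 * the whole range must currently be mapped, and EPERM is returned if
 * any of it has been marked immutable:
 *
 *	if (munmap(p, len) == -1)
 *		err(1, "munmap");
 */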
int
sys_munmap(struct proc *p, void *v, register_t *retval)
{
	struct sys_munmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_map_t map;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct uvm_map_deadq dead_entries;

	/* get syscall args... */
	addr = (vaddr_t) SCARG(uap, addr);
	size = (vsize_t) SCARG(uap, len);

	/* align address to a page boundary, and adjust size accordingly */
	ALIGN_ADDR(addr, size, pageoff);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 * Note that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (addr > SIZE_MAX - size)
		return EINVAL;
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return EINVAL;
	if (vm_min_address > 0 && addr < vm_min_address)
		return EINVAL;
	map = &p->p_vmspace->vm_map;


	vm_map_lock(map);	/* lock map so we can checkprot */

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */
	if (!uvm_map_checkprot(map, addr, addr + size, PROT_NONE)) {
		vm_map_unlock(map);
		return EINVAL;
	}

	TAILQ_INIT(&dead_entries);
	if (uvm_unmap_remove(map, addr, addr + size, &dead_entries,
	    FALSE, TRUE, TRUE) != 0) {
		vm_map_unlock(map);
		return EPERM;	/* immutable entries found */
	}
	vm_map_unlock(map);	/* and unlock */

	uvm_unmap_detach(&dead_entries, 0);

	return 0;
}

/*
 * sys_mprotect: the mprotect system call
 */
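/*
 * Illustrative sketch (not part of the original source): dropping
 * write permission from a region of "len" bytes at "p".  Requesting
 * PROT_WRITE|PROT_EXEC together is subject to the same uvm_wxcheck()
 * W^X policy as mmap(2):
 *
 *	if (mprotect(p, len, PROT_READ) == -1)
 *		err(1, "mprotect");
 */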
int
sys_mprotect(struct proc *p, void *v, register_t *retval)
{
	struct sys_mprotect_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot);

	if ((prot & PROT_MASK) != prot)
		return EINVAL;
	if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
	    (error = uvm_wxcheck(p, "mprotect")))
		return error;

	error = pledge_protexec(p, prot);
	if (error)
		return error;

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	return (uvm_map_protect(&p->p_vmspace->vm_map, addr, addr+size,
	    prot, 0, FALSE, TRUE));
}

/*
 * sys_pinsyscalls.  The caller is required to normalize base,len
 * to the minimum .text region, and adjust pintable offsets relative
 * to that base.
 */
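/*
 * Illustrative note (not part of the original source): if libc's text
 * occupies [text_base, text_base + text_len) and the stub for a given
 * system call starts at absolute address "stub", the caller passes
 * base = text_base, len = text_len and records the pin for that system
 * call as the offset stub - text_base in the pins table; unused slots
 * hold 0 or (u_int)-1, which the range check below skips.
 */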
int
sys_pinsyscalls(struct proc *p, void *v, register_t *retval)
{
	struct sys_pinsyscalls_args /* {
		syscallarg(void *) base;
		syscallarg(size_t) len;
		syscallarg(u_int *) pins;
		syscallarg(int) npins;
	} */ *uap = v;
	struct process *pr = p->p_p;
	struct vm_map *map = &p->p_vmspace->vm_map;
	int npins, error = 0, i;
	vaddr_t base;
	size_t len;
	u_int *pins;

	if (pr->ps_libcpin.pn_start ||
	    (pr->ps_vmspace->vm_map.flags & VM_MAP_PINSYSCALL_ONCE))
		return (EPERM);
	base = (vaddr_t)SCARG(uap, base);
	len = (vsize_t)SCARG(uap, len);
	if (base > SIZE_MAX - len)
		return (EINVAL);	/* disallow wrap-around. */
	if (base < map->min_offset || base+len > map->max_offset)
		return (EINVAL);

	/* XXX MP unlock */

	npins = SCARG(uap, npins);
	if (npins < 1 || npins > SYS_MAXSYSCALL)
		return (E2BIG);
	pins = malloc(npins * sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO);
	if (pins == NULL)
		return (ENOMEM);
	error = copyin(SCARG(uap, pins), pins, npins * sizeof(u_int));
	if (error)
		goto err;

	/* Range-check pintable offsets */
	for (i = 0; i < npins; i++) {
		if (pins[i] == (u_int)-1 || pins[i] == 0)
			continue;
		if (pins[i] > SCARG(uap, len)) {
			error = ERANGE;
			break;
		}
	}
	if (error) {
err:
		free(pins, M_PINSYSCALL, npins * sizeof(u_int));
		return (error);
	}
	pr->ps_libcpin.pn_start = base;
	pr->ps_libcpin.pn_end = base + len;
	pr->ps_libcpin.pn_pins = pins;
	pr->ps_libcpin.pn_npins = npins;
	pr->ps_flags |= PS_LIBCPIN;

#ifdef PMAP_CHECK_COPYIN
	/* Assume (and insist) on libc.so text being execute-only */
	if (PMAP_CHECK_COPYIN)
		uvm_map_check_copyin_add(map, base, base+len);
#endif
	return (0);
}

/*
 * sys_mimmutable: the mimmutable system call
 */
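/*
 * Illustrative sketch (not part of the original source): marking a
 * region of "len" bytes at "p" immutable, after which attempts to
 * change its protection or unmap it fail with EPERM:
 *
 *	if (mimmutable(p, len) == -1)
 *		err(1, "mimmutable");
 */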
int
sys_mimmutable(struct proc *p, void *v, register_t *retval)
{
	struct sys_mimmutable_args /* {
		immutablearg(void *) addr;
		immutablearg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	return uvm_map_immutable(&p->p_vmspace->vm_map, addr, addr+size, 1);
}

/*
 * sys_minherit: the minherit system call
 */
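/*
 * Illustrative sketch (not part of the original source): controlling
 * what a child sees after fork(2), here asking for a zero-filled copy
 * of the region instead of the parent's contents:
 *
 *	if (minherit(p, len, MAP_INHERIT_ZERO) == -1)
 *		err(1, "minherit");
 */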
int
sys_minherit(struct proc *p, void *v, register_t *retval)
{
	struct sys_minherit_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) inherit;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	return (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
	    inherit));
}

/*
 * sys_madvise: give advice about memory usage.
 */
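/*
 * Illustrative sketch (not part of the original source): telling the
 * VM system that the contents of "len" bytes at "p" are not needed
 * soon, allowing the pages to be deactivated (but not discarded):
 *
 *	if (madvise(p, len, MADV_DONTNEED) == -1)
 *		err(1, "madvise");
 *
 * MADV_FREE goes further and lets the pages, and any swap backing
 * them, be thrown away outright, as handled below.
 */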
int
sys_madvise(struct proc *p, void *v, register_t *retval)
{
	struct sys_madvise_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr,
		    addr + size, advice);
		break;

	case MADV_WILLNEED:
		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		/*
		 * XXX IMPLEMENT ME.
		 * Should invent a "weak" mode for uvm_fault()
		 * which would only do the PGO_LOCKED pgo_get().
		 */
		return 0;

	case MADV_DONTNEED:
		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */
		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:
		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */
		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:
		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */
		return EINVAL;

	default:
		return EINVAL;
	}

	return error;
}

/*
 * sys_mlock: memory lock
 */
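/*
 * Illustrative sketch (not part of the original source): wiring a
 * buffer so it cannot be paged out, subject to RLIMIT_MEMLOCK and the
 * global wired-page limit checked below:
 *
 *	if (mlock(buf, len) == -1)
 *		err(1, "mlock");
 */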

int
sys_mlock(struct proc *p, void *v, register_t *retval)
{
	struct sys_mlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/* extract syscall args from uap */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/* align address to a page boundary and adjust size accordingly */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return EAGAIN;

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
			lim_cur(RLIMIT_MEMLOCK))
		return EAGAIN;
#else
	if ((error = suser(p)) != 0)
		return error;
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
	    0);
	return error == 0 ? 0 : ENOMEM;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct proc *p, void *v, register_t *retval)
{
	struct sys_munlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/* extract syscall args from uap */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/* align address to a page boundary, and adjust size accordingly */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

#ifndef pmap_wired_count
	if ((error = suser(p)) != 0)
		return error;
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
	    0);
	return error == 0 ? 0 : ENOMEM;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */
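/*
 * Illustrative sketch (not part of the original source): wiring
 * everything mapped now and everything mapped later:
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 *
 * MCL_FUTURE is what later causes uvm_mmaplock() below to wire newly
 * created mappings via VM_MAP_WIREFUTURE.
 */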
int
sys_mlockall(struct proc *p, void *v, register_t *retval)
{
	struct sys_mlockall_args /* {
		syscallarg(int) flags;
	} */ *uap = v;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return EINVAL;

#ifndef pmap_wired_count
	if ((error = suser(p)) != 0)
		return error;
#endif

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    lim_cur(RLIMIT_MEMLOCK));
	if (error != 0 && error != ENOMEM)
		return EAGAIN;
	return error;
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */
int
sys_munlockall(struct proc *p, void *v, register_t *retval)
{

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return 0;
}

/*
 * common code for mmapanon and mmapfile to lock a mapping
 */
int
uvm_mmaplock(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vsize_t locklimit)
{
	int error;

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 */
	if (prot == PROT_NONE) {
		/*
		 * No more work to do in this case.
		 */
		return 0;
	}

	vm_map_lock(map);
	if (map->flags & VM_MAP_WIREFUTURE) {
		KERNEL_LOCK();
		if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
#ifdef pmap_wired_count
		    || (locklimit != 0 && (size +
			 ptoa(pmap_wired_count(vm_map_pmap(map)))) >
			locklimit)
#endif
		) {
			error = ENOMEM;
			vm_map_unlock(map);
			/* unmap the region! */
			uvm_unmap(map, *addr, *addr + size);
			KERNEL_UNLOCK();
			return error;
		}
		/*
		 * uvm_map_pageable() always returns the map
		 * unlocked.
		 */
		error = uvm_map_pageable(map, *addr, *addr + size,
		    FALSE, UVM_LK_ENTER);
		if (error != 0) {
			/* unmap the region! */
			uvm_unmap(map, *addr, *addr + size);
			KERNEL_UNLOCK();
			return error;
		}
		KERNEL_UNLOCK();
		return 0;
	}
	vm_map_unlock(map);
	return 0;
}

/*
 * uvm_mmapanon: internal version of mmap for anons
 *
 * - used by sys_mmap
 */
int
uvm_mmapanon(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vsize_t locklimit, struct proc *p)
{
	int error;
	int advice = MADV_NORMAL;
	unsigned int uvmflag = 0;
	vsize_t align = 0;	/* userland page size */

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */
	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);	/* round */
	} else {
		if (*addr & PAGE_MASK)
			return EINVAL;

		uvmflag |= UVM_FLAG_FIXED;
		if ((flags & __MAP_NOREPLACE) == 0)
			uvmflag |= UVM_FLAG_UNMAP;
	}

	if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ)
		align = __LDPGSZ;
	if ((flags & MAP_SHARED) == 0)
		/* XXX: defer amap create */
		uvmflag |= UVM_FLAG_COPYONW;
	else
		/* shared: create amap now */
		uvmflag |= UVM_FLAG_OVERLAY;
	if (flags & MAP_STACK)
		uvmflag |= UVM_FLAG_STACK;
	if (flags & MAP_CONCEAL)
		uvmflag |= UVM_FLAG_CONCEAL;

	/* set up mapping flags */
	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
	    advice, uvmflag);

	error = uvm_mapanon(map, addr, size, align, uvmflag);

	if (error == 0)
		error = uvm_mmaplock(map, addr, size, prot, locklimit);
	return error;
}

/*
 * uvm_mmapfile: internal version of mmap for non-anons
 *
 * - used by sys_mmap
 * - caller must page-align the file offset
 */
int
uvm_mmapfile(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, struct vnode *vp, voff_t foff,
    vsize_t locklimit, struct proc *p)
{
	struct uvm_object *uobj;
	int error;
	int advice = MADV_NORMAL;
	unsigned int uvmflag = 0;
	vsize_t align = 0;	/* userland page size */

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */
	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);	/* round */
	} else {
		if (*addr & PAGE_MASK)
			return EINVAL;

		uvmflag |= UVM_FLAG_FIXED;
		if ((flags & __MAP_NOREPLACE) == 0)
			uvmflag |= UVM_FLAG_UNMAP;
	}

	/*
	 * attach to underlying vm object.
	 */
	if (vp->v_type != VCHR) {
		uobj = uvn_attach(vp, (flags & MAP_SHARED) ?
		   maxprot : (maxprot & ~PROT_WRITE));

		/*
		 * XXXCDC: hack from old code
		 * don't allow vnodes which have been mapped
		 * shared-writeable to persist [forces them to be
		 * flushed out when last reference goes].
		 * XXXCDC: interesting side effect: avoids a bug.
		 * note that in WRITE [ufs_readwrite.c] that we
		 * allocate buffer, uncache, and then do the write.
		 * the problem with this is that if the uncache causes
		 * VM data to be flushed to the same area of the file
		 * we are writing to... in that case we've got the
		 * buffer locked and our process goes to sleep forever.
		 *
		 * XXXCDC: checking maxprot protects us from the
		 * "persistbug" program but this is not a long term
		 * solution.
		 *
		 * XXXCDC: we don't bother calling uncache with the vp
		 * VOP_LOCKed since we know that we are already
		 * holding a valid reference to the uvn (from the
		 * uvn_attach above), and thus it is impossible for
		 * the uncache to kill the uvn and trigger I/O.
		 */
		if (flags & MAP_SHARED) {
			if ((prot & PROT_WRITE) ||
			    (maxprot & PROT_WRITE)) {
				uvm_vnp_uncache(vp);
			}
		}
	} else {
		uobj = udv_attach(vp->v_rdev,
		    (flags & MAP_SHARED) ? maxprot :
		    (maxprot & ~PROT_WRITE), foff, size);
		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC, but we don't really have a
		 * XXX better way of handling this, right now
		 */
		if (uobj == NULL && (prot & PROT_EXEC) == 0) {
			maxprot &= ~PROT_EXEC;
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? maxprot :
			    (maxprot & ~PROT_WRITE), foff, size);
		}
		advice = MADV_RANDOM;
	}

	if (uobj == NULL)
		return vp->v_type == VREG ? ENOMEM : EINVAL;

	if ((flags & MAP_SHARED) == 0)
		uvmflag |= UVM_FLAG_COPYONW;
	if (flags & __MAP_NOFAULT)
		uvmflag |= (UVM_FLAG_NOFAULT | UVM_FLAG_OVERLAY);
	if (flags & MAP_STACK)
		uvmflag |= UVM_FLAG_STACK;
	if (flags & MAP_CONCEAL)
		uvmflag |= UVM_FLAG_CONCEAL;

	/* set up mapping flags */
	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
	    advice, uvmflag);

	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);

	if (error == 0)
		return uvm_mmaplock(map, addr, size, prot, locklimit);

	/* errors: first detach from the uobj, if any.  */
	if (uobj)
		uobj->pgops->pgo_detach(uobj);

	return error;
}

int
sys_kbind(struct proc *p, void *v, register_t *retval)
{
	struct sys_kbind_args /* {
		syscallarg(const struct __kbind *) param;
		syscallarg(size_t) psize;
		syscallarg(uint64_t) proc_cookie;
	} */ *uap = v;
	const struct __kbind *paramp;
	union {
		struct __kbind uk[KBIND_BLOCK_MAX];
		char upad[KBIND_BLOCK_MAX * sizeof(*paramp) + KBIND_DATA_MAX];
	} param;
	struct uvm_map_deadq dead_entries;
	struct process *pr = p->p_p;
	const char *data;
	vaddr_t baseva, last_baseva, endva, pageoffset, kva;
	size_t psize, s;
	u_long pc;
	int count, i, extra;
	int error, sigill = 0;

	/*
	 * extract syscall args from uap
	 */
	paramp = SCARG(uap, param);
	psize = SCARG(uap, psize);

	/*
	 * If paramp is NULL and we're uninitialized, disable the syscall
	 * for the process.  Raise SIGILL if paramp is NULL and we're
	 * already initialized.
	 *
	 * If paramp is non-NULL and we're uninitialized, do initialization.
	 * Otherwise, do security checks and raise SIGILL on failure.
	 */
	pc = PROC_PC(p);
	mtx_enter(&pr->ps_mtx);
	if (paramp == NULL) {
		/* ld.so disables kbind() when lazy binding is disabled */
		if (pr->ps_kbind_addr == 0)
			pr->ps_kbind_addr = BOGO_PC;
		/* pre-7.3 static binaries disable kbind */
		/* XXX delete check in 2026 */
		else if (pr->ps_kbind_addr != BOGO_PC)
			sigill = 1;
	} else if (pr->ps_kbind_addr == 0) {
		pr->ps_kbind_addr = pc;
		pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
	} else if (pc != pr->ps_kbind_addr || pc == BOGO_PC ||
	    pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) {
		sigill = 1;
	}
	mtx_leave(&pr->ps_mtx);

	/* Raise SIGILL if something is off. */
	if (sigill) {
		KERNEL_LOCK();
		sigexit(p, SIGILL);
		/* NOTREACHED */
		KERNEL_UNLOCK();
	}

	/* We're done if we were disabling the syscall. */
	if (paramp == NULL)
		return 0;

	if (psize < sizeof(struct __kbind) || psize > sizeof(param))
		return EINVAL;
	if ((error = copyin(paramp, &param, psize)))
		return error;

	/*
	 * The param argument points to an array of __kbind structures
	 * followed by the corresponding new data areas for them.  Verify
	 * that the sizes in the __kbind structures add up to the total
	 * size and find the start of the new area.
	 */
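	/*
	 * Illustrative layout (not part of the original source): for
	 * two bindings the copied-in buffer is
	 *
	 *	uk[0]: { kb_addr, kb_size }
	 *	uk[1]: { kb_addr, kb_size }
	 *	uk[0].kb_size bytes of new data for uk[0].kb_addr
	 *	uk[1].kb_size bytes of new data for uk[1].kb_addr
	 *
	 * and psize must equal the sum of the header and data sizes.
	 */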
	paramp = &param.uk[0];
	s = psize;
	for (count = 0; s > 0 && count < KBIND_BLOCK_MAX; count++) {
		if (s < sizeof(*paramp))
			return EINVAL;
		s -= sizeof(*paramp);

		baseva = (vaddr_t)paramp[count].kb_addr;
		endva = baseva + paramp[count].kb_size - 1;
		if (paramp[count].kb_addr == NULL ||
		    paramp[count].kb_size == 0 ||
		    paramp[count].kb_size > KBIND_DATA_MAX ||
		    baseva >= VM_MAXUSER_ADDRESS ||
		    endva >= VM_MAXUSER_ADDRESS ||
		    s < paramp[count].kb_size)
			return EINVAL;

		s -= paramp[count].kb_size;
	}
	if (s > 0)
		return EINVAL;
	data = (const char *)&paramp[count];

	/* all looks good, so do the bindings */
	last_baseva = VM_MAXUSER_ADDRESS;
	kva = 0;
	TAILQ_INIT(&dead_entries);
	for (i = 0; i < count; i++) {
		baseva = (vaddr_t)paramp[i].kb_addr;
		s = paramp[i].kb_size;
		pageoffset = baseva & PAGE_MASK;
		baseva = trunc_page(baseva);

		/* hppa at least runs PLT entries over page edge */
		extra = (pageoffset + s) & PAGE_MASK;
		if (extra > pageoffset)
			extra = 0;
		else
			s -= extra;
redo:
		/* make sure the desired page is mapped into kernel_map */
		if (baseva != last_baseva) {
			if (kva != 0) {
				vm_map_lock(kernel_map);
				uvm_unmap_remove(kernel_map, kva,
				    kva+PAGE_SIZE, &dead_entries,
				    FALSE, TRUE, FALSE);	/* XXX */
				vm_map_unlock(kernel_map);
				kva = 0;
			}
			if ((error = uvm_map_extract(&p->p_vmspace->vm_map,
			    baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT)))
				break;
			last_baseva = baseva;
		}

		/* do the update */
		if ((error = kcopy(data, (char *)kva + pageoffset, s)))
			break;
		data += s;

		if (extra > 0) {
			baseva += PAGE_SIZE;
			s = extra;
			pageoffset = 0;
			extra = 0;
			goto redo;
		}
	}

	if (kva != 0) {
		vm_map_lock(kernel_map);
		uvm_unmap_remove(kernel_map, kva, kva+PAGE_SIZE,
		    &dead_entries, FALSE, TRUE, FALSE);		/* XXX */
		vm_map_unlock(kernel_map);
	}
	uvm_unmap_detach(&dead_entries, AMAP_REFALL);

	return error;
}