/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

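/*
 * Mask of the membarrier(2) commands implemented below; it is also the
 * value returned for MEMBARRIER_CMD_QUERY.
 */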
#define MEMBARRIER_SUPPORTED_CMDS	(			\
    MEMBARRIER_CMD_GLOBAL |					\
    MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |		\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)

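/*
 * IPI action: issue a full (sequentially consistent) memory barrier on
 * the interrupted CPU.
 */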
static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

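/*
 * IPI action: full memory barrier plus serialization of the
 * instruction stream (cpu_sync_core()) on the interrupted CPU.
 */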
static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

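/*
 * Execute func on every CPU in *csp via an SMP rendezvous.  The fences
 * before and after the rendezvous order the caller's memory accesses
 * with respect to the barriers executed on the remote CPUs.
 */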
static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}

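/*
 * Helper for MEMBARRIER_CMD_GLOBAL: decide whether CPU c may be marked
 * as done in *csp.  A CPU is done once it is observed running its idle
 * thread, or once its pc_switchtime differs from the value recorded on
 * the first pass (init == true), i.e. it performed a context switch in
 * the meantime.
 */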
static void
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return;

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return;
	}

	/*
	 * Synchronize with the context switch code to ensure that the
	 * store of a non-idle thread pointer into pc_curthread is
	 * visible before pc_switchtime is read.
	 */
	atomic_thread_fence_acq();

	sw = pc->pc_switchtime;
	if (init)
		swt[c] = sw;
	else if (sw != swt[c])
		CPU_SET(c, csp);
}

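/*
 * kern_membarrier() implements the membarrier(2) commands: the query,
 * the per-process registrations (P2_MEMBAR_* flags), and the barrier
 * commands themselves.
 */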
/*
 * XXXKIB: We execute the requested action (seq_cst and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return path; e.g. on amd64 we
 * typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool first;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;
	error = 0;

	switch (cmd) {
	case MEMBARRIER_CMD_GLOBAL:
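		/*
		 * Non-expedited global barrier: instead of IPIs, poll
		 * until every CPU has either been seen idle or has
		 * context-switched since the initial scan, sleeping
		 * for a tick between passes.
		 */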
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		for (first = true; error == 0; first = false) {
			CPU_FOREACH(c)
				check_cpu_switched(c, &cs, swt, first);
			if (CPU_CMP(&cs, &all_cpus) == 0)
				break;
			error = pause_sig("mmbr", 1);
			if (error == EWOULDBLOCK)
				error = 0;
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;

	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
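		/*
		 * IPI the CPUs that currently run a thread of a process
		 * registered with P2_MEMBAR_GLOBE; the caller itself
		 * must be registered, otherwise return EPERM.
		 */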
		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
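		/*
		 * IPI the CPUs on which the caller's pmap is active,
		 * i.e. the CPUs that may be running threads of this
		 * process.
		 */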
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that were missing
			 * from the pmap active mask but could be
			 * switched from or to in the meantime.  This
			 * is fine at least on amd64, because threads
			 * always use the slow (IRETQ) path to return
			 * from a syscall after a context switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);

			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

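/*
 * membarrier(2) system call entry point; unpack the arguments and
 * forward them to kern_membarrier().
 */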
int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}
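
/*
 * Illustrative userland usage sketch, not part of this file.  It
 * assumes a libc membarrier() wrapper with the same (cmd, flags,
 * cpu_id) signature as the syscall above, with command constants taken
 * from <sys/membarrier.h>:
 *
 *	// Register the process once, then issue expedited private
 *	// barriers on demand.
 *	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
 *	    0, 0) == -1)
 *		err(1, "membarrier register");
 *	...
 *	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0) == -1)
 *		err(1, "membarrier");
 */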