1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * xsave/xrstor support.
4 *
5 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6 */
7#include <linux/bitops.h>
8#include <linux/compat.h>
9#include <linux/cpu.h>
10#include <linux/mman.h>
11#include <linux/nospec.h>
12#include <linux/pkeys.h>
13#include <linux/seq_file.h>
14#include <linux/proc_fs.h>
15#include <linux/vmalloc.h>
16
17#include <asm/fpu/api.h>
18#include <asm/fpu/regset.h>
19#include <asm/fpu/signal.h>
20#include <asm/fpu/xcr.h>
21
22#include <asm/tlbflush.h>
23#include <asm/prctl.h>
24#include <asm/elf.h>
25
26#include "context.h"
27#include "internal.h"
28#include "legacy.h"
29#include "xstate.h"
30
/*
 * Walk the bits set in @mask, beginning at bit FIRST_EXTENDED_XFEATURE,
 * with @bit holding the current bit number inside the loop body.
 *
 * NOTE: this expands to an assignment statement followed by a for-loop
 * header, i.e. two statements. Do not use it as the sole body of an
 * unbraced if/else/loop. @mask must be an lvalue because its address is
 * taken.
 */
#define for_each_extended_xfeature(bit, mask)				\
	(bit) = FIRST_EXTENDED_XFEATURE;				\
	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
34
/*
 * Human-readable names of the xfeatures, indexed by xfeature number.
 * cpu_has_xfeatures() uses this table and limits larger indices to the
 * last entry.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
	"x87 floating point registers",			/*  0 */
	"SSE registers",				/*  1 */
	"AVX registers",				/*  2 */
	"MPX bounds registers",				/*  3 */
	"MPX CSR",					/*  4 */
	"AVX-512 opmask",				/*  5 */
	"AVX-512 Hi256",				/*  6 */
	"AVX-512 ZMM_Hi256",				/*  7 */
	"Processor Trace (unused)",			/*  8 */
	"Protection Keys User registers",		/*  9 */
	"PASID state",					/* 10 */
	"Control-flow User registers",			/* 11 */
	"Control-flow Kernel registers (unused)",	/* 12 */
	"unknown xstate feature",			/* 13 */
	"unknown xstate feature",			/* 14 */
	"unknown xstate feature",			/* 15 */
	"unknown xstate feature",			/* 16 */
	"AMX Tile config",				/* 17 */
	"AMX Tile data",				/* 18 */
	"unknown xstate feature",			/* 19: catch-all for anything above */
};
63
/*
 * For each xfeature number, the X86_FEATURE_* flag it depends on.
 * fpu__init_system_xstate() walks this table and drops every xfeature
 * whose flag is not set on the boot CPU.  Slots not named by a designated
 * initializer are zero and are dropped as well; XFEATURE_FP is the one
 * exception because X86_FEATURE_FPU itself is 0.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
};
80
/*
 * Per-xfeature layout information, indexed by xfeature number and filled
 * in by setup_xstate_cache():
 *
 *  xstate_offsets: offset in the non-compacted XSAVE area (CPUID EBX).
 *                  Stays at -1 (i.e. UINT_MAX) for features that are not
 *                  enabled and for supervisor features.
 *  xstate_sizes:   size of the component in bytes (CPUID EAX); -1 until set.
 *  xstate_flags:   raw CPUID ECX value, tested with XSTATE_FLAG_* below.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/* Bits of xstate_flags[]; see xfeature_is_supervisor()/xfeature_is_aligned64() */
#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)
89
90/*
91 * Return whether the system supports a given xfeature.
92 *
93 * Also return the name of the (most advanced) feature that the caller requested:
94 */
95int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
96{
97	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
98
99	if (unlikely(feature_name)) {
100		long xfeature_idx, max_idx;
101		u64 xfeatures_print;
102		/*
103		 * So we use FLS here to be able to print the most advanced
104		 * feature that was requested but is missing. So if a driver
105		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
106		 * missing AVX feature - this is the most informative message
107		 * to users:
108		 */
109		if (xfeatures_missing)
110			xfeatures_print = xfeatures_missing;
111		else
112			xfeatures_print = xfeatures_needed;
113
114		xfeature_idx = fls64(xfeatures_print)-1;
115		max_idx = ARRAY_SIZE(xfeature_names)-1;
116		xfeature_idx = min(xfeature_idx, max_idx);
117
118		*feature_name = xfeature_names[xfeature_idx];
119	}
120
121	if (xfeatures_missing)
122		return 0;
123
124	return 1;
125}
126EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
127
128static bool xfeature_is_aligned64(int xfeature_nr)
129{
130	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
131}
132
133static bool xfeature_is_supervisor(int xfeature_nr)
134{
135	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
136}
137
138static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
139{
140	unsigned int offs, i;
141
142	/*
143	 * Non-compacted format and legacy features use the cached fixed
144	 * offsets.
145	 */
146	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
147	    xfeature <= XFEATURE_SSE)
148		return xstate_offsets[xfeature];
149
150	/*
151	 * Compacted format offsets depend on the actual content of the
152	 * compacted xsave area which is determined by the xcomp_bv header
153	 * field.
154	 */
155	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
156	for_each_extended_xfeature(i, xcomp_bv) {
157		if (xfeature_is_aligned64(i))
158			offs = ALIGN(offs, 64);
159		if (i == xfeature)
160			break;
161		offs += xstate_sizes[i];
162	}
163	return offs;
164}
165
/*
 * Enable the extended processor state save/restore feature.
 * Called once per CPU onlining.
 *
 * Does nothing when the boot CPU lacks XSAVE or when no xfeatures are
 * enabled in fpu_kernel_cfg.max_features.  Otherwise the steps below run
 * in a fixed order: CR4.OSXSAVE, then XFD (if available), then XCR0,
 * then IA32_XSS (if XSAVES is available).
 */
void fpu__init_cpu_xstate(void)
{
	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
		return;

	cr4_set_bits(X86_CR4_OSXSAVE);

	/*
	 * Must happen after CR4 setup and before xsetbv() to allow KVM
	 * lazy passthrough.  Write independent of the dynamic state static
	 * key as that does not work on the boot CPU. This also ensures
	 * that any stale state is wiped out from XFD. Reset the per CPU
	 * xfd cache too.
	 */
	if (cpu_feature_enabled(X86_FEATURE_XFD))
		xfd_set_state(init_fpstate.xfd);

	/*
	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
	 * states can be set here.
	 */
	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

	/*
	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
	 * Both the regular and the independent supervisor features are
	 * enabled here.
	 */
	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
				     xfeatures_mask_independent());
	}
}
202
203static bool xfeature_enabled(enum xfeature xfeature)
204{
205	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
206}
207
208/*
209 * Record the offsets and sizes of various xstates contained
210 * in the XSAVE state memory layout.
211 */
212static void __init setup_xstate_cache(void)
213{
214	u32 eax, ebx, ecx, edx, i;
215	/* start at the beginning of the "extended state" */
216	unsigned int last_good_offset = offsetof(struct xregs_state,
217						 extended_state_area);
218	/*
219	 * The FP xstates and SSE xstates are legacy states. They are always
220	 * in the fixed offsets in the xsave area in either compacted form
221	 * or standard form.
222	 */
223	xstate_offsets[XFEATURE_FP]	= 0;
224	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
225						   xmm_space);
226
227	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
228	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
229						       xmm_space);
230
231	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
232		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
233
234		xstate_sizes[i] = eax;
235		xstate_flags[i] = ecx;
236
237		/*
238		 * If an xfeature is supervisor state, the offset in EBX is
239		 * invalid, leave it to -1.
240		 */
241		if (xfeature_is_supervisor(i))
242			continue;
243
244		xstate_offsets[i] = ebx;
245
246		/*
247		 * In our xstate size checks, we assume that the highest-numbered
248		 * xstate feature has the highest offset in the buffer.  Ensure
249		 * it does.
250		 */
251		WARN_ONCE(last_good_offset > xstate_offsets[i],
252			  "x86/fpu: misordered xstate at %d\n", last_good_offset);
253
254		last_good_offset = xstate_offsets[i];
255	}
256}
257
258static void __init print_xstate_feature(u64 xstate_mask)
259{
260	const char *feature_name;
261
262	if (cpu_has_xfeatures(xstate_mask, &feature_name))
263		pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
264}
265
266/*
267 * Print out all the supported xstate features:
268 */
269static void __init print_xstate_features(void)
270{
271	print_xstate_feature(XFEATURE_MASK_FP);
272	print_xstate_feature(XFEATURE_MASK_SSE);
273	print_xstate_feature(XFEATURE_MASK_YMM);
274	print_xstate_feature(XFEATURE_MASK_BNDREGS);
275	print_xstate_feature(XFEATURE_MASK_BNDCSR);
276	print_xstate_feature(XFEATURE_MASK_OPMASK);
277	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
278	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
279	print_xstate_feature(XFEATURE_MASK_PKRU);
280	print_xstate_feature(XFEATURE_MASK_PASID);
281	print_xstate_feature(XFEATURE_MASK_CET_USER);
282	print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
283	print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
284}
285
/*
 * Warn if @nr is not a valid extended xfeature *number*, i.e. if it is
 * below FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX.
 *
 * This check is important because it is easy to confuse a feature number
 * with a feature bit mask (presumably XFEATURE_* vs. XFEATURE_MASK_*; the
 * historic wording named XSTATE_* and XSTATE_BIT_*).
 *
 * @nr is parenthesized so that expression arguments are compared as a
 * whole.  Note that it is evaluated twice.
 */
#define CHECK_XFEATURE(nr) do {			\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);		\
} while (0)
294
295/*
296 * Print out xstate component offsets and sizes
297 */
298static void __init print_xstate_offset_size(void)
299{
300	int i;
301
302	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
303		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
304			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
305			i, xstate_sizes[i]);
306	}
307}
308
309/*
310 * This function is called only during boot time when x86 caps are not set
311 * up and alternative can not be used yet.
312 */
313static __init void os_xrstor_booting(struct xregs_state *xstate)
314{
315	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
316	u32 lmask = mask;
317	u32 hmask = mask >> 32;
318	int err;
319
320	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
321		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
322	else
323		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
324
325	/*
326	 * We should never fault when copying from a kernel buffer, and the FPU
327	 * state we set at boot time should be valid.
328	 */
329	WARN_ON_FPU(err);
330}
331
/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 *
 * setup_init_fpu_buf() has a BUILD_BUG_ON() comparing this list with
 * XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM	 |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_CET_USER |		\
	 XFEATURE_MASK_XTILE)
352
/*
 * Set up the xstate image representing the init state (init_fpstate).
 *
 * Does nothing at runtime when the boot CPU has no XSAVE support.  Also
 * prints the supported xstate features as a side effect.
 */
static void __init setup_init_fpu_buf(void)
{
	/*
	 * Build-time guard: every supported feature must be listed in
	 * XFEATURES_INIT_FPSTATE_HANDLED, see the comment at its definition.
	 */
	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
		     XFEATURES_INIT_FPSTATE_HANDLED);

	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return;

	print_xstate_features();

	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);

	/*
	 * Init all the features state with header.xfeatures being 0x0
	 */
	os_xrstor_booting(&init_fpstate.regs.xsave);

	/*
	 * All components are now in init state. Read the state back so
	 * that init_fpstate contains all non-zero init state. This only
	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
	 * those use the init optimization which skips writing data for
	 * components in init state.
	 *
	 * XSAVE could be used, but that would require to reshuffle the
	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
	 * compaction. But doing so is a pointless exercise because most
	 * components have an all zeros init state except for the legacy
	 * ones (FP and SSE). Those can be saved with FXSAVE into the
	 * legacy area. Adding new features requires to ensure that init
	 * state is all zeroes or if not to add the necessary handling
	 * here.
	 */
	fxsave(&init_fpstate.regs.fxsave);
}
392
393int xfeature_size(int xfeature_nr)
394{
395	u32 eax, ebx, ecx, edx;
396
397	CHECK_XFEATURE(xfeature_nr);
398	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
399	return eax;
400}
401
402/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
403static int validate_user_xstate_header(const struct xstate_header *hdr,
404				       struct fpstate *fpstate)
405{
406	/* No unknown or supervisor features may be set */
407	if (hdr->xfeatures & ~fpstate->user_xfeatures)
408		return -EINVAL;
409
410	/* Userspace must use the uncompacted format */
411	if (hdr->xcomp_bv)
412		return -EINVAL;
413
414	/*
415	 * If 'reserved' is shrunken to add a new field, make sure to validate
416	 * that new field here!
417	 */
418	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
419
420	/* No reserved bits may be set */
421	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
422		return -EINVAL;
423
424	return 0;
425}
426
427static void __init __xstate_dump_leaves(void)
428{
429	int i;
430	u32 eax, ebx, ecx, edx;
431	static int should_dump = 1;
432
433	if (!should_dump)
434		return;
435	should_dump = 0;
436	/*
437	 * Dump out a few leaves past the ones that we support
438	 * just in case there are some goodies up there
439	 */
440	for (i = 0; i < XFEATURE_MAX + 10; i++) {
441		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
442		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
443			XSTATE_CPUID, i, eax, ebx, ecx, edx);
444	}
445}
446
/*
 * WARN_ONCE() wrapper for XSAVE consistency checks: prefixes the message
 * with "XSAVE consistency problem: " and, when the warning is taken,
 * additionally dumps the XSTATE CPUID leaves via __xstate_dump_leaves().
 */
#define XSTATE_WARN_ON(x, fmt, ...) do {					\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)
452
/*
 * Compare the CPU-reported size @sz of xfeature @nr with sizeof(@__struct).
 * A mismatch triggers a one-time warning plus a CPUID leaf dump, but the
 * expression evaluates to true either way, so a mismatch alone is not
 * reported as a failure to the caller.
 */
#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE(sz != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[nr], sizeof(__struct), sz)) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})
461
462
463/**
464 * check_xtile_data_against_struct - Check tile data state size.
465 *
466 * Calculate the state size by multiplying the single tile size which is
467 * recorded in a C struct, and the number of tiles that the CPU informs.
468 * Compare the provided size with the calculation.
469 *
470 * @size:	The tile data state size
471 *
472 * Returns:	0 on success, -EINVAL on mismatch.
473 */
474static int __init check_xtile_data_against_struct(int size)
475{
476	u32 max_palid, palid, state_size;
477	u32 eax, ebx, ecx, edx;
478	u16 max_tile;
479
480	/*
481	 * Check the maximum palette id:
482	 *   eax: the highest numbered palette subleaf.
483	 */
484	cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
485
486	/*
487	 * Cross-check each tile size and find the maximum number of
488	 * supported tiles.
489	 */
490	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
491		u16 tile_size, max;
492
493		/*
494		 * Check the tile size info:
495		 *   eax[31:16]:  bytes per title
496		 *   ebx[31:16]:  the max names (or max number of tiles)
497		 */
498		cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
499		tile_size = eax >> 16;
500		max = ebx >> 16;
501
502		if (tile_size != sizeof(struct xtile_data)) {
503			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
504			       __stringify(XFEATURE_XTILE_DATA),
505			       sizeof(struct xtile_data), tile_size);
506			__xstate_dump_leaves();
507			return -EINVAL;
508		}
509
510		if (max > max_tile)
511			max_tile = max;
512	}
513
514	state_size = sizeof(struct xtile_data) * max_tile;
515	if (size != state_size) {
516		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
517		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
518		__xstate_dump_leaves();
519		return -EINVAL;
520	}
521	return 0;
522}
523
/*
 * We have a C struct for each 'xstate'.  We need to ensure
 * that our software representation matches what the CPU
 * tells us about the state's size.
 *
 * Returns false only for an xfeature number that has no case below (a
 * warning is issued as well).  For known features the result is always
 * true: XCHECK_SZ() only warns on a size mismatch, and the return value
 * of check_xtile_data_against_struct() is not used here.
 */
static bool __init check_xstate_against_struct(int nr)
{
	/*
	 * Ask the CPU for the size of the state.
	 */
	int sz = xfeature_size(nr);

	/*
	 * Match each CPU state with the corresponding software
	 * structure.
	 */
	switch (nr) {
	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
	/* Tile data has no single fixed-size struct; it gets its own check. */
	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
	default:
		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
		return false;
	}

	/* Not reached: every switch arm above returns. */
	return true;
}
559
560static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
561{
562	unsigned int topmost = fls64(xfeatures) -  1;
563	unsigned int offset = xstate_offsets[topmost];
564
565	if (topmost <= XFEATURE_SSE)
566		return sizeof(struct xregs_state);
567
568	if (compacted)
569		offset = xfeature_get_offset(xfeatures, topmost);
570	return offset + xstate_sizes[topmost];
571}
572
573/*
574 * This essentially double-checks what the cpu told us about
575 * how large the XSAVE buffer needs to be.  We are recalculating
576 * it to be safe.
577 *
578 * Independent XSAVE features allocate their own buffers and are not
579 * covered by these checks. Only the size of the buffer for task->fpu
580 * is checked here.
581 */
582static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
583{
584	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
585	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
586	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
587	int i;
588
589	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
590		if (!check_xstate_against_struct(i))
591			return false;
592		/*
593		 * Supervisor state components can be managed only by
594		 * XSAVES.
595		 */
596		if (!xsaves && xfeature_is_supervisor(i)) {
597			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
598			return false;
599		}
600	}
601	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
602	XSTATE_WARN_ON(size != kernel_size,
603		       "size %u != kernel_size %u\n", size, kernel_size);
604	return size == kernel_size;
605}
606
607/*
608 * Get total size of enabled xstates in XCR0 | IA32_XSS.
609 *
610 * Note the SDM's wording here.  "sub-function 0" only enumerates
611 * the size of the *user* states.  If we use it to size a buffer
612 * that we use 'XSAVES' on, we could potentially overflow the
613 * buffer because 'XSAVES' saves system states too.
614 *
615 * This also takes compaction into account. So this works for
616 * XSAVEC as well.
617 */
618static unsigned int __init get_compacted_size(void)
619{
620	unsigned int eax, ebx, ecx, edx;
621	/*
622	 * - CPUID function 0DH, sub-function 1:
623	 *    EBX enumerates the size (in bytes) required by
624	 *    the XSAVES instruction for an XSAVE area
625	 *    containing all the state components
626	 *    corresponding to bits currently set in
627	 *    XCR0 | IA32_XSS.
628	 *
629	 * When XSAVES is not available but XSAVEC is (virt), then there
630	 * are no supervisor states, but XSAVEC still uses compacted
631	 * format.
632	 */
633	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
634	return ebx;
635}
636
637/*
638 * Get the total size of the enabled xstates without the independent supervisor
639 * features.
640 */
641static unsigned int __init get_xsave_compacted_size(void)
642{
643	u64 mask = xfeatures_mask_independent();
644	unsigned int size;
645
646	if (!mask)
647		return get_compacted_size();
648
649	/* Disable independent features. */
650	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
651
652	/*
653	 * Ask the hardware what size is required of the buffer.
654	 * This is the size required for the task->fpu buffer.
655	 */
656	size = get_compacted_size();
657
658	/* Re-enable independent features so XSAVES will work on them again. */
659	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
660
661	return size;
662}
663
664static unsigned int __init get_xsave_size_user(void)
665{
666	unsigned int eax, ebx, ecx, edx;
667	/*
668	 * - CPUID function 0DH, sub-function 0:
669	 *    EBX enumerates the size (in bytes) required by
670	 *    the XSAVE instruction for an XSAVE area
671	 *    containing all the *user* state components
672	 *    corresponding to bits currently set in XCR0.
673	 */
674	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
675	return ebx;
676}
677
678static int __init init_xstate_size(void)
679{
680	/* Recompute the context size for enabled features: */
681	unsigned int user_size, kernel_size, kernel_default_size;
682	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
683
684	/* Uncompacted user space size */
685	user_size = get_xsave_size_user();
686
687	/*
688	 * XSAVES kernel size includes supervisor states and uses compacted
689	 * format. XSAVEC uses compacted format, but does not save
690	 * supervisor states.
691	 *
692	 * XSAVE[OPT] do not support supervisor states so kernel and user
693	 * size is identical.
694	 */
695	if (compacted)
696		kernel_size = get_xsave_compacted_size();
697	else
698		kernel_size = user_size;
699
700	kernel_default_size =
701		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
702
703	if (!paranoid_xstate_size_valid(kernel_size))
704		return -EINVAL;
705
706	fpu_kernel_cfg.max_size = kernel_size;
707	fpu_user_cfg.max_size = user_size;
708
709	fpu_kernel_cfg.default_size = kernel_default_size;
710	fpu_user_cfg.default_size =
711		xstate_calculate_size(fpu_user_cfg.default_features, false);
712
713	return 0;
714}
715
/*
 * We enabled the XSAVE hardware, but something went wrong and
 * we can not use it.  Disable it.
 *
 * @legacy_size: context size to fall back to; it is written to all four
 *		 max/default size fields of the kernel and user configs.
 */
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
	/* No xfeatures at all from here on; undo CR4.OSXSAVE and the XSAVE cap */
	fpu_kernel_cfg.max_features = 0;
	cr4_clear_bits(X86_CR4_OSXSAVE);
	setup_clear_cpu_cap(X86_FEATURE_XSAVE);

	/* Restore the legacy size.*/
	fpu_kernel_cfg.max_size = legacy_size;
	fpu_kernel_cfg.default_size = legacy_size;
	fpu_user_cfg.max_size = legacy_size;
	fpu_user_cfg.default_size = legacy_size;

	/*
	 * Prevent enabling the static branch which enables writes to the
	 * XFD MSR.
	 */
	init_fpstate.xfd = 0;

	/* Reset the current task's FPU state now that the sizes changed */
	fpstate_reset(&current->thread.fpu);
}
740
/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 *
 * @legacy_size: context size to fall back to if XSAVE has to be disabled
 *		 again because one of the consistency checks below fails.
 *
 * Returns early without touching anything if there is no FPU, no XSAVE
 * support, or the CPUID level is too low for XSTATE_CPUID.  Any failure
 * after feature enumeration ends in fpu__init_disable_system_xstate().
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
	unsigned int eax, ebx, ecx, edx;
	u64 xfeatures;
	int err;
	int i;

	if (!boot_cpu_has(X86_FEATURE_FPU)) {
		pr_info("x86/fpu: No FPU detected\n");
		return;
	}

	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
		pr_info("x86/fpu: x87 FPU will use %s\n",
			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
		return;
	}

	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
		WARN_ON_FPU(1);
		return;
	}

	/*
	 * Find user xstates supported by the processor.
	 */
	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

	/*
	 * Find supervisor xstates supported by the processor.
	 */
	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
		/*
		 * This indicates that something really unexpected happened
		 * with the enumeration.  Disable XSAVE and try to continue
		 * booting without it.  This is too early to BUG().
		 */
		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
		       fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/*
	 * Clear XSAVE features that are disabled in the normal CPUID.
	 * Table slots without an entry (value 0) are cleared as well.
	 */
	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
		unsigned short cid = xsave_cpuid_features[i];

		/* Careful: X86_FEATURE_FPU is 0! */
		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
	}

	/* Dynamic user features are only kept when XFD is available */
	if (!cpu_feature_enabled(X86_FEATURE_XFD))
		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Supervisor features are only kept when XSAVES is available */
	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
	else
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
					XFEATURE_MASK_SUPERVISOR_SUPPORTED;

	/* The user view is the kernel view restricted to user features */
	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

	/* Clean out dynamic features from default */
	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
	fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	fpu_user_cfg.default_features = fpu_user_cfg.max_features;
	fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Store it for paranoia check at the end */
	xfeatures = fpu_kernel_cfg.max_features;

	/*
	 * Initialize the default XFD state in init_fpstate and enable the
	 * dynamic sizing mechanism if dynamic states are available.  The
	 * static key cannot be enabled here because this runs before
	 * jump_label_init(). This is delayed to an initcall.
	 */
	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

	/* Set up compaction feature bit */
	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
	    cpu_feature_enabled(X86_FEATURE_XSAVES))
		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

	/* Enable xstate instructions to be able to continue with initialization: */
	fpu__init_cpu_xstate();

	/* Cache size, offset and flags for initialization */
	setup_xstate_cache();

	err = init_xstate_size();
	if (err)
		goto out_disable;

	/* Reset the state for the current task */
	fpstate_reset(&current->thread.fpu);

	/*
	 * Update info used for ptrace frames; use standard-format size and no
	 * supervisor xstates:
	 */
	update_regset_xstate_info(fpu_user_cfg.max_size,
				  fpu_user_cfg.max_features);

	/*
	 * init_fpstate excludes dynamic states as they are large but init
	 * state is zero.
	 */
	init_fpstate.size		= fpu_kernel_cfg.default_size;
	init_fpstate.xfeatures		= fpu_kernel_cfg.default_features;

	/* The default-size state must fit into init_fpstate's register buffer */
	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
			sizeof(init_fpstate.regs), init_fpstate.size);
		goto out_disable;
	}

	setup_init_fpu_buf();

	/*
	 * Paranoia check whether something in the setup modified the
	 * xfeatures mask.
	 */
	if (xfeatures != fpu_kernel_cfg.max_features) {
		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
		       xfeatures, fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/*
	 * CPU capabilities initialization runs before FPU init. So
	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
	 * functional, set the feature bit so depending code works.
	 */
	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

	print_xstate_offset_size();
	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
		fpu_kernel_cfg.max_features,
		fpu_kernel_cfg.max_size,
		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
	return;

out_disable:
	/* something went wrong, try to boot without any XSAVE support */
	fpu__init_disable_system_xstate(legacy_size);
}
900
901/*
902 * Restore minimal FPU state after suspend:
903 */
904void fpu__resume_cpu(void)
905{
906	/*
907	 * Restore XCR0 on xsave capable CPUs:
908	 */
909	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
910		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
911
912	/*
913	 * Restore IA32_XSS. The same CPUID bit enumerates support
914	 * of XSAVES and MSR_IA32_XSS.
915	 */
916	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
917		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
918				     xfeatures_mask_independent());
919	}
920
921	if (fpu_state_size_dynamic())
922		wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
923}
924
925/*
926 * Given an xstate feature nr, calculate where in the xsave
927 * buffer the state is.  Callers should ensure that the buffer
928 * is valid.
929 */
930static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
931{
932	u64 xcomp_bv = xsave->header.xcomp_bv;
933
934	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
935		return NULL;
936
937	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
938		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
939			return NULL;
940	}
941
942	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
943}
944
945/*
946 * Given the xsave area and a state inside, this function returns the
947 * address of the state.
948 *
949 * This is the API that is called to get xstate address in either
950 * standard format or compacted format of xsave area.
951 *
952 * Note that if there is no data for the field in the xsave buffer
953 * this will return NULL.
954 *
955 * Inputs:
956 *	xstate: the thread's storage area for all FPU data
957 *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
958 *	XFEATURE_SSE, etc...)
959 * Output:
960 *	address of the state in the xsave area, or NULL if the
961 *	field is not present in the xsave buffer.
962 */
963void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
964{
965	/*
966	 * Do we even *have* xsave state?
967	 */
968	if (!boot_cpu_has(X86_FEATURE_XSAVE))
969		return NULL;
970
971	/*
972	 * We should not ever be requesting features that we
973	 * have not enabled.
974	 */
975	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
976		return NULL;
977
978	/*
979	 * This assumes the last 'xsave*' instruction to
980	 * have requested that 'xfeature_nr' be saved.
981	 * If it did not, we might be seeing and old value
982	 * of the field in the buffer.
983	 *
984	 * This can happen because the last 'xsave' did not
985	 * request that this feature be saved (unlikely)
986	 * or because the "init optimization" caused it
987	 * to not be saved.
988	 */
989	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
990		return NULL;
991
992	return __raw_xsave_addr(xsave, xfeature_nr);
993}
994EXPORT_SYMBOL_GPL(get_xsave_addr);
995
996#ifdef CONFIG_ARCH_HAS_PKEYS
997
998/*
999 * This will go out and modify PKRU register to set the access
1000 * rights for @pkey to @init_val.
1001 */
1002int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1003			      unsigned long init_val)
1004{
1005	u32 old_pkru, new_pkru_bits = 0;
1006	int pkey_shift;
1007
1008	/*
1009	 * This check implies XSAVE support.  OSPKE only gets
1010	 * set if we enable XSAVE and we enable PKU in XCR0.
1011	 */
1012	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1013		return -EINVAL;
1014
1015	/*
1016	 * This code should only be called with valid 'pkey'
1017	 * values originating from in-kernel users.  Complain
1018	 * if a bad value is observed.
1019	 */
1020	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1021		return -EINVAL;
1022
1023	/* Set the bits we need in PKRU:  */
1024	if (init_val & PKEY_DISABLE_ACCESS)
1025		new_pkru_bits |= PKRU_AD_BIT;
1026	if (init_val & PKEY_DISABLE_WRITE)
1027		new_pkru_bits |= PKRU_WD_BIT;
1028
1029	/* Shift the bits in to the correct place in PKRU for pkey: */
1030	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1031	new_pkru_bits <<= pkey_shift;
1032
1033	/* Get old PKRU and mask off any old bits in place: */
1034	old_pkru = read_pkru();
1035	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1036
1037	/* Write old part along with new part: */
1038	write_pkru(old_pkru | new_pkru_bits);
1039
1040	return 0;
1041}
1042#endif /* ! CONFIG_ARCH_HAS_PKEYS */
1043
/*
 * Append @size bytes of one state component to @to. The data comes from
 * @xstate when @from_xstate is true and from @init_xstate otherwise.
 */
static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
			 void *init_xstate, unsigned int size)
{
	void *src = init_xstate;

	if (from_xstate)
		src = xstate;

	membuf_write(to, src, size);
}
1049
1050/**
1051 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1052 * @to:		membuf descriptor
1053 * @fpstate:	The fpstate buffer from which to copy
1054 * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
1055 * @pkru_val:	The PKRU value to store in the PKRU component
1056 * @copy_mode:	The requested copy mode
1057 *
1058 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1059 * format, i.e. from the kernel internal hardware dependent storage format
1060 * to the requested @mode. UABI XSTATE is always uncompacted!
1061 *
1062 * It supports partial copy but @to.pos always starts from zero.
1063 */
1064void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1065			       u64 xfeatures, u32 pkru_val,
1066			       enum xstate_copy_mode copy_mode)
1067{
1068	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1069	struct xregs_state *xinit = &init_fpstate.regs.xsave;
1070	struct xregs_state *xsave = &fpstate->regs.xsave;
1071	struct xstate_header header;
1072	unsigned int zerofrom;
1073	u64 mask;
1074	int i;
1075
1076	memset(&header, 0, sizeof(header));
1077	header.xfeatures = xsave->header.xfeatures;
1078
1079	/* Mask out the feature bits depending on copy mode */
1080	switch (copy_mode) {
1081	case XSTATE_COPY_FP:
1082		header.xfeatures &= XFEATURE_MASK_FP;
1083		break;
1084
1085	case XSTATE_COPY_FX:
1086		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1087		break;
1088
1089	case XSTATE_COPY_XSAVE:
1090		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1091		break;
1092	}
1093
1094	/* Copy FP state up to MXCSR */
1095	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1096		     &xinit->i387, off_mxcsr);
1097
1098	/* Copy MXCSR when SSE or YMM are set in the feature mask */
1099	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1100		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1101		     MXCSR_AND_FLAGS_SIZE);
1102
1103	/* Copy the remaining FP state */
1104	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1105		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
1106		     sizeof(xsave->i387.st_space));
1107
1108	/* Copy the SSE state - shared with YMM, but independently managed */
1109	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1110		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1111		     sizeof(xsave->i387.xmm_space));
1112
1113	if (copy_mode != XSTATE_COPY_XSAVE)
1114		goto out;
1115
1116	/* Zero the padding area */
1117	membuf_zero(&to, sizeof(xsave->i387.padding));
1118
1119	/* Copy xsave->i387.sw_reserved */
1120	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1121
1122	/* Copy the user space relevant state of @xsave->header */
1123	membuf_write(&to, &header, sizeof(header));
1124
1125	zerofrom = offsetof(struct xregs_state, extended_state_area);
1126
1127	/*
1128	 * This 'mask' indicates which states to copy from fpstate.
1129	 * Those extended states that are not present in fpstate are
1130	 * either disabled or initialized:
1131	 *
1132	 * In non-compacted format, disabled features still occupy
1133	 * state space but there is no state to copy from in the
1134	 * compacted init_fpstate. The gap tracking will zero these
1135	 * states.
1136	 *
1137	 * The extended features have an all zeroes init state. Thus,
1138	 * remove them from 'mask' to zero those features in the user
1139	 * buffer instead of retrieving them from init_fpstate.
1140	 */
1141	mask = header.xfeatures;
1142
1143	for_each_extended_xfeature(i, mask) {
1144		/*
1145		 * If there was a feature or alignment gap, zero the space
1146		 * in the destination buffer.
1147		 */
1148		if (zerofrom < xstate_offsets[i])
1149			membuf_zero(&to, xstate_offsets[i] - zerofrom);
1150
1151		if (i == XFEATURE_PKRU) {
1152			struct pkru_state pkru = {0};
1153			/*
1154			 * PKRU is not necessarily up to date in the
1155			 * XSAVE buffer. Use the provided value.
1156			 */
1157			pkru.pkru = pkru_val;
1158			membuf_write(&to, &pkru, sizeof(pkru));
1159		} else {
1160			membuf_write(&to,
1161				     __raw_xsave_addr(xsave, i),
1162				     xstate_sizes[i]);
1163		}
1164		/*
1165		 * Keep track of the last copied state in the non-compacted
1166		 * target buffer for gap zeroing.
1167		 */
1168		zerofrom = xstate_offsets[i] + xstate_sizes[i];
1169	}
1170
1171out:
1172	if (to.left)
1173		membuf_zero(&to, to.left);
1174}
1175
1176/**
1177 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1178 * @to:		membuf descriptor
1179 * @tsk:	The task from which to copy the saved xstate
1180 * @copy_mode:	The requested copy mode
1181 *
1182 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1183 * format, i.e. from the kernel internal hardware dependent storage format
1184 * to the requested @mode. UABI XSTATE is always uncompacted!
1185 *
1186 * It supports partial copy but @to.pos always starts from zero.
1187 */
1188void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1189			     enum xstate_copy_mode copy_mode)
1190{
1191	__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1192				  tsk->thread.fpu.fpstate->user_xfeatures,
1193				  tsk->thread.pkru, copy_mode);
1194}
1195
1196static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1197			    const void *kbuf, const void __user *ubuf)
1198{
1199	if (kbuf) {
1200		memcpy(dst, kbuf + offset, size);
1201	} else {
1202		if (copy_from_user(dst, ubuf + offset, size))
1203			return -EFAULT;
1204	}
1205	return 0;
1206}
1207
1208
1209/**
1210 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1211 * @fpstate:	The fpstate buffer to copy to
1212 * @kbuf:	The UABI format buffer, if it comes from the kernel
1213 * @ubuf:	The UABI format buffer, if it comes from userspace
1214 * @pkru:	The location to write the PKRU value to
1215 *
1216 * Converts from the UABI format into the kernel internal hardware
1217 * dependent format.
1218 *
1219 * This function ultimately has three different callers with distinct PKRU
1220 * behavior.
1221 * 1.	When called from sigreturn the PKRU register will be restored from
1222 *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1223 *	@fpstate is sufficient to cover this case, but the caller will also
1224 *	pass a pointer to the thread_struct's pkru field in @pkru and updating
1225 *	it is harmless.
1226 * 2.	When called from ptrace the PKRU register will be restored from the
1227 *	thread_struct's pkru field. A pointer to that is passed in @pkru.
1228 *	The kernel will restore it manually, so the XRSTOR behavior that resets
1229 *	the PKRU register to the hardware init value (0) if the corresponding
1230 *	xfeatures bit is not set is emulated here.
1231 * 3.	When called from KVM the PKRU register will be restored from the vcpu's
1232 *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1233 *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
1234 *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1235 *	bit is not set.
1236 */
1237static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1238			       const void __user *ubuf, u32 *pkru)
1239{
1240	struct xregs_state *xsave = &fpstate->regs.xsave;
1241	unsigned int offset, size;
1242	struct xstate_header hdr;
1243	u64 mask;
1244	int i;
1245
1246	offset = offsetof(struct xregs_state, header);
1247	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1248		return -EFAULT;
1249
1250	if (validate_user_xstate_header(&hdr, fpstate))
1251		return -EINVAL;
1252
1253	/* Validate MXCSR when any of the related features is in use */
1254	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1255	if (hdr.xfeatures & mask) {
1256		u32 mxcsr[2];
1257
1258		offset = offsetof(struct fxregs_state, mxcsr);
1259		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1260			return -EFAULT;
1261
1262		/* Reserved bits in MXCSR must be zero. */
1263		if (mxcsr[0] & ~mxcsr_feature_mask)
1264			return -EINVAL;
1265
1266		/* SSE and YMM require MXCSR even when FP is not in use. */
1267		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1268			xsave->i387.mxcsr = mxcsr[0];
1269			xsave->i387.mxcsr_mask = mxcsr[1];
1270		}
1271	}
1272
1273	for (i = 0; i < XFEATURE_MAX; i++) {
1274		mask = BIT_ULL(i);
1275
1276		if (hdr.xfeatures & mask) {
1277			void *dst = __raw_xsave_addr(xsave, i);
1278
1279			offset = xstate_offsets[i];
1280			size = xstate_sizes[i];
1281
1282			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1283				return -EFAULT;
1284		}
1285	}
1286
1287	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1288		struct pkru_state *xpkru;
1289
1290		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1291		*pkru = xpkru->pkru;
1292	} else {
1293		/*
1294		 * KVM may pass NULL here to indicate that it does not need
1295		 * PKRU updated.
1296		 */
1297		if (pkru)
1298			*pkru = 0;
1299	}
1300
1301	/*
1302	 * The state that came in from userspace was user-state only.
1303	 * Mask all the user states out of 'xfeatures':
1304	 */
1305	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1306
1307	/*
1308	 * Add back in the features that came in from userspace:
1309	 */
1310	xsave->header.xfeatures |= hdr.xfeatures;
1311
1312	return 0;
1313}
1314
1315/*
1316 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1317 * format and copy to the target thread. Used by ptrace and KVM.
1318 */
1319int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1320{
1321	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1322}
1323
1324/*
1325 * Convert from a sigreturn standard-format user-space buffer to kernel
1326 * XSAVE[S] format and copy to the target thread. This is called from the
1327 * sigreturn() and rt_sigreturn() system calls.
1328 */
1329int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1330				      const void __user *ubuf)
1331{
1332	return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1333}
1334
1335static bool validate_independent_components(u64 mask)
1336{
1337	u64 xchk;
1338
1339	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1340		return false;
1341
1342	xchk = ~xfeatures_mask_independent();
1343
1344	if (WARN_ON_ONCE(!mask || mask & xchk))
1345		return false;
1346
1347	return true;
1348}
1349
1350/**
1351 * xsaves - Save selected components to a kernel xstate buffer
1352 * @xstate:	Pointer to the buffer
1353 * @mask:	Feature mask to select the components to save
1354 *
1355 * The @xstate buffer must be 64 byte aligned and correctly initialized as
1356 * XSAVES does not write the full xstate header. Before first use the
1357 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1358 * can #GP.
1359 *
1360 * The feature mask must be a subset of the independent features.
1361 */
1362void xsaves(struct xregs_state *xstate, u64 mask)
1363{
1364	int err;
1365
1366	if (!validate_independent_components(mask))
1367		return;
1368
1369	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1370	WARN_ON_ONCE(err);
1371}
1372
1373/**
1374 * xrstors - Restore selected components from a kernel xstate buffer
1375 * @xstate:	Pointer to the buffer
1376 * @mask:	Feature mask to select the components to restore
1377 *
1378 * The @xstate buffer must be 64 byte aligned and correctly initialized
1379 * otherwise XRSTORS from that buffer can #GP.
1380 *
1381 * Proper usage is to restore the state which was saved with
1382 * xsaves() into @xstate.
1383 *
1384 * The feature mask must be a subset of the independent features.
1385 */
1386void xrstors(struct xregs_state *xstate, u64 mask)
1387{
1388	int err;
1389
1390	if (!validate_independent_components(mask))
1391		return;
1392
1393	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1394	WARN_ON_ONCE(err);
1395}
1396
1397#if IS_ENABLED(CONFIG_KVM)
1398void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1399{
1400	void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1401
1402	if (addr)
1403		memset(addr, 0, xstate_sizes[xfeature]);
1404}
1405EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1406#endif
1407
1408#ifdef CONFIG_X86_64
1409
1410#ifdef CONFIG_X86_DEBUG_FPU
1411/*
1412 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1413 * can safely operate on the @fpstate buffer.
1414 */
1415static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1416{
1417	u64 xfd = __this_cpu_read(xfd_state);
1418
1419	if (fpstate->xfd == xfd)
1420		return true;
1421
1422	 /*
1423	  * The XFD MSR does not match fpstate->xfd. That's invalid when
1424	  * the passed in fpstate is current's fpstate.
1425	  */
1426	if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1427		return false;
1428
1429	/*
1430	 * XRSTOR(S) from init_fpstate are always correct as it will just
1431	 * bring all components into init state and not read from the
1432	 * buffer. XSAVE(S) raises #PF after init.
1433	 */
1434	if (fpstate == &init_fpstate)
1435		return rstor;
1436
1437	/*
1438	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
1439	 * XRSTORS(S): fpu_swap_kvm_fpstate()
1440	 */
1441
1442	/*
1443	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1444	 * the buffer area for XFD-disabled state components.
1445	 */
1446	mask &= ~xfd;
1447
1448	/*
1449	 * Remove features which are valid in fpstate. They
1450	 * have space allocated in fpstate.
1451	 */
1452	mask &= ~fpstate->xfeatures;
1453
1454	/*
1455	 * Any remaining state components in 'mask' might be written
1456	 * by XSAVE/XRSTOR. Fail validation it found.
1457	 */
1458	return !mask;
1459}
1460
1461void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1462{
1463	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1464}
1465#endif /* CONFIG_X86_DEBUG_FPU */
1466
1467static int __init xfd_update_static_branch(void)
1468{
1469	/*
1470	 * If init_fpstate.xfd has bits set then dynamic features are
1471	 * available and the dynamic sizing must be enabled.
1472	 */
1473	if (init_fpstate.xfd)
1474		static_branch_enable(&__fpu_state_size_dynamic);
1475	return 0;
1476}
1477arch_initcall(xfd_update_static_branch)
1478
1479void fpstate_free(struct fpu *fpu)
1480{
1481	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1482		vfree(fpu->fpstate);
1483}
1484
1485/**
1486 * fpstate_realloc - Reallocate struct fpstate for the requested new features
1487 *
1488 * @xfeatures:	A bitmap of xstate features which extend the enabled features
1489 *		of that task
1490 * @ksize:	The required size for the kernel buffer
1491 * @usize:	The required size for user space buffers
1492 * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
1493 *
1494 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1495 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1496 * with large states are likely to live longer.
1497 *
1498 * Returns: 0 on success, -ENOMEM on allocation error.
1499 */
1500static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1501			   unsigned int usize, struct fpu_guest *guest_fpu)
1502{
1503	struct fpu *fpu = &current->thread.fpu;
1504	struct fpstate *curfps, *newfps = NULL;
1505	unsigned int fpsize;
1506	bool in_use;
1507
1508	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1509
1510	newfps = vzalloc(fpsize);
1511	if (!newfps)
1512		return -ENOMEM;
1513	newfps->size = ksize;
1514	newfps->user_size = usize;
1515	newfps->is_valloc = true;
1516
1517	/*
1518	 * When a guest FPU is supplied, use @guest_fpu->fpstate
1519	 * as reference independent whether it is in use or not.
1520	 */
1521	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1522
1523	/* Determine whether @curfps is the active fpstate */
1524	in_use = fpu->fpstate == curfps;
1525
1526	if (guest_fpu) {
1527		newfps->is_guest = true;
1528		newfps->is_confidential = curfps->is_confidential;
1529		newfps->in_use = curfps->in_use;
1530		guest_fpu->xfeatures |= xfeatures;
1531		guest_fpu->uabi_size = usize;
1532	}
1533
1534	fpregs_lock();
1535	/*
1536	 * If @curfps is in use, ensure that the current state is in the
1537	 * registers before swapping fpstate as that might invalidate it
1538	 * due to layout changes.
1539	 */
1540	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1541		fpregs_restore_userregs();
1542
1543	newfps->xfeatures = curfps->xfeatures | xfeatures;
1544	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1545	newfps->xfd = curfps->xfd & ~xfeatures;
1546
1547	/* Do the final updates within the locked region */
1548	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1549
1550	if (guest_fpu) {
1551		guest_fpu->fpstate = newfps;
1552		/* If curfps is active, update the FPU fpstate pointer */
1553		if (in_use)
1554			fpu->fpstate = newfps;
1555	} else {
1556		fpu->fpstate = newfps;
1557	}
1558
1559	if (in_use)
1560		xfd_update_state(fpu->fpstate);
1561	fpregs_unlock();
1562
1563	/* Only free valloc'ed state */
1564	if (curfps && curfps->is_valloc)
1565		vfree(curfps);
1566
1567	return 0;
1568}
1569
1570static int validate_sigaltstack(unsigned int usize)
1571{
1572	struct task_struct *thread, *leader = current->group_leader;
1573	unsigned long framesize = get_sigframe_size();
1574
1575	lockdep_assert_held(&current->sighand->siglock);
1576
1577	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
1578	framesize -= fpu_user_cfg.max_size;
1579	framesize += usize;
1580	for_each_thread(leader, thread) {
1581		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1582			return -ENOSPC;
1583	}
1584	return 0;
1585}
1586
1587static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1588{
1589	/*
1590	 * This deliberately does not exclude !XSAVES as we still might
1591	 * decide to optionally context switch XCR0 or talk the silicon
1592	 * vendors into extending XFD for the pre AMX states, especially
1593	 * AVX512.
1594	 */
1595	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1596	struct fpu *fpu = &current->group_leader->thread.fpu;
1597	struct fpu_state_perm *perm;
1598	unsigned int ksize, usize;
1599	u64 mask;
1600	int ret = 0;
1601
1602	/* Check whether fully enabled */
1603	if ((permitted & requested) == requested)
1604		return 0;
1605
1606	/* Calculate the resulting kernel state size */
1607	mask = permitted | requested;
1608	/* Take supervisor states into account on the host */
1609	if (!guest)
1610		mask |= xfeatures_mask_supervisor();
1611	ksize = xstate_calculate_size(mask, compacted);
1612
1613	/* Calculate the resulting user state size */
1614	mask &= XFEATURE_MASK_USER_SUPPORTED;
1615	usize = xstate_calculate_size(mask, false);
1616
1617	if (!guest) {
1618		ret = validate_sigaltstack(usize);
1619		if (ret)
1620			return ret;
1621	}
1622
1623	perm = guest ? &fpu->guest_perm : &fpu->perm;
1624	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1625	WRITE_ONCE(perm->__state_perm, mask);
1626	/* Protected by sighand lock */
1627	perm->__state_size = ksize;
1628	perm->__user_state_size = usize;
1629	return ret;
1630}
1631
1632/*
1633 * Permissions array to map facilities with more than one component
1634 */
1635static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1636	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1637};
1638
1639static int xstate_request_perm(unsigned long idx, bool guest)
1640{
1641	u64 permitted, requested;
1642	int ret;
1643
1644	if (idx >= XFEATURE_MAX)
1645		return -EINVAL;
1646
1647	/*
1648	 * Look up the facility mask which can require more than
1649	 * one xstate component.
1650	 */
1651	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1652	requested = xstate_prctl_req[idx];
1653	if (!requested)
1654		return -EOPNOTSUPP;
1655
1656	if ((fpu_user_cfg.max_features & requested) != requested)
1657		return -EOPNOTSUPP;
1658
1659	/* Lockless quick check */
1660	permitted = xstate_get_group_perm(guest);
1661	if ((permitted & requested) == requested)
1662		return 0;
1663
1664	/* Protect against concurrent modifications */
1665	spin_lock_irq(&current->sighand->siglock);
1666	permitted = xstate_get_group_perm(guest);
1667
1668	/* First vCPU allocation locks the permissions. */
1669	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1670		ret = -EBUSY;
1671	else
1672		ret = __xstate_request_perm(permitted, requested, guest);
1673	spin_unlock_irq(&current->sighand->siglock);
1674	return ret;
1675}
1676
1677int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1678{
1679	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1680	struct fpu_state_perm *perm;
1681	unsigned int ksize, usize;
1682	struct fpu *fpu;
1683
1684	if (!xfd_event) {
1685		if (!guest_fpu)
1686			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1687		return 0;
1688	}
1689
1690	/* Protect against concurrent modifications */
1691	spin_lock_irq(&current->sighand->siglock);
1692
1693	/* If not permitted let it die */
1694	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1695		spin_unlock_irq(&current->sighand->siglock);
1696		return -EPERM;
1697	}
1698
1699	fpu = &current->group_leader->thread.fpu;
1700	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1701	ksize = perm->__state_size;
1702	usize = perm->__user_state_size;
1703
1704	/*
1705	 * The feature is permitted. State size is sufficient.  Dropping
1706	 * the lock is safe here even if more features are added from
1707	 * another task, the retrieved buffer sizes are valid for the
1708	 * currently requested feature(s).
1709	 */
1710	spin_unlock_irq(&current->sighand->siglock);
1711
1712	/*
1713	 * Try to allocate a new fpstate. If that fails there is no way
1714	 * out.
1715	 */
1716	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1717		return -EFAULT;
1718	return 0;
1719}
1720
1721int xfd_enable_feature(u64 xfd_err)
1722{
1723	return __xfd_enable_feature(xfd_err, NULL);
1724}
1725
1726#else /* CONFIG_X86_64 */
/* Stub for !CONFIG_X86_64 builds: every permission request is refused. */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
	return -EPERM;
}
1731#endif  /* !CONFIG_X86_64 */
1732
1733u64 xstate_get_guest_group_perm(void)
1734{
1735	return xstate_get_group_perm(true);
1736}
1737EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1738
1739/**
1740 * fpu_xstate_prctl - xstate permission operations
1741 * @option:	A subfunction of arch_prctl()
1742 * @arg2:	option argument
1743 * Return:	0 if successful; otherwise, an error code
1744 *
1745 * Option arguments:
1746 *
1747 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1748 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1749 * ARCH_REQ_XCOMP_PERM: Facility number requested
1750 *
1751 * For facilities which require more than one XSTATE component, the request
1752 * must be the highest state component number related to that facility,
1753 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1754 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1755 */
1756long fpu_xstate_prctl(int option, unsigned long arg2)
1757{
1758	u64 __user *uptr = (u64 __user *)arg2;
1759	u64 permitted, supported;
1760	unsigned long idx = arg2;
1761	bool guest = false;
1762
1763	switch (option) {
1764	case ARCH_GET_XCOMP_SUPP:
1765		supported = fpu_user_cfg.max_features |	fpu_user_cfg.legacy_features;
1766		return put_user(supported, uptr);
1767
1768	case ARCH_GET_XCOMP_PERM:
1769		/*
1770		 * Lockless snapshot as it can also change right after the
1771		 * dropping the lock.
1772		 */
1773		permitted = xstate_get_host_group_perm();
1774		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1775		return put_user(permitted, uptr);
1776
1777	case ARCH_GET_XCOMP_GUEST_PERM:
1778		permitted = xstate_get_guest_group_perm();
1779		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1780		return put_user(permitted, uptr);
1781
1782	case ARCH_REQ_XCOMP_GUEST_PERM:
1783		guest = true;
1784		fallthrough;
1785
1786	case ARCH_REQ_XCOMP_PERM:
1787		if (!IS_ENABLED(CONFIG_X86_64))
1788			return -EOPNOTSUPP;
1789
1790		return xstate_request_perm(idx, guest);
1791
1792	default:
1793		return -EINVAL;
1794	}
1795}
1796
1797#ifdef CONFIG_PROC_PID_ARCH_STATUS
1798/*
1799 * Report the amount of time elapsed in millisecond since last AVX512
1800 * use in the task.
1801 */
1802static void avx512_status(struct seq_file *m, struct task_struct *task)
1803{
1804	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1805	long delta;
1806
1807	if (!timestamp) {
1808		/*
1809		 * Report -1 if no AVX512 usage
1810		 */
1811		delta = -1;
1812	} else {
1813		delta = (long)(jiffies - timestamp);
1814		/*
1815		 * Cap to LONG_MAX if time difference > LONG_MAX
1816		 */
1817		if (delta < 0)
1818			delta = LONG_MAX;
1819		delta = jiffies_to_msecs(delta);
1820	}
1821
1822	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1823	seq_putc(m, '\n');
1824}
1825
1826/*
1827 * Report architecture specific information
1828 */
1829int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1830			struct pid *pid, struct task_struct *task)
1831{
1832	/*
1833	 * Report AVX512 state if the processor and build option supported.
1834	 */
1835	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1836		avx512_status(m, task);
1837
1838	return 0;
1839}
1840#endif /* CONFIG_PROC_PID_ARCH_STATUS */
1841