1// SPDX-License-Identifier: MIT
2/*
 * Copyright © 2014 Intel Corporation
4 */
5
6#include "gem/i915_gem_lmem.h"
7
8#include "gen8_engine_cs.h"
9#include "i915_drv.h"
10#include "i915_perf.h"
11#include "i915_reg.h"
12#include "intel_context.h"
13#include "intel_engine.h"
14#include "intel_engine_regs.h"
15#include "intel_gpu_commands.h"
16#include "intel_gt.h"
17#include "intel_gt_regs.h"
18#include "intel_lrc.h"
19#include "intel_lrc_reg.h"
20#include "intel_ring.h"
21#include "shmem_utils.h"
22
/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating an MI_LOAD_REGISTER_IMM command, allows
 *      MI_LRI_FORCE_POSTED to be set
 * [5:0]: number of NOPs, or of registers to set values for, in the case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, "count"
 * registers at a time. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for offsets
 * bigger than that. Those macros already set all the bits documented below
 * correctly:
 *
 * [7]: when a register offset needs more than 6 bits, additional bytes
 *      follow, carrying the lower bits
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
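/*
 * Worked example (for illustration only, not one of the tables below): the
 * sequence NOP(1), LRI(2, POSTED), REG16(0x244), REG(0x034) encodes as the
 * bytes 0x81, 0x42, 0x81 0x11, 0x0d. Decoding it skips one dword, emits
 * MI_LOAD_REGISTER_IMM(2) with MI_LRI_FORCE_POSTED (plus MI_LRI_LRM_CS_MMIO
 * on gen11+), and then writes the offsets mmio_base + 0x244 and
 * mmio_base + 0x34 into alternate dwords, leaving the value slots untouched.
 */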
47static void set_offsets(u32 *regs,
48			const u8 *data,
49			const struct intel_engine_cs *engine,
50			bool close)
51#define NOP(x) (BIT(7) | (x))
52#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53#define POSTED BIT(0)
54#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55#define REG16(x) \
56	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57	(((x) >> 2) & 0x7f)
58#define END 0
59{
60	const u32 base = engine->mmio_base;
61
62	while (*data) {
63		u8 count, flags;
64
65		if (*data & BIT(7)) { /* skip */
66			count = *data++ & ~BIT(7);
67			regs += count;
68			continue;
69		}
70
71		count = *data & 0x3f;
72		flags = *data >> 6;
73		data++;
74
75		*regs = MI_LOAD_REGISTER_IMM(count);
76		if (flags & POSTED)
77			*regs |= MI_LRI_FORCE_POSTED;
78		if (GRAPHICS_VER(engine->i915) >= 11)
79			*regs |= MI_LRI_LRM_CS_MMIO;
80		regs++;
81
82		GEM_BUG_ON(!count);
83		do {
84			u32 offset = 0;
85			u8 v;
86
87			do {
88				v = *data++;
89				offset <<= 7;
90				offset |= v & ~BIT(7);
91			} while (v & BIT(7));
92
93			regs[0] = base + (offset << 2);
94			regs += 2;
95		} while (--count);
96	}
97
98	if (close) {
99		/* Close the batch; used mainly by live_lrc_layout() */
100		*regs = MI_BATCH_BUFFER_END;
101		if (GRAPHICS_VER(engine->i915) >= 11)
102			*regs |= BIT(0);
103	}
104}
105
106static const u8 gen8_xcs_offsets[] = {
107	NOP(1),
108	LRI(11, 0),
109	REG16(0x244),
110	REG(0x034),
111	REG(0x030),
112	REG(0x038),
113	REG(0x03c),
114	REG(0x168),
115	REG(0x140),
116	REG(0x110),
117	REG(0x11c),
118	REG(0x114),
119	REG(0x118),
120
121	NOP(9),
122	LRI(9, 0),
123	REG16(0x3a8),
124	REG16(0x28c),
125	REG16(0x288),
126	REG16(0x284),
127	REG16(0x280),
128	REG16(0x27c),
129	REG16(0x278),
130	REG16(0x274),
131	REG16(0x270),
132
133	NOP(13),
134	LRI(2, 0),
135	REG16(0x200),
136	REG(0x028),
137
138	END
139};
140
141static const u8 gen9_xcs_offsets[] = {
142	NOP(1),
143	LRI(14, POSTED),
144	REG16(0x244),
145	REG(0x034),
146	REG(0x030),
147	REG(0x038),
148	REG(0x03c),
149	REG(0x168),
150	REG(0x140),
151	REG(0x110),
152	REG(0x11c),
153	REG(0x114),
154	REG(0x118),
155	REG(0x1c0),
156	REG(0x1c4),
157	REG(0x1c8),
158
159	NOP(3),
160	LRI(9, POSTED),
161	REG16(0x3a8),
162	REG16(0x28c),
163	REG16(0x288),
164	REG16(0x284),
165	REG16(0x280),
166	REG16(0x27c),
167	REG16(0x278),
168	REG16(0x274),
169	REG16(0x270),
170
171	NOP(13),
172	LRI(1, POSTED),
173	REG16(0x200),
174
175	NOP(13),
176	LRI(44, POSTED),
177	REG(0x028),
178	REG(0x09c),
179	REG(0x0c0),
180	REG(0x178),
181	REG(0x17c),
182	REG16(0x358),
183	REG(0x170),
184	REG(0x150),
185	REG(0x154),
186	REG(0x158),
187	REG16(0x41c),
188	REG16(0x600),
189	REG16(0x604),
190	REG16(0x608),
191	REG16(0x60c),
192	REG16(0x610),
193	REG16(0x614),
194	REG16(0x618),
195	REG16(0x61c),
196	REG16(0x620),
197	REG16(0x624),
198	REG16(0x628),
199	REG16(0x62c),
200	REG16(0x630),
201	REG16(0x634),
202	REG16(0x638),
203	REG16(0x63c),
204	REG16(0x640),
205	REG16(0x644),
206	REG16(0x648),
207	REG16(0x64c),
208	REG16(0x650),
209	REG16(0x654),
210	REG16(0x658),
211	REG16(0x65c),
212	REG16(0x660),
213	REG16(0x664),
214	REG16(0x668),
215	REG16(0x66c),
216	REG16(0x670),
217	REG16(0x674),
218	REG16(0x678),
219	REG16(0x67c),
220	REG(0x068),
221
222	END
223};
224
225static const u8 gen12_xcs_offsets[] = {
226	NOP(1),
227	LRI(13, POSTED),
228	REG16(0x244),
229	REG(0x034),
230	REG(0x030),
231	REG(0x038),
232	REG(0x03c),
233	REG(0x168),
234	REG(0x140),
235	REG(0x110),
236	REG(0x1c0),
237	REG(0x1c4),
238	REG(0x1c8),
239	REG(0x180),
240	REG16(0x2b4),
241
242	NOP(5),
243	LRI(9, POSTED),
244	REG16(0x3a8),
245	REG16(0x28c),
246	REG16(0x288),
247	REG16(0x284),
248	REG16(0x280),
249	REG16(0x27c),
250	REG16(0x278),
251	REG16(0x274),
252	REG16(0x270),
253
254	END
255};
256
257static const u8 dg2_xcs_offsets[] = {
258	NOP(1),
259	LRI(15, POSTED),
260	REG16(0x244),
261	REG(0x034),
262	REG(0x030),
263	REG(0x038),
264	REG(0x03c),
265	REG(0x168),
266	REG(0x140),
267	REG(0x110),
268	REG(0x1c0),
269	REG(0x1c4),
270	REG(0x1c8),
271	REG(0x180),
272	REG16(0x2b4),
273	REG(0x120),
274	REG(0x124),
275
276	NOP(1),
277	LRI(9, POSTED),
278	REG16(0x3a8),
279	REG16(0x28c),
280	REG16(0x288),
281	REG16(0x284),
282	REG16(0x280),
283	REG16(0x27c),
284	REG16(0x278),
285	REG16(0x274),
286	REG16(0x270),
287
288	END
289};
290
291static const u8 gen8_rcs_offsets[] = {
292	NOP(1),
293	LRI(14, POSTED),
294	REG16(0x244),
295	REG(0x034),
296	REG(0x030),
297	REG(0x038),
298	REG(0x03c),
299	REG(0x168),
300	REG(0x140),
301	REG(0x110),
302	REG(0x11c),
303	REG(0x114),
304	REG(0x118),
305	REG(0x1c0),
306	REG(0x1c4),
307	REG(0x1c8),
308
309	NOP(3),
310	LRI(9, POSTED),
311	REG16(0x3a8),
312	REG16(0x28c),
313	REG16(0x288),
314	REG16(0x284),
315	REG16(0x280),
316	REG16(0x27c),
317	REG16(0x278),
318	REG16(0x274),
319	REG16(0x270),
320
321	NOP(13),
322	LRI(1, 0),
323	REG(0x0c8),
324
325	END
326};
327
328static const u8 gen9_rcs_offsets[] = {
329	NOP(1),
330	LRI(14, POSTED),
331	REG16(0x244),
332	REG(0x34),
333	REG(0x30),
334	REG(0x38),
335	REG(0x3c),
336	REG(0x168),
337	REG(0x140),
338	REG(0x110),
339	REG(0x11c),
340	REG(0x114),
341	REG(0x118),
342	REG(0x1c0),
343	REG(0x1c4),
344	REG(0x1c8),
345
346	NOP(3),
347	LRI(9, POSTED),
348	REG16(0x3a8),
349	REG16(0x28c),
350	REG16(0x288),
351	REG16(0x284),
352	REG16(0x280),
353	REG16(0x27c),
354	REG16(0x278),
355	REG16(0x274),
356	REG16(0x270),
357
358	NOP(13),
359	LRI(1, 0),
360	REG(0xc8),
361
362	NOP(13),
363	LRI(44, POSTED),
364	REG(0x28),
365	REG(0x9c),
366	REG(0xc0),
367	REG(0x178),
368	REG(0x17c),
369	REG16(0x358),
370	REG(0x170),
371	REG(0x150),
372	REG(0x154),
373	REG(0x158),
374	REG16(0x41c),
375	REG16(0x600),
376	REG16(0x604),
377	REG16(0x608),
378	REG16(0x60c),
379	REG16(0x610),
380	REG16(0x614),
381	REG16(0x618),
382	REG16(0x61c),
383	REG16(0x620),
384	REG16(0x624),
385	REG16(0x628),
386	REG16(0x62c),
387	REG16(0x630),
388	REG16(0x634),
389	REG16(0x638),
390	REG16(0x63c),
391	REG16(0x640),
392	REG16(0x644),
393	REG16(0x648),
394	REG16(0x64c),
395	REG16(0x650),
396	REG16(0x654),
397	REG16(0x658),
398	REG16(0x65c),
399	REG16(0x660),
400	REG16(0x664),
401	REG16(0x668),
402	REG16(0x66c),
403	REG16(0x670),
404	REG16(0x674),
405	REG16(0x678),
406	REG16(0x67c),
407	REG(0x68),
408
409	END
410};
411
412static const u8 gen11_rcs_offsets[] = {
413	NOP(1),
414	LRI(15, POSTED),
415	REG16(0x244),
416	REG(0x034),
417	REG(0x030),
418	REG(0x038),
419	REG(0x03c),
420	REG(0x168),
421	REG(0x140),
422	REG(0x110),
423	REG(0x11c),
424	REG(0x114),
425	REG(0x118),
426	REG(0x1c0),
427	REG(0x1c4),
428	REG(0x1c8),
429	REG(0x180),
430
431	NOP(1),
432	LRI(9, POSTED),
433	REG16(0x3a8),
434	REG16(0x28c),
435	REG16(0x288),
436	REG16(0x284),
437	REG16(0x280),
438	REG16(0x27c),
439	REG16(0x278),
440	REG16(0x274),
441	REG16(0x270),
442
443	LRI(1, POSTED),
444	REG(0x1b0),
445
446	NOP(10),
447	LRI(1, 0),
448	REG(0x0c8),
449
450	END
451};
452
453static const u8 gen12_rcs_offsets[] = {
454	NOP(1),
455	LRI(13, POSTED),
456	REG16(0x244),
457	REG(0x034),
458	REG(0x030),
459	REG(0x038),
460	REG(0x03c),
461	REG(0x168),
462	REG(0x140),
463	REG(0x110),
464	REG(0x1c0),
465	REG(0x1c4),
466	REG(0x1c8),
467	REG(0x180),
468	REG16(0x2b4),
469
470	NOP(5),
471	LRI(9, POSTED),
472	REG16(0x3a8),
473	REG16(0x28c),
474	REG16(0x288),
475	REG16(0x284),
476	REG16(0x280),
477	REG16(0x27c),
478	REG16(0x278),
479	REG16(0x274),
480	REG16(0x270),
481
482	LRI(3, POSTED),
483	REG(0x1b0),
484	REG16(0x5a8),
485	REG16(0x5ac),
486
487	NOP(6),
488	LRI(1, 0),
489	REG(0x0c8),
490	NOP(3 + 9 + 1),
491
492	LRI(51, POSTED),
493	REG16(0x588),
494	REG16(0x588),
495	REG16(0x588),
496	REG16(0x588),
497	REG16(0x588),
498	REG16(0x588),
499	REG(0x028),
500	REG(0x09c),
501	REG(0x0c0),
502	REG(0x178),
503	REG(0x17c),
504	REG16(0x358),
505	REG(0x170),
506	REG(0x150),
507	REG(0x154),
508	REG(0x158),
509	REG16(0x41c),
510	REG16(0x600),
511	REG16(0x604),
512	REG16(0x608),
513	REG16(0x60c),
514	REG16(0x610),
515	REG16(0x614),
516	REG16(0x618),
517	REG16(0x61c),
518	REG16(0x620),
519	REG16(0x624),
520	REG16(0x628),
521	REG16(0x62c),
522	REG16(0x630),
523	REG16(0x634),
524	REG16(0x638),
525	REG16(0x63c),
526	REG16(0x640),
527	REG16(0x644),
528	REG16(0x648),
529	REG16(0x64c),
530	REG16(0x650),
531	REG16(0x654),
532	REG16(0x658),
533	REG16(0x65c),
534	REG16(0x660),
535	REG16(0x664),
536	REG16(0x668),
537	REG16(0x66c),
538	REG16(0x670),
539	REG16(0x674),
540	REG16(0x678),
541	REG16(0x67c),
542	REG(0x068),
543	REG(0x084),
544	NOP(1),
545
546	END
547};
548
549static const u8 dg2_rcs_offsets[] = {
550	NOP(1),
551	LRI(15, POSTED),
552	REG16(0x244),
553	REG(0x034),
554	REG(0x030),
555	REG(0x038),
556	REG(0x03c),
557	REG(0x168),
558	REG(0x140),
559	REG(0x110),
560	REG(0x1c0),
561	REG(0x1c4),
562	REG(0x1c8),
563	REG(0x180),
564	REG16(0x2b4),
565	REG(0x120),
566	REG(0x124),
567
568	NOP(1),
569	LRI(9, POSTED),
570	REG16(0x3a8),
571	REG16(0x28c),
572	REG16(0x288),
573	REG16(0x284),
574	REG16(0x280),
575	REG16(0x27c),
576	REG16(0x278),
577	REG16(0x274),
578	REG16(0x270),
579
580	LRI(3, POSTED),
581	REG(0x1b0),
582	REG16(0x5a8),
583	REG16(0x5ac),
584
585	NOP(6),
586	LRI(1, 0),
587	REG(0x0c8),
588
589	END
590};
591
592static const u8 mtl_rcs_offsets[] = {
593	NOP(1),
594	LRI(15, POSTED),
595	REG16(0x244),
596	REG(0x034),
597	REG(0x030),
598	REG(0x038),
599	REG(0x03c),
600	REG(0x168),
601	REG(0x140),
602	REG(0x110),
603	REG(0x1c0),
604	REG(0x1c4),
605	REG(0x1c8),
606	REG(0x180),
607	REG16(0x2b4),
608	REG(0x120),
609	REG(0x124),
610
611	NOP(1),
612	LRI(9, POSTED),
613	REG16(0x3a8),
614	REG16(0x28c),
615	REG16(0x288),
616	REG16(0x284),
617	REG16(0x280),
618	REG16(0x27c),
619	REG16(0x278),
620	REG16(0x274),
621	REG16(0x270),
622
623	NOP(2),
624	LRI(2, POSTED),
625	REG16(0x5a8),
626	REG16(0x5ac),
627
628	NOP(6),
629	LRI(1, 0),
630	REG(0x0c8),
631
632	END
633};
634
635#undef END
636#undef REG16
637#undef REG
638#undef LRI
639#undef NOP
640
641static const u8 *reg_offsets(const struct intel_engine_cs *engine)
642{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines backing a virtual engine.
	 */
649	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
650		   !intel_engine_has_relative_mmio(engine));
651
652	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
653		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
654			return mtl_rcs_offsets;
655		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
656			return dg2_rcs_offsets;
657		else if (GRAPHICS_VER(engine->i915) >= 12)
658			return gen12_rcs_offsets;
659		else if (GRAPHICS_VER(engine->i915) >= 11)
660			return gen11_rcs_offsets;
661		else if (GRAPHICS_VER(engine->i915) >= 9)
662			return gen9_rcs_offsets;
663		else
664			return gen8_rcs_offsets;
665	} else {
666		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
667			return dg2_xcs_offsets;
668		else if (GRAPHICS_VER(engine->i915) >= 12)
669			return gen12_xcs_offsets;
670		else if (GRAPHICS_VER(engine->i915) >= 9)
671			return gen9_xcs_offsets;
672		else
673			return gen8_xcs_offsets;
674	}
675}
676
677static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
678{
679	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
680		return 0x70;
681	else if (GRAPHICS_VER(engine->i915) >= 12)
682		return 0x60;
683	else if (GRAPHICS_VER(engine->i915) >= 9)
684		return 0x54;
685	else if (engine->class == RENDER_CLASS)
686		return 0x58;
687	else
688		return -1;
689}
690
691static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
692{
693	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
694		return 0x80;
695	else if (GRAPHICS_VER(engine->i915) >= 12)
696		return 0x70;
697	else if (GRAPHICS_VER(engine->i915) >= 9)
698		return 0x64;
699	else if (GRAPHICS_VER(engine->i915) >= 8 &&
700		 engine->class == RENDER_CLASS)
701		return 0xc4;
702	else
703		return -1;
704}
705
706static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
707{
708	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
709		return 0x84;
710	else if (GRAPHICS_VER(engine->i915) >= 12)
711		return 0x74;
712	else if (GRAPHICS_VER(engine->i915) >= 9)
713		return 0x68;
714	else if (engine->class == RENDER_CLASS)
715		return 0xd8;
716	else
717		return -1;
718}
719
720static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
721{
722	if (GRAPHICS_VER(engine->i915) >= 12)
723		return 0x12;
724	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
725		return 0x18;
726	else
727		return -1;
728}
729
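/*
 * Each register programmed into the context image occupies a (reg offset,
 * value) dword pair, so consecutive slots within one LRI block sit two
 * dwords apart: the helpers below therefore simply step +2 from the
 * preceding slot.
 */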
730static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
731{
732	int x;
733
734	x = lrc_ring_wa_bb_per_ctx(engine);
735	if (x < 0)
736		return x;
737
738	return x + 2;
739}
740
741static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
742{
743	int x;
744
745	x = lrc_ring_indirect_ptr(engine);
746	if (x < 0)
747		return x;
748
749	return x + 2;
750}
751
752static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
753{
755	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
756		/*
757		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
758		 * simply to match the RCS context image layout.
759		 */
760		return 0xc6;
761	else if (engine->class != RENDER_CLASS)
762		return -1;
763	else if (GRAPHICS_VER(engine->i915) >= 12)
764		return 0xb6;
765	else if (GRAPHICS_VER(engine->i915) >= 11)
766		return 0xaa;
767	else
768		return -1;
769}
770
771static u32
772lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
773{
774	if (GRAPHICS_VER(engine->i915) >= 12)
775		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
776	else if (GRAPHICS_VER(engine->i915) >= 11)
777		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
778	else if (GRAPHICS_VER(engine->i915) >= 9)
779		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
780	else if (GRAPHICS_VER(engine->i915) >= 8)
781		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
782
783	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
784
785	return 0;
786}
787
788static void
789lrc_setup_bb_per_ctx(u32 *regs,
790		     const struct intel_engine_cs *engine,
791		     u32 ctx_bb_ggtt_addr)
792{
793	GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
794	regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
795		ctx_bb_ggtt_addr |
796		PER_CTX_BB_FORCE |
797		PER_CTX_BB_VALID;
798}
799
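/*
 * The INDIRECT_CTX slot packs the (cacheline-aligned) GGTT address of the
 * batch together with its length in cachelines in the low bits, while the
 * INDIRECT_CTX_OFFSET slot takes the per-platform default offset shifted up
 * by 6 bits into its field.
 */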
800static void
801lrc_setup_indirect_ctx(u32 *regs,
802		       const struct intel_engine_cs *engine,
803		       u32 ctx_bb_ggtt_addr,
804		       u32 size)
805{
806	GEM_BUG_ON(!size);
807	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
808	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
809	regs[lrc_ring_indirect_ptr(engine) + 1] =
810		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
811
812	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
813	regs[lrc_ring_indirect_offset(engine) + 1] =
814		lrc_ring_indirect_offset_default(engine) << 6;
815}
816
817static bool ctx_needs_runalone(const struct intel_context *ce)
818{
819	struct i915_gem_context *gem_ctx;
820	bool ctx_is_protected = false;
821
822	/*
823	 * On MTL and newer platforms, protected contexts require setting
824	 * the LRC run-alone bit or else the encryption will not happen.
825	 */
826	if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
827	    (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
828		rcu_read_lock();
829		gem_ctx = rcu_dereference(ce->gem_context);
830		if (gem_ctx)
831			ctx_is_protected = gem_ctx->uses_protected_content;
832		rcu_read_unlock();
833	}
834
835	return ctx_is_protected;
836}
837
838static void init_common_regs(u32 * const regs,
839			     const struct intel_context *ce,
840			     const struct intel_engine_cs *engine,
841			     bool inhibit)
842{
843	u32 ctl;
844	int loc;
845
846	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
847	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
848	if (inhibit)
849		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
850	if (GRAPHICS_VER(engine->i915) < 11)
851		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
852					   CTX_CTRL_RS_CTX_ENABLE);
853	if (ctx_needs_runalone(ce))
854		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
855	regs[CTX_CONTEXT_CONTROL] = ctl;
856
857	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
858
859	loc = lrc_ring_bb_offset(engine);
860	if (loc != -1)
861		regs[loc + 1] = 0;
862}
863
864static void init_wa_bb_regs(u32 * const regs,
865			    const struct intel_engine_cs *engine)
866{
867	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
868
869	if (wa_ctx->per_ctx.size) {
870		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
871
872		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
873		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
874			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
875	}
876
877	if (wa_ctx->indirect_ctx.size) {
878		lrc_setup_indirect_ctx(regs, engine,
879				       i915_ggtt_offset(wa_ctx->vma) +
880				       wa_ctx->indirect_ctx.offset,
881				       wa_ctx->indirect_ctx.size);
882	}
883}
884
885static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
886{
887	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/*
		 * 64b PPGTT (48bit canonical):
		 * PDP0_DESCRIPTOR contains the base address of the PML4 and
		 * the other PDP descriptors are ignored.
		 */
892		ASSIGN_CTX_PML4(ppgtt, regs);
893	} else {
894		ASSIGN_CTX_PDP(ppgtt, regs, 3);
895		ASSIGN_CTX_PDP(ppgtt, regs, 2);
896		ASSIGN_CTX_PDP(ppgtt, regs, 1);
897		ASSIGN_CTX_PDP(ppgtt, regs, 0);
898	}
899}
900
901static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
902{
903	if (i915_is_ggtt(vm))
904		return i915_vm_to_ggtt(vm)->alias;
905	else
906		return i915_vm_to_ppgtt(vm);
907}
908
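/*
 * RING_MI_MODE is a masked register: the upper 16 bits of a write select
 * which of the lower bits are affected. Setting STOP_RING in the mask while
 * clearing it in the value ensures that restoring this context always
 * clears a stuck STOP_RING.
 */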
909static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
910{
911	int x;
912
913	x = lrc_ring_mi_mode(engine);
914	if (x != -1) {
915		regs[x + 1] &= ~STOP_RING;
916		regs[x + 1] |= STOP_RING << 16;
917	}
918}
919
920static void __lrc_init_regs(u32 *regs,
921			    const struct intel_context *ce,
922			    const struct intel_engine_cs *engine,
923			    bool inhibit)
924{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batch buffer with
	 * new values (including all the missing MI_LOAD_REGISTER_IMM commands
	 * that we are not initializing here).
	 *
	 * Must be kept consistent with virtual_update_register_offsets().
	 */
935
936	if (inhibit)
937		memset(regs, 0, PAGE_SIZE);
938
939	set_offsets(regs, reg_offsets(engine), engine, inhibit);
940
941	init_common_regs(regs, ce, engine, inhibit);
942	init_ppgtt_regs(regs, vm_alias(ce->vm));
943
944	init_wa_bb_regs(regs, engine);
945
946	__reset_stop_ring(regs, engine);
947}
948
949void lrc_init_regs(const struct intel_context *ce,
950		   const struct intel_engine_cs *engine,
951		   bool inhibit)
952{
953	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
954}
955
956void lrc_reset_regs(const struct intel_context *ce,
957		    const struct intel_engine_cs *engine)
958{
959	__reset_stop_ring(ce->lrc_reg_state, engine);
960}
961
962static void
963set_redzone(void *vaddr, const struct intel_engine_cs *engine)
964{
965	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
966		return;
967
968	vaddr += engine->context_size;
969
970	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
971}
972
973static void
974check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
975{
976	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
977		return;
978
979	vaddr += engine->context_size;
980
981	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
982		drm_err_once(&engine->i915->drm,
983			     "%s context redzone overwritten!\n",
984			     engine->name);
985}
986
987static u32 context_wa_bb_offset(const struct intel_context *ce)
988{
989	return PAGE_SIZE * ce->wa_bb_page;
990}
991
992/*
993 * per_ctx below determines which WABB section is used.
994 * When true, the function returns the location of the
995 * PER_CTX_BB.  When false, the function returns the
996 * location of the INDIRECT_CTX.
997 */
998static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
999{
1000	void *ptr;
1001
1002	GEM_BUG_ON(!ce->wa_bb_page);
1003
1004	ptr = ce->lrc_reg_state;
1005	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1006	ptr += context_wa_bb_offset(ce);
1007	ptr += per_ctx ? PAGE_SIZE : 0;
1008
1009	return ptr;
1010}
1011
1012void lrc_init_state(struct intel_context *ce,
1013		    struct intel_engine_cs *engine,
1014		    void *state)
1015{
1016	bool inhibit = true;
1017
1018	set_redzone(state, engine);
1019
1020	if (engine->default_state) {
1021		shmem_read(engine->default_state, 0,
1022			   state, engine->context_size);
1023		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
1024		inhibit = false;
1025	}
1026
1027	/* Clear the ppHWSP (inc. per-context counters) */
1028	memset(state, 0, PAGE_SIZE);
1029
1030	/* Clear the indirect wa and storage */
1031	if (ce->wa_bb_page)
1032		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1033
1034	/*
1035	 * The second page of the context object contains some registers which
1036	 * must be set up prior to the first execution.
1037	 */
1038	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1039}
1040
1041u32 lrc_indirect_bb(const struct intel_context *ce)
1042{
1043	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1044}
1045
1046static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1047{
1048	/* If predication is active, this will be noop'ed */
1049	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1050	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1051	*cs++ = 0;
1052	*cs++ = 0; /* No predication */
1053
1054	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1055	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
1056	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1057
	/* Instructions are no longer predicated (disabled); we can proceed */
1059	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1060	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1061	*cs++ = 0;
1062	*cs++ = 1; /* enable predication before the next BB */
1063
1064	*cs++ = MI_BATCH_BUFFER_END;
1065	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1066
1067	return cs;
1068}
1069
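/*
 * Rough layout of the state object allocated below, in page order (assuming
 * a gen12+ engine with the debug redzone and GuC parallel submission in
 * use): the engine context image (ppHWSP followed by the logical ring
 * context state), one redzone page, the INDIRECT_CTX page, the PER_CTX_BB
 * page, and finally the parent scratch area.
 */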
1070static struct i915_vma *
1071__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1072{
1073	struct drm_i915_gem_object *obj;
1074	struct i915_vma *vma;
1075	u32 context_size;
1076
1077	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1078
1079	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1080		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1081
1082	if (GRAPHICS_VER(engine->i915) >= 12) {
1083		ce->wa_bb_page = context_size / PAGE_SIZE;
1084		/* INDIRECT_CTX and PER_CTX_BB need separate pages. */
1085		context_size += PAGE_SIZE * 2;
1086	}
1087
1088	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1089		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1090		context_size += PARENT_SCRATCH_SIZE;
1091	}
1092
1093	obj = i915_gem_object_create_lmem(engine->i915, context_size,
1094					  I915_BO_ALLOC_PM_VOLATILE);
1095	if (IS_ERR(obj)) {
1096		obj = i915_gem_object_create_shmem(engine->i915, context_size);
1097		if (IS_ERR(obj))
1098			return ERR_CAST(obj);
1099
1100		/*
1101		 * Wa_22016122933: For Media version 13.0, all Media GT shared
1102		 * memory needs to be mapped as WC on CPU side and UC (PAT
1103		 * index 2) on GPU side.
1104		 */
1105		if (intel_gt_needs_wa_22016122933(engine->gt))
1106			i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1107	}
1108
1109	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1110	if (IS_ERR(vma)) {
1111		i915_gem_object_put(obj);
1112		return vma;
1113	}
1114
1115	return vma;
1116}
1117
1118static struct intel_timeline *
1119pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1120{
1121	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1122
1123	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1124}
1125
1126int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1127{
1128	struct intel_ring *ring;
1129	struct i915_vma *vma;
1130	int err;
1131
1132	GEM_BUG_ON(ce->state);
1133
1134	vma = __lrc_alloc_state(ce, engine);
1135	if (IS_ERR(vma))
1136		return PTR_ERR(vma);
1137
1138	ring = intel_engine_create_ring(engine, ce->ring_size);
1139	if (IS_ERR(ring)) {
1140		err = PTR_ERR(ring);
1141		goto err_vma;
1142	}
1143
1144	if (!page_mask_bits(ce->timeline)) {
1145		struct intel_timeline *tl;
1146
1147		/*
1148		 * Use the static global HWSP for the kernel context, and
1149		 * a dynamically allocated cacheline for everyone else.
1150		 */
1151		if (unlikely(ce->timeline))
1152			tl = pinned_timeline(ce, engine);
1153		else
1154			tl = intel_timeline_create(engine->gt);
1155		if (IS_ERR(tl)) {
1156			err = PTR_ERR(tl);
1157			goto err_ring;
1158		}
1159
1160		ce->timeline = tl;
1161	}
1162
1163	ce->ring = ring;
1164	ce->state = vma;
1165
1166	return 0;
1167
1168err_ring:
1169	intel_ring_put(ring);
1170err_vma:
1171	i915_vma_put(vma);
1172	return err;
1173}
1174
1175void lrc_reset(struct intel_context *ce)
1176{
1177	GEM_BUG_ON(!intel_context_is_pinned(ce));
1178
1179	intel_ring_reset(ce->ring, ce->ring->emit);
1180
1181	/* Scrub away the garbage */
1182	lrc_init_regs(ce, ce->engine, true);
1183	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1184}
1185
1186int
1187lrc_pre_pin(struct intel_context *ce,
1188	    struct intel_engine_cs *engine,
1189	    struct i915_gem_ww_ctx *ww,
1190	    void **vaddr)
1191{
1192	GEM_BUG_ON(!ce->state);
1193	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1194
1195	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1196					 intel_gt_coherent_map_type(ce->engine->gt,
1197								    ce->state->obj,
1198								    false) |
1199					 I915_MAP_OVERRIDE);
1200
1201	return PTR_ERR_OR_ZERO(*vaddr);
1202}
1203
1204int
1205lrc_pin(struct intel_context *ce,
1206	struct intel_engine_cs *engine,
1207	void *vaddr)
1208{
1209	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1210
1211	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1212		lrc_init_state(ce, engine, vaddr);
1213
1214	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1215	return 0;
1216}
1217
1218void lrc_unpin(struct intel_context *ce)
1219{
1220	if (unlikely(ce->parallel.last_rq)) {
1221		i915_request_put(ce->parallel.last_rq);
1222		ce->parallel.last_rq = NULL;
1223	}
1224	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1225		      ce->engine);
1226}
1227
1228void lrc_post_unpin(struct intel_context *ce)
1229{
1230	i915_gem_object_unpin_map(ce->state->obj);
1231}
1232
1233void lrc_fini(struct intel_context *ce)
1234{
1235	if (!ce->state)
1236		return;
1237
1238	intel_ring_put(fetch_and_zero(&ce->ring));
1239	i915_vma_put(fetch_and_zero(&ce->state));
1240}
1241
1242void lrc_destroy(struct kref *kref)
1243{
1244	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1245
1246	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1247	GEM_BUG_ON(intel_context_is_pinned(ce));
1248
1249	lrc_fini(ce);
1250
1251	intel_context_fini(ce);
1252	intel_context_free(ce);
1253}
1254
1255static u32 *
1256gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1257{
1258	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1259		MI_SRM_LRM_GLOBAL_GTT |
1260		MI_LRI_LRM_CS_MMIO;
1261	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1262	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1263		CTX_TIMESTAMP * sizeof(u32);
1264	*cs++ = 0;
1265
1266	*cs++ = MI_LOAD_REGISTER_REG |
1267		MI_LRR_SOURCE_CS_MMIO |
1268		MI_LRI_LRM_CS_MMIO;
1269	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1270	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1271
1272	*cs++ = MI_LOAD_REGISTER_REG |
1273		MI_LRR_SOURCE_CS_MMIO |
1274		MI_LRI_LRM_CS_MMIO;
1275	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1276	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1277
1278	return cs;
1279}
1280
1281static u32 *
1282gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1283{
1284	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1285
1286	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1287		MI_SRM_LRM_GLOBAL_GTT |
1288		MI_LRI_LRM_CS_MMIO;
1289	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1290	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1291		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1292	*cs++ = 0;
1293
1294	return cs;
1295}
1296
1297static u32 *
1298gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1299{
1300	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1301
1302	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1303		MI_SRM_LRM_GLOBAL_GTT |
1304		MI_LRI_LRM_CS_MMIO;
1305	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1306	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1307		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1308	*cs++ = 0;
1309
1310	*cs++ = MI_LOAD_REGISTER_REG |
1311		MI_LRR_SOURCE_CS_MMIO |
1312		MI_LRI_LRM_CS_MMIO;
1313	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1314	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1315
1316	return cs;
1317}
1318
/*
 * The bspec's tuning guide asks us to program a vertical watermark value of
 * 0x3FF.  However, this register is not saved/restored properly by the
 * hardware, so we're required to apply the desired value via the INDIRECT_CTX
 * batch buffer to ensure the value takes effect properly.  All other bits
 * in this register should remain at 0 (the hardware default).
 */
1326static u32 *
1327dg2_emit_draw_watermark_setting(u32 *cs)
1328{
1329	*cs++ = MI_LOAD_REGISTER_IMM(1);
1330	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1331	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1332
1333	return cs;
1334}
1335
1336static u32 *
1337gen12_invalidate_state_cache(u32 *cs)
1338{
1339	*cs++ = MI_LOAD_REGISTER_IMM(1);
1340	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
1341	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1342	return cs;
1343}
1344
1345static u32 *
1346gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1347{
1348	cs = gen12_emit_timestamp_wa(ce, cs);
1349	cs = gen12_emit_cmd_buf_wa(ce, cs);
1350	cs = gen12_emit_restore_scratch(ce, cs);
1351
1352	/* Wa_16013000631:dg2 */
1353	if (IS_DG2_G11(ce->engine->i915))
1354		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1355
1356	cs = gen12_emit_aux_table_inv(ce->engine, cs);
1357
1358	/* Wa_18022495364 */
1359	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
1360		cs = gen12_invalidate_state_cache(cs);
1361
1362	/* Wa_16014892111 */
1363	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1364	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
1365	    IS_DG2(ce->engine->i915))
1366		cs = dg2_emit_draw_watermark_setting(cs);
1367
1368	return cs;
1369}
1370
1371static u32 *
1372gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1373{
1374	cs = gen12_emit_timestamp_wa(ce, cs);
1375	cs = gen12_emit_restore_scratch(ce, cs);
1376
1377	/* Wa_16013000631:dg2 */
1378	if (IS_DG2_G11(ce->engine->i915))
1379		if (ce->engine->class == COMPUTE_CLASS)
1380			cs = gen8_emit_pipe_control(cs,
1381						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1382						    0);
1383
1384	return gen12_emit_aux_table_inv(ce->engine, cs);
1385}
1386
1387static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
1388{
1389	struct intel_gt *gt = ce->engine->gt;
1390	int mocs = gt->mocs.uc_index << 1;
1391
	/*
1393	 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
1394	 * main copy engine arbitration into round robin mode.  We
1395	 * additionally need to submit the following WABB blt command
1396	 * to produce 4 subblits with each subblit generating 0 byte
1397	 * write requests as WABB:
1398	 *
1399	 * XY_FASTCOLOR_BLT
1400	 *  BG0    -> 5100000E
1401	 *  BG1    -> 0000003F (Dest pitch)
1402	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
1403	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
1404	 *  BG4    -> scratch
1405	 *  BG5    -> scratch
1406	 *  BG6-12 -> 00000000
1407	 *  BG13   -> 20004004 (Surf. Width= 2,Surf. Height = 5 )
1408	 *  BG14   -> 00000010 (Qpitch = 4)
1409	 *  BG15   -> 00000000
1410	 */
1411	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
1412	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
1413	*cs++ = 0;
1414	*cs++ = 4 << 16 | 1;
1415	*cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1416	*cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1417	*cs++ = 0;
1418	*cs++ = 0;
1419	*cs++ = 0;
1420	*cs++ = 0;
1421	*cs++ = 0;
1422	*cs++ = 0;
1423	*cs++ = 0;
1424	*cs++ = 0x20004004;
1425	*cs++ = 0x10;
1426	*cs++ = 0;
1427
1428	return cs;
1429}
1430
1431static u32 *
1432xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
1433{
1434	/* Wa_16018031267, Wa_16018063123 */
1435	if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
1436		cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
1437
1438	return cs;
1439}
1440
1441static void
1442setup_per_ctx_bb(const struct intel_context *ce,
1443		 const struct intel_engine_cs *engine,
1444		 u32 *(*emit)(const struct intel_context *, u32 *))
1445{
1446	/* Place PER_CTX_BB on next page after INDIRECT_CTX */
1447	u32 * const start = context_wabb(ce, true);
1448	u32 *cs;
1449
1450	cs = emit(ce, start);
1451
	/* The PER_CTX_BB must be terminated manually */
1453	*cs++ = MI_BATCH_BUFFER_END;
1454
1455	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1456	lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
1457			     lrc_indirect_bb(ce) + PAGE_SIZE);
1458}
1459
1460static void
1461setup_indirect_ctx_bb(const struct intel_context *ce,
1462		      const struct intel_engine_cs *engine,
1463		      u32 *(*emit)(const struct intel_context *, u32 *))
1464{
1465	u32 * const start = context_wabb(ce, false);
1466	u32 *cs;
1467
1468	cs = emit(ce, start);
1469	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1470	while ((unsigned long)cs % CACHELINE_BYTES)
1471		*cs++ = MI_NOOP;
1472
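	/*
	 * The predicate-disable batch is placed at the fixed
	 * DG2_PREDICATE_RESULT_BB offset within this page, so the indirect
	 * context contents emitted above must fit entirely below that offset.
	 */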
1473	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1474	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1475
1476	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1477			       lrc_indirect_bb(ce),
1478			       (cs - start) * sizeof(*cs));
1479}
1480
1481/*
1482 * The context descriptor encodes various attributes of a context,
1483 * including its GTT address and some flags. Because it's fairly
1484 * expensive to calculate, we'll just do it once and cache the result,
1485 * which remains valid until the context is unpinned.
1486 *
1487 * This is what a descriptor looks like, from LSB to MSB::
1488 *
1489 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1490 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1491 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1492 *      bits 53-54:    mbz, reserved for use by hardware
1493 *      bits 55-63:    group ID, currently unused and set to 0
1494 *
1495 * Starting from Gen11, the upper dword of the descriptor has a new format:
1496 *
1497 *      bits 32-36:    reserved
1498 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
1500 *      bit 54:        mbz, reserved for use by hardware
1501 *      bits 55-60:    SW counter
1502 *      bits 61-63:    engine class
1503 *
1504 * On Xe_HP, the upper dword of the descriptor has a new format:
1505 *
1506 *      bits 32-37:    virtual function number
1507 *      bit 38:        mbz, reserved for use by hardware
1508 *      bits 39-54:    SW context ID
1509 *      bits 55-57:    reserved
1510 *      bits 58-63:    SW counter
1511 *
 * The engine info, SW context ID and SW counter together need to form a
 * unique number (Context ID) per lrc.
1514 */
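/*
 * Note that only the lower dword of the descriptor is assembled here; the
 * context ID fields described above live in the upper dword, which is
 * managed by the submission backend.
 */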
1515static u32 lrc_descriptor(const struct intel_context *ce)
1516{
1517	u32 desc;
1518
1519	desc = INTEL_LEGACY_32B_CONTEXT;
1520	if (i915_vm_is_4lvl(ce->vm))
1521		desc = INTEL_LEGACY_64B_CONTEXT;
1522	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1523
1524	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1525	if (GRAPHICS_VER(ce->vm->i915) == 8)
1526		desc |= GEN8_CTX_L3LLC_COHERENT;
1527
1528	return i915_ggtt_offset(ce->state) | desc;
1529}
1530
1531u32 lrc_update_regs(const struct intel_context *ce,
1532		    const struct intel_engine_cs *engine,
1533		    u32 head)
1534{
1535	struct intel_ring *ring = ce->ring;
1536	u32 *regs = ce->lrc_reg_state;
1537
1538	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1539	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1540
1541	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1542	regs[CTX_RING_HEAD] = head;
1543	regs[CTX_RING_TAIL] = ring->tail;
1544	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1545
1546	/* RPCS */
1547	if (engine->class == RENDER_CLASS) {
1548		regs[CTX_R_PWR_CLK_STATE] =
1549			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1550
1551		i915_oa_init_reg_state(ce, engine);
1552	}
1553
1554	if (ce->wa_bb_page) {
1555		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1556
1557		fn = gen12_emit_indirect_ctx_xcs;
1558		if (ce->engine->class == RENDER_CLASS)
1559			fn = gen12_emit_indirect_ctx_rcs;
1560
		/* Mutually exclusive with the global indirect bb */
1562		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1563		setup_indirect_ctx_bb(ce, engine, fn);
1564		setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
1565	}
1566
1567	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1568}
1569
1570void lrc_update_offsets(struct intel_context *ce,
1571			struct intel_engine_cs *engine)
1572{
1573	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1574}
1575
1576void lrc_check_regs(const struct intel_context *ce,
1577		    const struct intel_engine_cs *engine,
1578		    const char *when)
1579{
1580	const struct intel_ring *ring = ce->ring;
1581	u32 *regs = ce->lrc_reg_state;
1582	bool valid = true;
1583	int x;
1584
1585	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1586		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1587		       engine->name,
1588		       regs[CTX_RING_START],
1589		       i915_ggtt_offset(ring->vma));
1590		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1591		valid = false;
1592	}
1593
1594	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1595	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1596		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1597		       engine->name,
1598		       regs[CTX_RING_CTL],
1599		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1600		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1601		valid = false;
1602	}
1603
1604	x = lrc_ring_mi_mode(engine);
1605	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1606		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1607		       engine->name, regs[x + 1]);
1608		regs[x + 1] &= ~STOP_RING;
1609		regs[x + 1] |= STOP_RING << 16;
1610		valid = false;
1611	}
1612
1613	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1614}
1615
/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication as this is applied in a WA
 * batch where the values are only initialized once, so we cannot take the
 * register value at the beginning and reuse it further; hence we save its
 * value to memory, upload a constant value with bit21 set and then restore
 * it back with the saved value. To simplify the WA, a constant value is
 * formed by using the default value of this register. This shouldn't be a
 * problem because we are only modifying it for a short period and this batch
 * is non-preemptible. We could of course use additional instructions that
 * read the actual value of the register at that time and set our bit of
 * interest, but that would make the WA more complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
1632static u32 *
1633gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1634{
1635	/* NB no one else is allowed to scribble over scratch + 256! */
1636	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1637	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1638	*batch++ = intel_gt_scratch_offset(engine->gt,
1639					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1640	*batch++ = 0;
1641
1642	*batch++ = MI_LOAD_REGISTER_IMM(1);
1643	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1644	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1645
1646	batch = gen8_emit_pipe_control(batch,
1647				       PIPE_CONTROL_CS_STALL |
1648				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1649				       0);
1650
1651	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1652	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1653	*batch++ = intel_gt_scratch_offset(engine->gt,
1654					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1655	*batch++ = 0;
1656
1657	return batch;
1658}
1659
/*
 * Typically we only have one indirect_ctx and one per_ctx batch buffer,
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and to select them
 * based on some criteria. At the moment the batches always start at the
 * beginning of the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END, so NOOPs are
 * added as padding to make it cacheline aligned. MI_BATCH_BUFFER_END is
 * added to the per_ctx batch and the two together make a complete batch
 * buffer.
 */
1675static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1676{
1677	/* WaDisableCtxRestoreArbitration:bdw,chv */
1678	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1679
1680	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1681	if (IS_BROADWELL(engine->i915))
1682		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1683
1684	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1685	/* Actual scratch location is at 128 bytes offset */
1686	batch = gen8_emit_pipe_control(batch,
1687				       PIPE_CONTROL_FLUSH_L3 |
1688				       PIPE_CONTROL_STORE_DATA_INDEX |
1689				       PIPE_CONTROL_CS_STALL |
1690				       PIPE_CONTROL_QW_WRITE,
1691				       LRC_PPHWSP_SCRATCH_ADDR);
1692
1693	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1694
1695	/* Pad to end of cacheline */
1696	while ((unsigned long)batch % CACHELINE_BYTES)
1697		*batch++ = MI_NOOP;
1698
	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified, in terms of cache lines,
	 * in the CTX_RCS_INDIRECT_CTX register.
	 */
1704
1705	return batch;
1706}
1707
1708struct lri {
1709	i915_reg_t reg;
1710	u32 value;
1711};
1712
1713static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1714{
1715	GEM_BUG_ON(!count || count > 63);
1716
1717	*batch++ = MI_LOAD_REGISTER_IMM(count);
1718	do {
1719		*batch++ = i915_mmio_reg_offset(lri->reg);
1720		*batch++ = lri->value;
1721	} while (lri++, --count);
1722	*batch++ = MI_NOOP;
1723
1724	return batch;
1725}
1726
1727static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1728{
1729	static const struct lri lri[] = {
1730		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1731		{
1732			COMMON_SLICE_CHICKEN2,
1733			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1734				       0),
1735		},
1736
1737		/* BSpec: 11391 */
1738		{
1739			FF_SLICE_CHICKEN,
1740			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1741				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1742		},
1743
1744		/* BSpec: 11299 */
1745		{
1746			_3D_CHICKEN3,
1747			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1748				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1749		}
1750	};
1751
1752	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1753
1754	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1755	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1756
1757	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1758	batch = gen8_emit_pipe_control(batch,
1759				       PIPE_CONTROL_FLUSH_L3 |
1760				       PIPE_CONTROL_STORE_DATA_INDEX |
1761				       PIPE_CONTROL_CS_STALL |
1762				       PIPE_CONTROL_QW_WRITE,
1763				       LRC_PPHWSP_SCRATCH_ADDR);
1764
1765	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1766
1767	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1768	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * The EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on the device type (2x6 or 3x6) and needs to be updated
		 * based on which subslice is disabled, especially for 2x6
		 * devices. However, it is safe to load the default 3x6
		 * configuration instead of masking off the corresponding
		 * bits, because HW ignores the bits of a disabled subslice
		 * and drops down to the appropriate config. Please see
		 * render_state_setup() in i915_gem_render_state.c for the
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
1782		*batch++ = GEN9_MEDIA_POOL_STATE;
1783		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1784		*batch++ = 0x00777000;
1785		*batch++ = 0;
1786		*batch++ = 0;
1787		*batch++ = 0;
1788	}
1789
1790	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1791
1792	/* Pad to end of cacheline */
1793	while ((unsigned long)batch % CACHELINE_BYTES)
1794		*batch++ = MI_NOOP;
1795
1796	return batch;
1797}
1798
1799#define CTX_WA_BB_SIZE (PAGE_SIZE)
1800
1801static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1802{
1803	struct drm_i915_gem_object *obj;
1804	struct i915_vma *vma;
1805	int err;
1806
1807	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1808	if (IS_ERR(obj))
1809		return PTR_ERR(obj);
1810
1811	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1812	if (IS_ERR(vma)) {
1813		err = PTR_ERR(vma);
1814		goto err;
1815	}
1816
1817	engine->wa_ctx.vma = vma;
1818	return 0;
1819
1820err:
1821	i915_gem_object_put(obj);
1822	return err;
1823}
1824
1825void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1826{
1827	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1828}
1829
1830typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1831
1832void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1833{
1834	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1835	struct i915_wa_ctx_bb *wa_bb[] = {
1836		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1837	};
1838	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1839	struct i915_gem_ww_ctx ww;
1840	void *batch, *batch_ptr;
1841	unsigned int i;
1842	int err;
1843
1844	if (GRAPHICS_VER(engine->i915) >= 11 ||
1845	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1846		return;
1847
1848	if (GRAPHICS_VER(engine->i915) == 9) {
1849		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1850		wa_bb_fn[1] = NULL;
1851	} else if (GRAPHICS_VER(engine->i915) == 8) {
1852		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1853		wa_bb_fn[1] = NULL;
1854	}
1855
1856	err = lrc_create_wa_ctx(engine);
1857	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
1863		drm_err(&engine->i915->drm,
1864			"Ignoring context switch w/a allocation error:%d\n",
1865			err);
1866		return;
1867	}
1868
1869	if (!engine->wa_ctx.vma)
1870		return;
1871
1872	i915_gem_ww_ctx_init(&ww, true);
1873retry:
1874	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1875	if (!err)
1876		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1877	if (err)
1878		goto err;
1879
1880	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1881	if (IS_ERR(batch)) {
1882		err = PTR_ERR(batch);
1883		goto err_unpin;
1884	}
1885
1886	/*
1887	 * Emit the two workaround batch buffers, recording the offset from the
1888	 * start of the workaround batch buffer object for each and their
1889	 * respective sizes.
1890	 */
1891	batch_ptr = batch;
1892	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1893		wa_bb[i]->offset = batch_ptr - batch;
1894		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1895						  CACHELINE_BYTES))) {
1896			err = -EINVAL;
1897			break;
1898		}
1899		if (wa_bb_fn[i])
1900			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1901		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1902	}
1903	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1904
1905	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1906	__i915_gem_object_release_map(wa_ctx->vma->obj);
1907
1908	/* Verify that we can handle failure to setup the wa_ctx */
1909	if (!err)
1910		err = i915_inject_probe_error(engine->i915, -ENODEV);
1911
1912err_unpin:
1913	if (err)
1914		i915_vma_unpin(wa_ctx->vma);
1915err:
1916	if (err == -EDEADLK) {
1917		err = i915_gem_ww_ctx_backoff(&ww);
1918		if (!err)
1919			goto retry;
1920	}
1921	i915_gem_ww_ctx_fini(&ww);
1922
1923	if (err) {
1924		i915_vma_put(engine->wa_ctx.vma);
1925
1926		/* Clear all flags to prevent further use */
1927		memset(wa_ctx, 0, sizeof(*wa_ctx));
1928	}
1929}
1930
1931static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1932{
1933#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1934	stats->runtime.num_underflow++;
1935	stats->runtime.max_underflow =
1936		max_t(u32, stats->runtime.max_underflow, -dt);
1937#endif
1938}
1939
1940static u32 lrc_get_runtime(const struct intel_context *ce)
1941{
	/*
	 * We can either use ppHWSP[16], which is recorded before the context
	 * switch (and so excludes the cost of context switches), or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
1948	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1949}
1950
1951void lrc_update_runtime(struct intel_context *ce)
1952{
1953	struct intel_context_stats *stats = &ce->stats;
1954	u32 old;
1955	s32 dt;
1956
1957	old = stats->runtime.last;
1958	stats->runtime.last = lrc_get_runtime(ce);
1959	dt = stats->runtime.last - old;
1960	if (!dt)
1961		return;
1962
1963	if (unlikely(dt < 0)) {
1964		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1965			 old, stats->runtime.last, dt);
1966		st_runtime_underflow(stats, dt);
1967		return;
1968	}
1969
1970	ewma_runtime_add(&stats->runtime.avg, dt);
1971	stats->runtime.total += dt;
1972}
1973
1974#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1975#include "selftest_lrc.c"
1976#endif
1977