1// SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only)
2/* Copyright(c) 2014 - 2020 Intel Corporation */
3#include <linux/kernel.h>
4#include <linux/pci.h>
5#include <linux/completion.h>
6#include <linux/workqueue.h>
7#include <linux/delay.h>
8#include "adf_accel_devices.h"
9#include "adf_common_drv.h"
10#include "adf_pfvf_pf_msg.h"
11
12struct adf_fatal_error_data {
13	struct adf_accel_dev *accel_dev;
14	struct work_struct work;
15};
16
/* Workqueue the device reset work items are queued on */
static struct workqueue_struct *device_reset_wq;
/* Workqueue the SR-IOV re-enable work items are queued on */
static struct workqueue_struct *device_sriov_wq;
19
20static pci_ers_result_t adf_error_detected(struct pci_dev *pdev,
21					   pci_channel_state_t state)
22{
23	struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev);
24
25	dev_info(&pdev->dev, "Acceleration driver hardware error detected.\n");
26	if (!accel_dev) {
27		dev_err(&pdev->dev, "Can't find acceleration device\n");
28		return PCI_ERS_RESULT_DISCONNECT;
29	}
30
31	if (state == pci_channel_io_perm_failure) {
32		dev_err(&pdev->dev, "Can't recover from device error\n");
33		return PCI_ERS_RESULT_DISCONNECT;
34	}
35
36	set_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
37	if (accel_dev->hw_device->exit_arb) {
38		dev_dbg(&pdev->dev, "Disabling arbitration\n");
39		accel_dev->hw_device->exit_arb(accel_dev);
40	}
41	adf_error_notifier(accel_dev);
42	adf_pf2vf_notify_fatal_error(accel_dev);
43	adf_dev_restarting_notify(accel_dev);
44	adf_pf2vf_notify_restarting(accel_dev);
45	adf_pf2vf_wait_for_restarting_complete(accel_dev);
46	pci_clear_master(pdev);
47	adf_dev_down(accel_dev, false);
48
49	return PCI_ERS_RESULT_NEED_RESET;
50}
51
52/* reset dev data */
53struct adf_reset_dev_data {
54	int mode;
55	struct adf_accel_dev *accel_dev;
56	struct completion compl;
57	struct work_struct reset_work;
58};
59
60/* sriov dev data */
61struct adf_sriov_dev_data {
62	struct adf_accel_dev *accel_dev;
63	struct completion compl;
64	struct work_struct sriov_work;
65};
66
67void adf_reset_sbr(struct adf_accel_dev *accel_dev)
68{
69	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);
70	struct pci_dev *parent = pdev->bus->self;
71	u16 bridge_ctl = 0;
72
73	if (!parent)
74		parent = pdev;
75
76	if (!pci_wait_for_pending_transaction(pdev))
77		dev_info(&GET_DEV(accel_dev),
78			 "Transaction still in progress. Proceeding\n");
79
80	dev_info(&GET_DEV(accel_dev), "Secondary bus reset\n");
81
82	pci_read_config_word(parent, PCI_BRIDGE_CONTROL, &bridge_ctl);
83	bridge_ctl |= PCI_BRIDGE_CTL_BUS_RESET;
84	pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl);
85	msleep(100);
86	bridge_ctl &= ~PCI_BRIDGE_CTL_BUS_RESET;
87	pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl);
88	msleep(100);
89}
90EXPORT_SYMBOL_GPL(adf_reset_sbr);
91
/**
 * adf_reset_flr() - reset the device with pcie_flr()
 * @accel_dev: acceleration device to reset
 */
void adf_reset_flr(struct adf_accel_dev *accel_dev)
{
	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);

	pcie_flr(pdev);
}
EXPORT_SYMBOL_GPL(adf_reset_flr);
97
98void adf_dev_restore(struct adf_accel_dev *accel_dev)
99{
100	struct adf_hw_device_data *hw_device = accel_dev->hw_device;
101	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);
102
103	if (hw_device->reset_device) {
104		dev_info(&GET_DEV(accel_dev), "Resetting device qat_dev%d\n",
105			 accel_dev->accel_id);
106		hw_device->reset_device(accel_dev);
107		pci_restore_state(pdev);
108		pci_save_state(pdev);
109	}
110}
111
112static void adf_device_sriov_worker(struct work_struct *work)
113{
114	struct adf_sriov_dev_data *sriov_data =
115		container_of(work, struct adf_sriov_dev_data, sriov_work);
116
117	adf_reenable_sriov(sriov_data->accel_dev);
118	complete(&sriov_data->compl);
119}
120
121static void adf_device_reset_worker(struct work_struct *work)
122{
123	struct adf_reset_dev_data *reset_data =
124		  container_of(work, struct adf_reset_dev_data, reset_work);
125	struct adf_accel_dev *accel_dev = reset_data->accel_dev;
126	unsigned long wait_jiffies = msecs_to_jiffies(10000);
127	struct adf_sriov_dev_data sriov_data;
128
129	adf_dev_restarting_notify(accel_dev);
130	if (adf_dev_restart(accel_dev)) {
131		/* The device hanged and we can't restart it so stop here */
132		dev_err(&GET_DEV(accel_dev), "Restart device failed\n");
133		if (reset_data->mode == ADF_DEV_RESET_ASYNC)
134			kfree(reset_data);
135		WARN(1, "QAT: device restart failed. Device is unusable\n");
136		return;
137	}
138
139	sriov_data.accel_dev = accel_dev;
140	init_completion(&sriov_data.compl);
141	INIT_WORK(&sriov_data.sriov_work, adf_device_sriov_worker);
142	queue_work(device_sriov_wq, &sriov_data.sriov_work);
143	if (wait_for_completion_timeout(&sriov_data.compl, wait_jiffies))
144		adf_pf2vf_notify_restarted(accel_dev);
145
146	adf_dev_restarted_notify(accel_dev);
147	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
148
149	/* The dev is back alive. Notify the caller if in sync mode */
150	if (reset_data->mode == ADF_DEV_RESET_ASYNC)
151		kfree(reset_data);
152	else
153		complete(&reset_data->compl);
154}
155
156static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev,
157				      enum adf_dev_reset_mode mode)
158{
159	struct adf_reset_dev_data *reset_data;
160
161	if (!adf_dev_started(accel_dev) ||
162	    test_bit(ADF_STATUS_RESTARTING, &accel_dev->status))
163		return 0;
164
165	set_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
166	reset_data = kzalloc(sizeof(*reset_data), GFP_KERNEL);
167	if (!reset_data)
168		return -ENOMEM;
169	reset_data->accel_dev = accel_dev;
170	init_completion(&reset_data->compl);
171	reset_data->mode = mode;
172	INIT_WORK(&reset_data->reset_work, adf_device_reset_worker);
173	queue_work(device_reset_wq, &reset_data->reset_work);
174
175	/* If in sync mode wait for the result */
176	if (mode == ADF_DEV_RESET_SYNC) {
177		int ret = 0;
178		/* Maximum device reset time is 10 seconds */
179		unsigned long wait_jiffies = msecs_to_jiffies(10000);
180		unsigned long timeout = wait_for_completion_timeout(
181				   &reset_data->compl, wait_jiffies);
182		if (!timeout) {
183			dev_err(&GET_DEV(accel_dev),
184				"Reset device timeout expired\n");
185			cancel_work_sync(&reset_data->reset_work);
186			ret = -EFAULT;
187		}
188		kfree(reset_data);
189		return ret;
190	}
191	return 0;
192}
193
194static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev)
195{
196	struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev);
197	int res = 0;
198
199	if (!accel_dev) {
200		pr_err("QAT: Can't find acceleration device\n");
201		return PCI_ERS_RESULT_DISCONNECT;
202	}
203
204	if (!pdev->is_busmaster)
205		pci_set_master(pdev);
206	pci_restore_state(pdev);
207	pci_save_state(pdev);
208	res = adf_dev_up(accel_dev, false);
209	if (res && res != -EALREADY)
210		return PCI_ERS_RESULT_DISCONNECT;
211
212	adf_reenable_sriov(accel_dev);
213	adf_pf2vf_notify_restarted(accel_dev);
214	adf_dev_restarted_notify(accel_dev);
215	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
216	return PCI_ERS_RESULT_RECOVERED;
217}
218
219static void adf_resume(struct pci_dev *pdev)
220{
221	dev_info(&pdev->dev, "Acceleration driver reset completed\n");
222	dev_info(&pdev->dev, "Device is up and running\n");
223}
224
225const struct pci_error_handlers adf_err_handler = {
226	.error_detected = adf_error_detected,
227	.slot_reset = adf_slot_reset,
228	.resume = adf_resume,
229};
230EXPORT_SYMBOL_GPL(adf_err_handler);
231
232int adf_dev_autoreset(struct adf_accel_dev *accel_dev)
233{
234	if (accel_dev->autoreset_on_error)
235		return adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_ASYNC);
236
237	return 0;
238}
239
240static void adf_notify_fatal_error_worker(struct work_struct *work)
241{
242	struct adf_fatal_error_data *wq_data =
243			container_of(work, struct adf_fatal_error_data, work);
244	struct adf_accel_dev *accel_dev = wq_data->accel_dev;
245	struct adf_hw_device_data *hw_device = accel_dev->hw_device;
246
247	adf_error_notifier(accel_dev);
248
249	if (!accel_dev->is_vf) {
250		/* Disable arbitration to stop processing of new requests */
251		if (accel_dev->autoreset_on_error && hw_device->exit_arb)
252			hw_device->exit_arb(accel_dev);
253		if (accel_dev->pf.vf_info)
254			adf_pf2vf_notify_fatal_error(accel_dev);
255		adf_dev_autoreset(accel_dev);
256	}
257
258	kfree(wq_data);
259}
260
261int adf_notify_fatal_error(struct adf_accel_dev *accel_dev)
262{
263	struct adf_fatal_error_data *wq_data;
264
265	wq_data = kzalloc(sizeof(*wq_data), GFP_ATOMIC);
266	if (!wq_data)
267		return -ENOMEM;
268
269	wq_data->accel_dev = accel_dev;
270	INIT_WORK(&wq_data->work, adf_notify_fatal_error_worker);
271	adf_misc_wq_queue_work(&wq_data->work);
272
273	return 0;
274}
275
276int adf_init_aer(void)
277{
278	device_reset_wq = alloc_workqueue("qat_device_reset_wq",
279					  WQ_MEM_RECLAIM, 0);
280	if (!device_reset_wq)
281		return -EFAULT;
282
283	device_sriov_wq = alloc_workqueue("qat_device_sriov_wq", 0, 0);
284	if (!device_sriov_wq)
285		return -EFAULT;
286
287	return 0;
288}
289
290void adf_exit_aer(void)
291{
292	if (device_reset_wq)
293		destroy_workqueue(device_reset_wq);
294	device_reset_wq = NULL;
295
296	if (device_sriov_wq)
297		destroy_workqueue(device_sriov_wq);
298	device_sriov_wq = NULL;
299}
300