blob: 14e33fdf4006d03d0c08dcaeb6745b816fdb26c8 [file] [log] [blame]
Joel Stanley6453c892018-03-29 17:37:51 +11001From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: Michael Neuling <mikey@neuling.org>
3Date: Mon, 26 Mar 2018 15:17:07 +1100
Joel Stanley83360db2018-04-11 15:11:59 +09304Subject: [PATCH 2/4] powerpc/eeh: Fix race with driver un/bind
Joel Stanley6453c892018-03-29 17:37:51 +11005
Joel Stanley83360db2018-04-11 15:11:59 +09306The current EEH callbacks can race with a driver unbind. This can
7result in backtraces like this:
Joel Stanley6453c892018-03-29 17:37:51 +11008
Joel Stanley83360db2018-04-11 15:11:59 +09309 EEH: Frozen PHB#0-PE#1fc detected
10 EEH: PE location: S000009, PHB location: N/A
11 CPU: 2 PID: 2312 Comm: kworker/u258:3 Not tainted 4.15.6-openpower1 #2
12 Workqueue: nvme-wq nvme_reset_work [nvme]
13 Call Trace:
14 dump_stack+0x9c/0xd0 (unreliable)
15 eeh_dev_check_failure+0x420/0x470
16 eeh_check_failure+0xa0/0xa4
17 nvme_reset_work+0x138/0x1414 [nvme]
18 process_one_work+0x1ec/0x328
19 worker_thread+0x2e4/0x3a8
20 kthread+0x14c/0x154
21 ret_from_kernel_thread+0x5c/0xc8
22 nvme nvme1: Removing after probe failure status: -19
23 <snip>
24 cpu 0x23: Vector: 300 (Data Access) at [c000000ff50f3800]
25 pc: c0080000089a0eb0: nvme_error_detected+0x4c/0x90 [nvme]
26 lr: c000000000026564: eeh_report_error+0xe0/0x110
27 sp: c000000ff50f3a80
28 msr: 9000000000009033
29 dar: 400
30 dsisr: 40000000
31 current = 0xc000000ff507c000
32 paca = 0xc00000000fdc9d80 softe: 0 irq_happened: 0x01
33 pid = 782, comm = eehd
34 Linux version 4.15.6-openpower1 (smc@smc-desktop) (gcc version 6.4.0 (Buildroot 2017.11.2-00008-g4b6188e)) #2 SMP Tue Feb 27 12:33:27 PST 2018
35 enter ? for help
36 eeh_report_error+0xe0/0x110
37 eeh_pe_dev_traverse+0xc0/0xdc
38 eeh_handle_normal_event+0x184/0x4c4
39 eeh_handle_event+0x30/0x288
40 eeh_event_handler+0x124/0x170
41 kthread+0x14c/0x154
42 ret_from_kernel_thread+0x5c/0xc8
Joel Stanley6453c892018-03-29 17:37:51 +110043
44The first part is an EEH (on boot), the second half is the resulting
45crash. nvme probe starts the nvme_reset_work() worker thread. This
46worker thread starts touching the device which sees a device error
47(EEH) and hence queues up an event in the powerpc EEH worker
48thread. nvme_reset_work() then continues and runs
49nvme_remove_dead_ctrl_work() which results in unbinding the driver
50from the device and hence releases all resources. At the same time,
51the EEH worker thread starts doing the EEH .error_detected() driver
52callback, which no longer works since the resources have been freed.
53
54This fixes the problem in the same way the generic PCIe AER code (in
55drivers/pci/pcie/aer/aerdrv_core.c) does. It makes the EEH code hold
56the device_lock() while performing the driver EEH callbacks and
57associated code. This ensures either the callbacks are no longer
58registered, or if they are registered the driver will not be removed
59from underneath us.
60
61This has been broken forever. The EEH callbacks were first introduced
62in 2005 (in 77bd7415610) but it's not clear if a lock was needed back
63then.
64
Joel Stanley83360db2018-04-11 15:11:59 +093065Fixes: 77bd74156101 ("[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines")
66Cc: stable@vger.kernel.org # v2.6.16+
Joel Stanley6453c892018-03-29 17:37:51 +110067Signed-off-by: Michael Neuling <mikey@neuling.org>
68Reviewed-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Joel Stanley83360db2018-04-11 15:11:59 +093069Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
70(cherry picked from commit f0295e047fcf52ccb42561fb7de6942f5201b676)
Joel Stanley6453c892018-03-29 17:37:51 +110071Signed-off-by: Joel Stanley <joel@jms.id.au>
72---
Joel Stanley83360db2018-04-11 15:11:59 +093073 arch/powerpc/kernel/eeh_driver.c | 68 ++++++++++++++++++++------------
74 1 file changed, 42 insertions(+), 26 deletions(-)
Joel Stanley6453c892018-03-29 17:37:51 +110075
76diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
Joel Stanley83360db2018-04-11 15:11:59 +093077index 0c0b66fc5bfb..295ba833846e 100644
Joel Stanley6453c892018-03-29 17:37:51 +110078--- a/arch/powerpc/kernel/eeh_driver.c
79+++ b/arch/powerpc/kernel/eeh_driver.c
80@@ -207,18 +207,18 @@ static void *eeh_report_error(void *data, void *userdata)
81
82 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
83 return NULL;
84+
85+ device_lock(&dev->dev);
86 dev->error_state = pci_channel_io_frozen;
87
88 driver = eeh_pcid_get(dev);
89- if (!driver) return NULL;
90+ if (!driver) goto out_no_dev;
91
92 eeh_disable_irq(dev);
93
94 if (!driver->err_handler ||
95- !driver->err_handler->error_detected) {
96- eeh_pcid_put(dev);
97- return NULL;
98- }
99+ !driver->err_handler->error_detected)
100+ goto out;
101
102 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
103
Joel Stanley83360db2018-04-11 15:11:59 +0930104@@ -227,8 +227,12 @@ static void *eeh_report_error(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100105 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
106
107 edev->in_error = true;
Joel Stanley83360db2018-04-11 15:11:59 +0930108- eeh_pcid_put(dev);
109 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
110+
Joel Stanley6453c892018-03-29 17:37:51 +1100111+out:
Joel Stanley83360db2018-04-11 15:11:59 +0930112+ eeh_pcid_put(dev);
Joel Stanley6453c892018-03-29 17:37:51 +1100113+out_no_dev:
114+ device_unlock(&dev->dev);
115 return NULL;
116 }
117
Joel Stanley83360db2018-04-11 15:11:59 +0930118@@ -251,15 +255,14 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100119 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
120 return NULL;
121
122+ device_lock(&dev->dev);
123 driver = eeh_pcid_get(dev);
124- if (!driver) return NULL;
125+ if (!driver) goto out_no_dev;
126
127 if (!driver->err_handler ||
128 !driver->err_handler->mmio_enabled ||
129- (edev->mode & EEH_DEV_NO_HANDLER)) {
130- eeh_pcid_put(dev);
131- return NULL;
132- }
133+ (edev->mode & EEH_DEV_NO_HANDLER))
134+ goto out;
135
136 rc = driver->err_handler->mmio_enabled(dev);
137
Joel Stanley83360db2018-04-11 15:11:59 +0930138@@ -267,7 +270,10 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100139 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
140 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
141
142+out:
143 eeh_pcid_put(dev);
144+out_no_dev:
145+ device_unlock(&dev->dev);
146 return NULL;
147 }
148
Joel Stanley83360db2018-04-11 15:11:59 +0930149@@ -290,20 +296,20 @@ static void *eeh_report_reset(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100150
151 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
152 return NULL;
153+
154+ device_lock(&dev->dev);
155 dev->error_state = pci_channel_io_normal;
156
157 driver = eeh_pcid_get(dev);
158- if (!driver) return NULL;
159+ if (!driver) goto out_no_dev;
160
161 eeh_enable_irq(dev);
162
163 if (!driver->err_handler ||
164 !driver->err_handler->slot_reset ||
165 (edev->mode & EEH_DEV_NO_HANDLER) ||
166- (!edev->in_error)) {
167- eeh_pcid_put(dev);
168- return NULL;
169- }
170+ (!edev->in_error))
171+ goto out;
172
173 rc = driver->err_handler->slot_reset(dev);
174 if ((*res == PCI_ERS_RESULT_NONE) ||
Joel Stanley83360db2018-04-11 15:11:59 +0930175@@ -311,7 +317,10 @@ static void *eeh_report_reset(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100176 if (*res == PCI_ERS_RESULT_DISCONNECT &&
177 rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
178
179+out:
180 eeh_pcid_put(dev);
181+out_no_dev:
182+ device_unlock(&dev->dev);
183 return NULL;
184 }
185
Joel Stanley83360db2018-04-11 15:11:59 +0930186@@ -362,10 +371,12 @@ static void *eeh_report_resume(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100187
188 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
189 return NULL;
190+
191+ device_lock(&dev->dev);
192 dev->error_state = pci_channel_io_normal;
193
194 driver = eeh_pcid_get(dev);
195- if (!driver) return NULL;
196+ if (!driver) goto out_no_dev;
197
198 was_in_error = edev->in_error;
199 edev->in_error = false;
Joel Stanley83360db2018-04-11 15:11:59 +0930200@@ -375,18 +386,20 @@ static void *eeh_report_resume(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100201 !driver->err_handler->resume ||
202 (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
203 edev->mode &= ~EEH_DEV_NO_HANDLER;
204- eeh_pcid_put(dev);
205- return NULL;
206+ goto out;
207 }
208
209 driver->err_handler->resume(dev);
210
Joel Stanley83360db2018-04-11 15:11:59 +0930211- eeh_pcid_put(dev);
212 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
Joel Stanley6453c892018-03-29 17:37:51 +1100213+out:
Joel Stanley83360db2018-04-11 15:11:59 +0930214+ eeh_pcid_put(dev);
215 #ifdef CONFIG_PCI_IOV
216 if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
217 eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
218 #endif
Joel Stanley6453c892018-03-29 17:37:51 +1100219+out_no_dev:
220+ device_unlock(&dev->dev);
221 return NULL;
222 }
223
Joel Stanley83360db2018-04-11 15:11:59 +0930224@@ -406,23 +419,26 @@ static void *eeh_report_failure(void *data, void *userdata)
Joel Stanley6453c892018-03-29 17:37:51 +1100225
226 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
227 return NULL;
228+
229+ device_lock(&dev->dev);
230 dev->error_state = pci_channel_io_perm_failure;
231
232 driver = eeh_pcid_get(dev);
233- if (!driver) return NULL;
234+ if (!driver) goto out_no_dev;
235
236 eeh_disable_irq(dev);
237
238 if (!driver->err_handler ||
239- !driver->err_handler->error_detected) {
240- eeh_pcid_put(dev);
241- return NULL;
242- }
243+ !driver->err_handler->error_detected)
244+ goto out;
245
246 driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
Joel Stanley83360db2018-04-11 15:11:59 +0930247
248- eeh_pcid_put(dev);
249 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
Joel Stanley6453c892018-03-29 17:37:51 +1100250+out:
Joel Stanley83360db2018-04-11 15:11:59 +0930251+ eeh_pcid_put(dev);
Joel Stanley6453c892018-03-29 17:37:51 +1100252+out_no_dev:
253+ device_unlock(&dev->dev);
254 return NULL;
255 }
256