From: James Cowgill <jcowgill@debian.org>
Date: Sun, 11 Aug 2019 16:50:56 +0100
Subject: avcodec/arm/sbcenc: avoid callee preserved vfp registers

Upstream-Status: Inappropriate

The RPI-Distro repo clones the original ffmpeg and applies patches to
enable Raspberry Pi support.

When compiling FFmpeg with GCC 9, seemingly random segfaults were
observed in code that had previously called down into the SBC encoder
NEON assembly routines. This was caused by these functions clobbering
some of the VFP callee-saved registers (d8-d15, aka q4-q7). GCC was
using these registers to hold local variables, but after these
functions returned, they would contain garbage.

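In AAPCS terms, the failure mode looks roughly like this (an
illustrative sketch with made-up register use and values, not code
from this patch):

        /* caller compiled by GCC keeps a local in callee-saved d8 */
        vmov.f64        d8, #1.0                /* local value lives in d8 (half of q4) */
        bl              ff_sbc_analyze_4_neon   /* old code clobbered d8 without saving it */
        vadd.f64        d0, d8, d8              /* reads garbage instead of 1.0 */
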
Fix by reallocating the registers in the two affected functions in
the following way:
 ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
 ff_sbc_analyze_8_neon: q2-q9 => q8-q15

The reason for using these replacements is to keep closely related
sets of registers consecutively numbered, which hopefully makes the
code easier to follow. Since this commit only reallocates registers,
it should have no performance impact.

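For comparison, the alternative fix would have been to keep the
original allocation and save the callee-saved registers across the
call, as in this minimal sketch (hypothetical, not part of this
patch; shown for ff_sbc_analyze_4_neon, whose original code used
q4-q5, i.e. d8-d11):

        vpush           {d8-d11}        /* save callee-saved VFP regs per AAPCS */
        /* ... original body, still free to use q2-q5 ... */
        vpop            {d8-d11}        /* restore before returning */

Renaming to the caller-saved q8-q15 range avoids that extra stack
traffic on every call.
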
Signed-off-by: James Cowgill <jcowgill@debian.org>
---
 libavcodec/arm/sbcdsp_neon.S | 220 +++++++++++++++++++++----------------------
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
index d83d21d..914abfb 100644
--- a/libavcodec/arm/sbcdsp_neon.S
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
     /* TODO: merge even and odd cases (or even merge all four calls to this
      * function) in order to have only aligned reads from 'in' array
      * and reduce number of load instructions */
-        vld1.16         {d4, d5}, [r0, :64]!
-        vld1.16         {d8, d9}, [r2, :128]!
+        vld1.16         {d16, d17}, [r0, :64]!
+        vld1.16         {d20, d21}, [r2, :128]!

-        vmull.s16       q0, d4, d8
-        vld1.16         {d6, d7}, [r0, :64]!
-        vmull.s16       q1, d5, d9
-        vld1.16         {d10, d11}, [r2, :128]!
+        vmull.s16       q0, d16, d20
+        vld1.16         {d18, d19}, [r0, :64]!
+        vmull.s16       q1, d17, d21
+        vld1.16         {d22, d23}, [r2, :128]!

-        vmlal.s16       q0, d6, d10
-        vld1.16         {d4, d5}, [r0, :64]!
-        vmlal.s16       q1, d7, d11
-        vld1.16         {d8, d9}, [r2, :128]!
+        vmlal.s16       q0, d18, d22
+        vld1.16         {d16, d17}, [r0, :64]!
+        vmlal.s16       q1, d19, d23
+        vld1.16         {d20, d21}, [r2, :128]!

-        vmlal.s16       q0, d4, d8
-        vld1.16         {d6, d7}, [r0, :64]!
-        vmlal.s16       q1, d5, d9
-        vld1.16         {d10, d11}, [r2, :128]!
+        vmlal.s16       q0, d16, d20
+        vld1.16         {d18, d19}, [r0, :64]!
+        vmlal.s16       q1, d17, d21
+        vld1.16         {d22, d23}, [r2, :128]!

-        vmlal.s16       q0, d6, d10
-        vld1.16         {d4, d5}, [r0, :64]!
-        vmlal.s16       q1, d7, d11
-        vld1.16         {d8, d9}, [r2, :128]!
+        vmlal.s16       q0, d18, d22
+        vld1.16         {d16, d17}, [r0, :64]!
+        vmlal.s16       q1, d19, d23
+        vld1.16         {d20, d21}, [r2, :128]!

-        vmlal.s16       q0, d4, d8
-        vmlal.s16       q1, d5, d9
+        vmlal.s16       q0, d16, d20
+        vmlal.s16       q1, d17, d21

         vpadd.s32       d0, d0, d1
         vpadd.s32       d1, d2, d3

         vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE

-        vld1.16         {d2, d3, d4, d5}, [r2, :128]!
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]!

         vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
         vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

-        vmull.s16       q3, d2, d0
-        vmull.s16       q4, d3, d0
-        vmlal.s16       q3, d4, d1
-        vmlal.s16       q4, d5, d1
+        vmull.s16       q10, d16, d0
+        vmull.s16       q11, d17, d0
+        vmlal.s16       q10, d18, d1
+        vmlal.s16       q11, d19, d1

-        vpadd.s32       d0, d6, d7  /* TODO: can be eliminated */
-        vpadd.s32       d1, d8, d9  /* TODO: can be eliminated */
+        vpadd.s32       d0, d20, d21  /* TODO: can be eliminated */
+        vpadd.s32       d1, d22, d23  /* TODO: can be eliminated */

         vst1.32         {d0, d1}, [r1, :128]

@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
     /* TODO: merge even and odd cases (or even merge all four calls to this
      * function) in order to have only aligned reads from 'in' array
      * and reduce number of load instructions */
-        vld1.16         {d4, d5}, [r0, :64]!
-        vld1.16         {d8, d9}, [r2, :128]!
-
-        vmull.s16       q6, d4, d8
-        vld1.16         {d6, d7}, [r0, :64]!
-        vmull.s16       q7, d5, d9
-        vld1.16         {d10, d11}, [r2, :128]!
-        vmull.s16       q8, d6, d10
-        vld1.16         {d4, d5}, [r0, :64]!
-        vmull.s16       q9, d7, d11
-        vld1.16         {d8, d9}, [r2, :128]!
-
-        vmlal.s16       q6, d4, d8
-        vld1.16         {d6, d7}, [r0, :64]!
-        vmlal.s16       q7, d5, d9
-        vld1.16         {d10, d11}, [r2, :128]!
-        vmlal.s16       q8, d6, d10
-        vld1.16         {d4, d5}, [r0, :64]!
-        vmlal.s16       q9, d7, d11
-        vld1.16         {d8, d9}, [r2, :128]!
-
-        vmlal.s16       q6, d4, d8
-        vld1.16         {d6, d7}, [r0, :64]!
-        vmlal.s16       q7, d5, d9
-        vld1.16         {d10, d11}, [r2, :128]!
-        vmlal.s16       q8, d6, d10
-        vld1.16         {d4, d5}, [r0, :64]!
-        vmlal.s16       q9, d7, d11
-        vld1.16         {d8, d9}, [r2, :128]!
-
-        vmlal.s16       q6, d4, d8
-        vld1.16         {d6, d7}, [r0, :64]!
-        vmlal.s16       q7, d5, d9
-        vld1.16         {d10, d11}, [r2, :128]!
-        vmlal.s16       q8, d6, d10
-        vld1.16         {d4, d5}, [r0, :64]!
-        vmlal.s16       q9, d7, d11
-        vld1.16         {d8, d9}, [r2, :128]!
-
-        vmlal.s16       q6, d4, d8
-        vld1.16         {d6, d7}, [r0, :64]!
-        vmlal.s16       q7, d5, d9
-        vld1.16         {d10, d11}, [r2, :128]!
-
-        vmlal.s16       q8, d6, d10
-        vmlal.s16       q9, d7, d11
-
-        vpadd.s32       d0, d12, d13
-        vpadd.s32       d1, d14, d15
-        vpadd.s32       d2, d16, d17
-        vpadd.s32       d3, d18, d19
+        vld1.16         {d16, d17}, [r0, :64]!
+        vld1.16         {d20, d21}, [r2, :128]!
+
+        vmull.s16       q12, d16, d20
+        vld1.16         {d18, d19}, [r0, :64]!
+        vmull.s16       q13, d17, d21
+        vld1.16         {d22, d23}, [r2, :128]!
+        vmull.s16       q14, d18, d22
+        vld1.16         {d16, d17}, [r0, :64]!
+        vmull.s16       q15, d19, d23
+        vld1.16         {d20, d21}, [r2, :128]!
+
+        vmlal.s16       q12, d16, d20
+        vld1.16         {d18, d19}, [r0, :64]!
+        vmlal.s16       q13, d17, d21
+        vld1.16         {d22, d23}, [r2, :128]!
+        vmlal.s16       q14, d18, d22
+        vld1.16         {d16, d17}, [r0, :64]!
+        vmlal.s16       q15, d19, d23
+        vld1.16         {d20, d21}, [r2, :128]!
+
+        vmlal.s16       q12, d16, d20
+        vld1.16         {d18, d19}, [r0, :64]!
+        vmlal.s16       q13, d17, d21
+        vld1.16         {d22, d23}, [r2, :128]!
+        vmlal.s16       q14, d18, d22
+        vld1.16         {d16, d17}, [r0, :64]!
+        vmlal.s16       q15, d19, d23
+        vld1.16         {d20, d21}, [r2, :128]!
+
+        vmlal.s16       q12, d16, d20
+        vld1.16         {d18, d19}, [r0, :64]!
+        vmlal.s16       q13, d17, d21
+        vld1.16         {d22, d23}, [r2, :128]!
+        vmlal.s16       q14, d18, d22
+        vld1.16         {d16, d17}, [r0, :64]!
+        vmlal.s16       q15, d19, d23
+        vld1.16         {d20, d21}, [r2, :128]!
+
+        vmlal.s16       q12, d16, d20
+        vld1.16         {d18, d19}, [r0, :64]!
+        vmlal.s16       q13, d17, d21
+        vld1.16         {d22, d23}, [r2, :128]!
+
+        vmlal.s16       q14, d18, d22
+        vmlal.s16       q15, d19, d23
+
+        vpadd.s32       d0, d24, d25
+        vpadd.s32       d1, d26, d27
+        vpadd.s32       d2, d28, d29
+        vpadd.s32       d3, d30, d31

         vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
         vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
@@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1
         vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

-        vld1.16         {d4, d5}, [r2, :128]!
-        vmull.s16       q6, d4, d0
-        vld1.16         {d6, d7}, [r2, :128]!
-        vmull.s16       q7, d5, d0
-        vmull.s16       q8, d6, d0
-        vmull.s16       q9, d7, d0
-
-        vld1.16         {d4, d5}, [r2, :128]!
-        vmlal.s16       q6, d4, d1
-        vld1.16         {d6, d7}, [r2, :128]!
-        vmlal.s16       q7, d5, d1
-        vmlal.s16       q8, d6, d1
-        vmlal.s16       q9, d7, d1
-
-        vld1.16         {d4, d5}, [r2, :128]!
-        vmlal.s16       q6, d4, d2
-        vld1.16         {d6, d7}, [r2, :128]!
-        vmlal.s16       q7, d5, d2
-        vmlal.s16       q8, d6, d2
-        vmlal.s16       q9, d7, d2
-
-        vld1.16         {d4, d5}, [r2, :128]!
-        vmlal.s16       q6, d4, d3
-        vld1.16         {d6, d7}, [r2, :128]!
-        vmlal.s16       q7, d5, d3
-        vmlal.s16       q8, d6, d3
-        vmlal.s16       q9, d7, d3
-
-        vpadd.s32       d0, d12, d13  /* TODO: can be eliminated */
-        vpadd.s32       d1, d14, d15  /* TODO: can be eliminated */
-        vpadd.s32       d2, d16, d17  /* TODO: can be eliminated */
-        vpadd.s32       d3, d18, d19  /* TODO: can be eliminated */
+        vld1.16         {d16, d17}, [r2, :128]!
+        vmull.s16       q12, d16, d0
+        vld1.16         {d18, d19}, [r2, :128]!
+        vmull.s16       q13, d17, d0
+        vmull.s16       q14, d18, d0
+        vmull.s16       q15, d19, d0
+
+        vld1.16         {d16, d17}, [r2, :128]!
+        vmlal.s16       q12, d16, d1
+        vld1.16         {d18, d19}, [r2, :128]!
+        vmlal.s16       q13, d17, d1
+        vmlal.s16       q14, d18, d1
+        vmlal.s16       q15, d19, d1
+
+        vld1.16         {d16, d17}, [r2, :128]!
+        vmlal.s16       q12, d16, d2
+        vld1.16         {d18, d19}, [r2, :128]!
+        vmlal.s16       q13, d17, d2
+        vmlal.s16       q14, d18, d2
+        vmlal.s16       q15, d19, d2
+
+        vld1.16         {d16, d17}, [r2, :128]!
+        vmlal.s16       q12, d16, d3
+        vld1.16         {d18, d19}, [r2, :128]!
+        vmlal.s16       q13, d17, d3
+        vmlal.s16       q14, d18, d3
+        vmlal.s16       q15, d19, d3
+
+        vpadd.s32       d0, d24, d25  /* TODO: can be eliminated */
+        vpadd.s32       d1, d26, d27  /* TODO: can be eliminated */
+        vpadd.s32       d2, d28, d29  /* TODO: can be eliminated */
+        vpadd.s32       d3, d30, d31  /* TODO: can be eliminated */

         vst1.32         {d0, d1, d2, d3}, [r1, :128]
