blob: ead099545df75d5fdf2cbbef1c510a3be59ac02e [file] [log] [blame]
Brad Bishop19323692019-04-05 15:28:33 -04001From 947f79f97a5fa6547d99bff282606026632e010b Mon Sep 17 00:00:00 2001
2From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@gmail.com>
3Date: Sat, 13 Oct 2018 23:01:11 +0200
4Subject: [PATCH] Use ARM-NEON accelaration for float-multithreaded setups
5
6Profiling shows a considerable performance win. See estimated number of voices:
7They increase from ~471 to ~513 which makes a ~9% win:
8
9*******************************************************************************
10WITHOUT ARM NEON:
11*******************************************************************************
12
13morona@raspberrypi3:~$ fluidsynth -o synth.cpu-cores=4 -o synth.chorus.active=0 -o synth.reverb.active=0 /usr/share/sf2/fluidr3gm.sf2
14> prof_set_print 1
15> prof_set_notes 10
16> prof_start 3 10000
17Generating 10 notes, generated voices:20
18Number of measures(n_prof):3, duration of one mesure(dur):10000ms
19
20Profiling time(mm:ss): Total=0:30 Remainder=0:30, press <ENTER> to cancel
21 ------------------------------------------------------------------------------
22 Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond)
23 ------------------------------------------------------------------------------
24 Code under profiling |Voices| Duration (microsecond) | Load(%)
25 | nbr| min| avg| max|
26 ---------------------------|------|--------------------------------|----------
27 synth_write_* ------------>| 20| 112.00| 118.96| 206.00| 4.461
28 synth_one_block ---------->| 20| 109.00| 116.44| 196.00| 4.367
29 synth_one_block:clear ---->| 20| 1.00| 1.67| 18.00| 0.063
30 synth_one_block:one voice->| 1| 11.00| 12.36| 58.00| 0.463
31 synth_one_block:all voices>| 20| 107.00| 113.47| 187.00| 4.255
32 synth_one_block:reverb --->| no profiling available
33 synth_one_block:chorus --->| no profiling available
34 voice:note --------------->| no profiling available
35 voice:release ------------>| no profiling available
36 ------------------------------------------------------------------------------
37 Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices
38 ------------------------------------------------------------------------------
39 nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices
40 -------|---------|---------|----------|---------|---------|-------------------
41 20| 4.461| 4.461| 0.000| 0.000| 0.213| 470
42
43Profiling time(mm:ss): Total=0:30 Remainder=0:20, press <ENTER> to cancel
44 ------------------------------------------------------------------------------
45 Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond)
46 ------------------------------------------------------------------------------
47 Code under profiling |Voices| Duration (microsecond) | Load(%)
48 | nbr| min| avg| max|
49 ---------------------------|------|--------------------------------|----------
50 synth_write_* ------------>| 20| 112.00| 118.42| 216.00| 4.441
51 synth_one_block ---------->| 20| 109.00| 115.91| 205.00| 4.347
52 synth_one_block:clear ---->| 20| 1.00| 1.65| 18.00| 0.062
53 synth_one_block:one voice->| 1| 11.00| 12.30| 58.00| 0.461
54 synth_one_block:all voices>| 20| 107.00| 112.98| 197.00| 4.237
55 synth_one_block:reverb --->| no profiling available
56 synth_one_block:chorus --->| no profiling available
57 voice:note --------------->| no profiling available
58 voice:release ------------>| no profiling available
59 ------------------------------------------------------------------------------
60 Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices
61 ------------------------------------------------------------------------------
62 nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices
63 -------|---------|---------|----------|---------|---------|-------------------
64 20| 4.441| 4.441| 0.000| 0.000| 0.212| 472
65
66Profiling time(mm:ss): Total=0:30 Remainder=0:10, press <ENTER> to cancel
67 ------------------------------------------------------------------------------
68 Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond)
69 ------------------------------------------------------------------------------
70 Code under profiling |Voices| Duration (microsecond) | Load(%)
71 | nbr| min| avg| max|
72 ---------------------------|------|--------------------------------|----------
73 synth_write_* ------------>| 20| 112.00| 118.64| 244.00| 4.449
74 synth_one_block ---------->| 20| 109.00| 116.12| 234.00| 4.355
75 synth_one_block:clear ---->| 20| 1.00| 1.67| 37.00| 0.062
76 synth_one_block:one voice->| 1| 11.00| 12.31| 63.00| 0.462
77 synth_one_block:all voices>| 20| 107.00| 113.18| 214.00| 4.244
78 synth_one_block:reverb --->| no profiling available
79 synth_one_block:chorus --->| no profiling available
80 voice:note --------------->| no profiling available
81 voice:release ------------>| no profiling available
82 ------------------------------------------------------------------------------
83 Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices
84 ------------------------------------------------------------------------------
85 nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices
86 -------|---------|---------|----------|---------|---------|-------------------
87 20| 4.449| 4.449| 0.000| 0.000| 0.212| 471
88Stopping 20 voices...voices stopped.
89> quit
90cheers!
91JackTemporaryException : now quits...
92Jack main caught signal 2
93fluid_profiling_print
94fluidsynth: Estimated times: min/avg/max (micro seconds)
95fluidsynth: synth_write_* ------------>: 112.000/118.636/244.000
96fluidsynth: synth_one_block ---------->: 109.000/116.124/234.000
97fluidsynth: synth_one_block:clear ---->: 1.000/1.665/37.000
98fluidsynth: synth_one_block:one voice->: 11.000/12.309/63.000
99fluidsynth: synth_one_block:all voices>: 107.000/113.180/214.000
100
101*******************************************************************************
102WITH ARM NEON:
103*******************************************************************************
104
105morona@raspberrypi3:~$ fluidsynth -o synth.cpu-cores=4 -o synth.chorus.active=0 -o synth.reverb.active=0 /usr/share/sf2/fluidr3gm.sf2
106> prof_set_print 1
107> prof_set_notes 10
108> prof_start 3 10000
109Generating 10 notes, generated voices:20
110Number of measures(n_prof):3, duration of one mesure(dur):10000ms
111
112Profiling time(mm:ss): Total=0:30 Remainder=0:30, press <ENTER> to cancel
113 ------------------------------------------------------------------------------
114 Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond)
115 ------------------------------------------------------------------------------
116 Code under profiling |Voices| Duration (microsecond) | Load(%)
117 | nbr| min| avg| max|
118 ---------------------------|------|--------------------------------|----------
119 synth_write_* ------------>| 20| 102.00| 109.21| 213.00| 4.095
120 synth_one_block ---------->| 20| 99.00| 106.68| 201.00| 4.001
121 synth_one_block:clear ---->| 20| 1.00| 1.64| 18.00| 0.062
122 synth_one_block:one voice->| 1| 11.00| 12.30| 54.00| 0.461
123 synth_one_block:all voices>| 20| 97.00| 103.71| 188.00| 3.889
124 synth_one_block:reverb --->| no profiling available
125 synth_one_block:chorus --->| no profiling available
126 voice:note --------------->| no profiling available
127 voice:release ------------>| no profiling available
128 ------------------------------------------------------------------------------
129 Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices
130 ------------------------------------------------------------------------------
131 nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices
132 -------|---------|---------|----------|---------|---------|-------------------
133 20| 4.095| 4.095| 0.000| 0.000| 0.194| 514
134
135Profiling time(mm:ss): Total=0:30 Remainder=0:20, press <ENTER> to cancel
136 ------------------------------------------------------------------------------
137 Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond)
138 ------------------------------------------------------------------------------
139 Code under profiling |Voices| Duration (microsecond) | Load(%)
140 | nbr| min| avg| max|
141 ---------------------------|------|--------------------------------|----------
142 synth_write_* ------------>| 20| 102.00| 109.46| 278.00| 4.105
143 synth_one_block ---------->| 20| 99.00| 106.91| 265.00| 4.009
144 synth_one_block:clear ---->| 20| 1.00| 1.67| 22.00| 0.062
145 synth_one_block:one voice->| 1| 11.00| 12.30| 54.00| 0.461
146 synth_one_block:all voices>| 20| 97.00| 103.94| 251.00| 3.898
147 synth_one_block:reverb --->| no profiling available
148 synth_one_block:chorus --->| no profiling available
149 voice:note --------------->| no profiling available
150 voice:release ------------>| no profiling available
151 ------------------------------------------------------------------------------
152 Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices
153 ------------------------------------------------------------------------------
154 nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices
155 -------|---------|---------|----------|---------|---------|-------------------
156 20| 4.105| 4.105| 0.000| 0.000| 0.195| 513
157
158Profiling time(mm:ss): Total=0:30 Remainder=0:10, press <ENTER> to cancel
159 ------------------------------------------------------------------------------
160 Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond)
161 ------------------------------------------------------------------------------
162 Code under profiling |Voices| Duration (microsecond) | Load(%)
163 | nbr| min| avg| max|
164 ---------------------------|------|--------------------------------|----------
165 synth_write_* ------------>| 20| 102.00| 109.22| 278.00| 4.096
166 synth_one_block ---------->| 20| 99.00| 106.65| 265.00| 3.999
167 synth_one_block:clear ---->| 20| 1.00| 1.67| 22.00| 0.062
168 synth_one_block:one voice->| 1| 11.00| 12.31| 57.00| 0.462
169 synth_one_block:all voices>| 20| 97.00| 103.68| 251.00| 3.888
170 synth_one_block:reverb --->| no profiling available
171 synth_one_block:chorus --->| no profiling available
172 voice:note --------------->| no profiling available
173 voice:release ------------>| no profiling available
174 ------------------------------------------------------------------------------
175 Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices
176 ------------------------------------------------------------------------------
177 nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices
178 -------|---------|---------|----------|---------|---------|-------------------
179 20| 4.096| 4.096| 0.000| 0.000| 0.194| 514
180Stopping 20 voices...voices stopped.
181> quit
182cheers!
183JackTemporaryException : now quits...
184Jack main caught signal 2
185fluid_profiling_print
186fluidsynth: Estimated times: min/avg/max (micro seconds)
187fluidsynth: synth_write_* ------------>: 102.000/109.216/278.000
188fluidsynth: synth_one_block ---------->: 99.000/106.649/265.000
189fluidsynth: synth_one_block:clear ---->: 1.000/1.666/22.000
190fluidsynth: synth_one_block:one voice->: 11.000/12.307/57.000
191fluidsynth: synth_one_block:all voices>: 97.000/103.681/251.000
192
193Upstream-Status: Inappropriate [embedded-specific]
194
195---
196 src/rvoice/fluid_rvoice_mixer.c | 55 ++++++++++++++++++++++++++++++++-
197 1 file changed, 54 insertions(+), 1 deletion(-)
198
199diff --git a/src/rvoice/fluid_rvoice_mixer.c b/src/rvoice/fluid_rvoice_mixer.c
200index af0ef75d..07a357c7 100644
201--- a/src/rvoice/fluid_rvoice_mixer.c
202+++ b/src/rvoice/fluid_rvoice_mixer.c
203@@ -27,6 +27,9 @@
204 #include "fluid_ladspa.h"
205 #include "fluid_synth.h"
206
207+#if defined(__ARM_NEON__)
208+#include "arm_neon.h"
209+#endif
210
211 // If less than x voices, the thread overhead is larger than the gain,
212 // so don't activate the thread(s).
213@@ -1053,9 +1056,15 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src,
214 int i, j;
215 int scount = current_blockcount * FLUID_BUFSIZE;
216 int minbuf;
217+#if defined(__ARM_NEON__) && defined(WITH_FLOAT)
218+ fluid_real_t *FLUID_RESTRICT base_src_left;
219+ fluid_real_t *FLUID_RESTRICT base_src_right;
220+ fluid_real_t *FLUID_RESTRICT base_dst_left;
221+ fluid_real_t *FLUID_RESTRICT base_dst_right;
222+#else
223 fluid_real_t *FLUID_RESTRICT base_src;
224 fluid_real_t *FLUID_RESTRICT base_dst;
225-
226+#endif
227 minbuf = dst->buf_count;
228
229 if(minbuf > src->buf_count)
230@@ -1063,6 +1072,27 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src,
231 minbuf = src->buf_count;
232 }
233
234+#if defined(__ARM_NEON__) && defined(WITH_FLOAT)
235+ base_src_left = fluid_align_ptr(src->left_buf, FLUID_DEFAULT_ALIGNMENT);
236+ base_dst_left = fluid_align_ptr(dst->left_buf, FLUID_DEFAULT_ALIGNMENT);
237+ base_src_right = fluid_align_ptr(src->right_buf, FLUID_DEFAULT_ALIGNMENT);
238+ base_dst_right = fluid_align_ptr(dst->right_buf, FLUID_DEFAULT_ALIGNMENT);
239+
240+ for(i = 0; i < minbuf; i++)
241+ {
242+ for(j = 0; j < scount; j+=4)
243+ {
244+ int dsp_i = i * FLUID_MIXER_MAX_BUFFERS_DEFAULT * FLUID_BUFSIZE + j;
245+
246+ float32x4_t vleft = vld1q_f32(&base_dst_left[dsp_i]);
247+ float32x4_t vright = vld1q_f32(&base_dst_right[dsp_i]);
248+ vleft = vaddq_f32(vleft, vld1q_f32(&base_src_left[dsp_i]));
249+ vright = vaddq_f32(vright, vld1q_f32(&base_src_right[dsp_i]));
250+ vst1q_f32(&base_dst_left[dsp_i], vleft);
251+ vst1q_f32(&base_dst_right[dsp_i], vright);
252+ }
253+ }
254+#else
255 base_src = fluid_align_ptr(src->left_buf, FLUID_DEFAULT_ALIGNMENT);
256 base_dst = fluid_align_ptr(dst->left_buf, FLUID_DEFAULT_ALIGNMENT);
257
258@@ -1090,6 +1120,7 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src,
259 base_dst[dsp_i] += base_src[dsp_i];
260 }
261 }
262+#endif
263
264 minbuf = dst->fx_buf_count;
265
266@@ -1098,6 +1129,27 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src,
267 minbuf = src->fx_buf_count;
268 }
269
270+#if defined(__ARM_NEON__) && defined(WITH_FLOAT)
271+ base_src_left = fluid_align_ptr(src->fx_left_buf, FLUID_DEFAULT_ALIGNMENT);
272+ base_dst_left = fluid_align_ptr(dst->fx_left_buf, FLUID_DEFAULT_ALIGNMENT);
273+ base_src_right = fluid_align_ptr(src->fx_right_buf, FLUID_DEFAULT_ALIGNMENT);
274+ base_dst_right = fluid_align_ptr(dst->fx_right_buf, FLUID_DEFAULT_ALIGNMENT);
275+
276+ for(i = 0; i < minbuf; i++)
277+ {
278+ for(j = 0; j < scount; j+=4)
279+ {
280+ int dsp_i = i * FLUID_MIXER_MAX_BUFFERS_DEFAULT * FLUID_BUFSIZE + j;
281+
282+ float32x4_t vleft = vld1q_f32(&base_dst_left[dsp_i]);
283+ float32x4_t vright = vld1q_f32(&base_dst_right[dsp_i]);
284+ vleft = vaddq_f32(vleft, vld1q_f32(&base_src_left[dsp_i]));
285+ vright = vaddq_f32(vright, vld1q_f32(&base_src_right[dsp_i]));
286+ vst1q_f32(&base_dst_left[dsp_i], vleft);
287+ vst1q_f32(&base_dst_right[dsp_i], vright);
288+ }
289+ }
290+#else
291 base_src = fluid_align_ptr(src->fx_left_buf, FLUID_DEFAULT_ALIGNMENT);
292 base_dst = fluid_align_ptr(dst->fx_left_buf, FLUID_DEFAULT_ALIGNMENT);
293
294@@ -1125,6 +1177,7 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src,
295 base_dst[dsp_i] += base_src[dsp_i];
296 }
297 }
298+#endif
299 }
300
301
302--
3032.20.1
304