Brad Bishop | 1932369 | 2019-04-05 15:28:33 -0400 | [diff] [blame^] | 1 | From 947f79f97a5fa6547d99bff282606026632e010b Mon Sep 17 00:00:00 2001 |
| 2 | From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@gmail.com> |
| 3 | Date: Sat, 13 Oct 2018 23:01:11 +0200 |
| 4 | Subject: [PATCH] Use ARM-NEON accelaration for float-multithreaded setups |
| 5 | |
| 6 | Profiling shows a considerable performance win. See estimated number of voices: |
| 7 | They increase from ~471 to ~513 which makes a ~9% win: |
| 8 | |
| 9 | ******************************************************************************* |
| 10 | WITHOUT ARM NEON: |
| 11 | ******************************************************************************* |
| 12 | |
| 13 | morona@raspberrypi3:~$ fluidsynth -o synth.cpu-cores=4 -o synth.chorus.active=0 -o synth.reverb.active=0 /usr/share/sf2/fluidr3gm.sf2 |
| 14 | > prof_set_print 1 |
| 15 | > prof_set_notes 10 |
| 16 | > prof_start 3 10000 |
| 17 | Generating 10 notes, generated voices:20 |
| 18 | Number of measures(n_prof):3, duration of one mesure(dur):10000ms |
| 19 | |
| 20 | Profiling time(mm:ss): Total=0:30 Remainder=0:30, press <ENTER> to cancel |
| 21 | ------------------------------------------------------------------------------ |
| 22 | Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) |
| 23 | ------------------------------------------------------------------------------ |
| 24 | Code under profiling |Voices| Duration (microsecond) | Load(%) |
| 25 | | nbr| min| avg| max| |
| 26 | ---------------------------|------|--------------------------------|---------- |
| 27 | synth_write_* ------------>| 20| 112.00| 118.96| 206.00| 4.461 |
| 28 | synth_one_block ---------->| 20| 109.00| 116.44| 196.00| 4.367 |
| 29 | synth_one_block:clear ---->| 20| 1.00| 1.67| 18.00| 0.063 |
| 30 | synth_one_block:one voice->| 1| 11.00| 12.36| 58.00| 0.463 |
| 31 | synth_one_block:all voices>| 20| 107.00| 113.47| 187.00| 4.255 |
| 32 | synth_one_block:reverb --->| no profiling available |
| 33 | synth_one_block:chorus --->| no profiling available |
| 34 | voice:note --------------->| no profiling available |
| 35 | voice:release ------------>| no profiling available |
| 36 | ------------------------------------------------------------------------------ |
| 37 | Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices |
| 38 | ------------------------------------------------------------------------------ |
| 39 | nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices |
| 40 | -------|---------|---------|----------|---------|---------|------------------- |
| 41 | 20| 4.461| 4.461| 0.000| 0.000| 0.213| 470 |
| 42 | |
| 43 | Profiling time(mm:ss): Total=0:30 Remainder=0:20, press <ENTER> to cancel |
| 44 | ------------------------------------------------------------------------------ |
| 45 | Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) |
| 46 | ------------------------------------------------------------------------------ |
| 47 | Code under profiling |Voices| Duration (microsecond) | Load(%) |
| 48 | | nbr| min| avg| max| |
| 49 | ---------------------------|------|--------------------------------|---------- |
| 50 | synth_write_* ------------>| 20| 112.00| 118.42| 216.00| 4.441 |
| 51 | synth_one_block ---------->| 20| 109.00| 115.91| 205.00| 4.347 |
| 52 | synth_one_block:clear ---->| 20| 1.00| 1.65| 18.00| 0.062 |
| 53 | synth_one_block:one voice->| 1| 11.00| 12.30| 58.00| 0.461 |
| 54 | synth_one_block:all voices>| 20| 107.00| 112.98| 197.00| 4.237 |
| 55 | synth_one_block:reverb --->| no profiling available |
| 56 | synth_one_block:chorus --->| no profiling available |
| 57 | voice:note --------------->| no profiling available |
| 58 | voice:release ------------>| no profiling available |
| 59 | ------------------------------------------------------------------------------ |
| 60 | Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices |
| 61 | ------------------------------------------------------------------------------ |
| 62 | nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices |
| 63 | -------|---------|---------|----------|---------|---------|------------------- |
| 64 | 20| 4.441| 4.441| 0.000| 0.000| 0.212| 472 |
| 65 | |
| 66 | Profiling time(mm:ss): Total=0:30 Remainder=0:10, press <ENTER> to cancel |
| 67 | ------------------------------------------------------------------------------ |
| 68 | Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) |
| 69 | ------------------------------------------------------------------------------ |
| 70 | Code under profiling |Voices| Duration (microsecond) | Load(%) |
| 71 | | nbr| min| avg| max| |
| 72 | ---------------------------|------|--------------------------------|---------- |
| 73 | synth_write_* ------------>| 20| 112.00| 118.64| 244.00| 4.449 |
| 74 | synth_one_block ---------->| 20| 109.00| 116.12| 234.00| 4.355 |
| 75 | synth_one_block:clear ---->| 20| 1.00| 1.67| 37.00| 0.062 |
| 76 | synth_one_block:one voice->| 1| 11.00| 12.31| 63.00| 0.462 |
| 77 | synth_one_block:all voices>| 20| 107.00| 113.18| 214.00| 4.244 |
| 78 | synth_one_block:reverb --->| no profiling available |
| 79 | synth_one_block:chorus --->| no profiling available |
| 80 | voice:note --------------->| no profiling available |
| 81 | voice:release ------------>| no profiling available |
| 82 | ------------------------------------------------------------------------------ |
| 83 | Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices |
| 84 | ------------------------------------------------------------------------------ |
| 85 | nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices |
| 86 | -------|---------|---------|----------|---------|---------|------------------- |
| 87 | 20| 4.449| 4.449| 0.000| 0.000| 0.212| 471 |
| 88 | Stopping 20 voices...voices stopped. |
| 89 | > quit |
| 90 | cheers! |
| 91 | JackTemporaryException : now quits... |
| 92 | Jack main caught signal 2 |
| 93 | fluid_profiling_print |
| 94 | fluidsynth: Estimated times: min/avg/max (micro seconds) |
| 95 | fluidsynth: synth_write_* ------------>: 112.000/118.636/244.000 |
| 96 | fluidsynth: synth_one_block ---------->: 109.000/116.124/234.000 |
| 97 | fluidsynth: synth_one_block:clear ---->: 1.000/1.665/37.000 |
| 98 | fluidsynth: synth_one_block:one voice->: 11.000/12.309/63.000 |
| 99 | fluidsynth: synth_one_block:all voices>: 107.000/113.180/214.000 |
| 100 | |
| 101 | ******************************************************************************* |
| 102 | WITH ARM NEON: |
| 103 | ******************************************************************************* |
| 104 | |
| 105 | morona@raspberrypi3:~$ fluidsynth -o synth.cpu-cores=4 -o synth.chorus.active=0 -o synth.reverb.active=0 /usr/share/sf2/fluidr3gm.sf2 |
| 106 | > prof_set_print 1 |
| 107 | > prof_set_notes 10 |
| 108 | > prof_start 3 10000 |
| 109 | Generating 10 notes, generated voices:20 |
| 110 | Number of measures(n_prof):3, duration of one mesure(dur):10000ms |
| 111 | |
| 112 | Profiling time(mm:ss): Total=0:30 Remainder=0:30, press <ENTER> to cancel |
| 113 | ------------------------------------------------------------------------------ |
| 114 | Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) |
| 115 | ------------------------------------------------------------------------------ |
| 116 | Code under profiling |Voices| Duration (microsecond) | Load(%) |
| 117 | | nbr| min| avg| max| |
| 118 | ---------------------------|------|--------------------------------|---------- |
| 119 | synth_write_* ------------>| 20| 102.00| 109.21| 213.00| 4.095 |
| 120 | synth_one_block ---------->| 20| 99.00| 106.68| 201.00| 4.001 |
| 121 | synth_one_block:clear ---->| 20| 1.00| 1.64| 18.00| 0.062 |
| 122 | synth_one_block:one voice->| 1| 11.00| 12.30| 54.00| 0.461 |
| 123 | synth_one_block:all voices>| 20| 97.00| 103.71| 188.00| 3.889 |
| 124 | synth_one_block:reverb --->| no profiling available |
| 125 | synth_one_block:chorus --->| no profiling available |
| 126 | voice:note --------------->| no profiling available |
| 127 | voice:release ------------>| no profiling available |
| 128 | ------------------------------------------------------------------------------ |
| 129 | Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices |
| 130 | ------------------------------------------------------------------------------ |
| 131 | nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices |
| 132 | -------|---------|---------|----------|---------|---------|------------------- |
| 133 | 20| 4.095| 4.095| 0.000| 0.000| 0.194| 514 |
| 134 | |
| 135 | Profiling time(mm:ss): Total=0:30 Remainder=0:20, press <ENTER> to cancel |
| 136 | ------------------------------------------------------------------------------ |
| 137 | Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) |
| 138 | ------------------------------------------------------------------------------ |
| 139 | Code under profiling |Voices| Duration (microsecond) | Load(%) |
| 140 | | nbr| min| avg| max| |
| 141 | ---------------------------|------|--------------------------------|---------- |
| 142 | synth_write_* ------------>| 20| 102.00| 109.46| 278.00| 4.105 |
| 143 | synth_one_block ---------->| 20| 99.00| 106.91| 265.00| 4.009 |
| 144 | synth_one_block:clear ---->| 20| 1.00| 1.67| 22.00| 0.062 |
| 145 | synth_one_block:one voice->| 1| 11.00| 12.30| 54.00| 0.461 |
| 146 | synth_one_block:all voices>| 20| 97.00| 103.94| 251.00| 3.898 |
| 147 | synth_one_block:reverb --->| no profiling available |
| 148 | synth_one_block:chorus --->| no profiling available |
| 149 | voice:note --------------->| no profiling available |
| 150 | voice:release ------------>| no profiling available |
| 151 | ------------------------------------------------------------------------------ |
| 152 | Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices |
| 153 | ------------------------------------------------------------------------------ |
| 154 | nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices |
| 155 | -------|---------|---------|----------|---------|---------|------------------- |
| 156 | 20| 4.105| 4.105| 0.000| 0.000| 0.195| 513 |
| 157 | |
| 158 | Profiling time(mm:ss): Total=0:30 Remainder=0:10, press <ENTER> to cancel |
| 159 | ------------------------------------------------------------------------------ |
| 160 | Duration(microsecond) and cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) |
| 161 | ------------------------------------------------------------------------------ |
| 162 | Code under profiling |Voices| Duration (microsecond) | Load(%) |
| 163 | | nbr| min| avg| max| |
| 164 | ---------------------------|------|--------------------------------|---------- |
| 165 | synth_write_* ------------>| 20| 102.00| 109.22| 278.00| 4.096 |
| 166 | synth_one_block ---------->| 20| 99.00| 106.65| 265.00| 3.999 |
| 167 | synth_one_block:clear ---->| 20| 1.00| 1.67| 22.00| 0.062 |
| 168 | synth_one_block:one voice->| 1| 11.00| 12.31| 57.00| 0.462 |
| 169 | synth_one_block:all voices>| 20| 97.00| 103.68| 251.00| 3.888 |
| 170 | synth_one_block:reverb --->| no profiling available |
| 171 | synth_one_block:chorus --->| no profiling available |
| 172 | voice:note --------------->| no profiling available |
| 173 | voice:release ------------>| no profiling available |
| 174 | ------------------------------------------------------------------------------ |
| 175 | Cpu loads(%) (sr: 48000 Hz, sp: 20.83 microsecond) and maximum voices |
| 176 | ------------------------------------------------------------------------------ |
| 177 | nVoices| total(%)|voices(%)| reverb(%)|chorus(%)| voice(%)|estimated maxVoices |
| 178 | -------|---------|---------|----------|---------|---------|------------------- |
| 179 | 20| 4.096| 4.096| 0.000| 0.000| 0.194| 514 |
| 180 | Stopping 20 voices...voices stopped. |
| 181 | > quit |
| 182 | cheers! |
| 183 | JackTemporaryException : now quits... |
| 184 | Jack main caught signal 2 |
| 185 | fluid_profiling_print |
| 186 | fluidsynth: Estimated times: min/avg/max (micro seconds) |
| 187 | fluidsynth: synth_write_* ------------>: 102.000/109.216/278.000 |
| 188 | fluidsynth: synth_one_block ---------->: 99.000/106.649/265.000 |
| 189 | fluidsynth: synth_one_block:clear ---->: 1.000/1.666/22.000 |
| 190 | fluidsynth: synth_one_block:one voice->: 11.000/12.307/57.000 |
| 191 | fluidsynth: synth_one_block:all voices>: 97.000/103.681/251.000 |
| 192 | |
| 193 | Upstream-Status: Inappropriate [embedded-specific] |
| 194 | |
| 195 | --- |
| 196 | src/rvoice/fluid_rvoice_mixer.c | 55 ++++++++++++++++++++++++++++++++- |
| 197 | 1 file changed, 54 insertions(+), 1 deletion(-) |
| 198 | |
| 199 | diff --git a/src/rvoice/fluid_rvoice_mixer.c b/src/rvoice/fluid_rvoice_mixer.c |
| 200 | index af0ef75d..07a357c7 100644 |
| 201 | --- a/src/rvoice/fluid_rvoice_mixer.c |
| 202 | +++ b/src/rvoice/fluid_rvoice_mixer.c |
| 203 | @@ -27,6 +27,9 @@ |
| 204 | #include "fluid_ladspa.h" |
| 205 | #include "fluid_synth.h" |
| 206 | |
| 207 | +#if defined(__ARM_NEON__) |
| 208 | +#include "arm_neon.h" |
| 209 | +#endif |
| 210 | |
| 211 | // If less than x voices, the thread overhead is larger than the gain, |
| 212 | // so don't activate the thread(s). |
| 213 | @@ -1053,9 +1056,15 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src, |
| 214 | int i, j; |
| 215 | int scount = current_blockcount * FLUID_BUFSIZE; |
| 216 | int minbuf; |
| 217 | +#if defined(__ARM_NEON__) && defined(WITH_FLOAT) |
| 218 | + fluid_real_t *FLUID_RESTRICT base_src_left; |
| 219 | + fluid_real_t *FLUID_RESTRICT base_src_right; |
| 220 | + fluid_real_t *FLUID_RESTRICT base_dst_left; |
| 221 | + fluid_real_t *FLUID_RESTRICT base_dst_right; |
| 222 | +#else |
| 223 | fluid_real_t *FLUID_RESTRICT base_src; |
| 224 | fluid_real_t *FLUID_RESTRICT base_dst; |
| 225 | - |
| 226 | +#endif |
| 227 | minbuf = dst->buf_count; |
| 228 | |
| 229 | if(minbuf > src->buf_count) |
| 230 | @@ -1063,6 +1072,27 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src, |
| 231 | minbuf = src->buf_count; |
| 232 | } |
| 233 | |
| 234 | +#if defined(__ARM_NEON__) && defined(WITH_FLOAT) |
| 235 | + base_src_left = fluid_align_ptr(src->left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 236 | + base_dst_left = fluid_align_ptr(dst->left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 237 | + base_src_right = fluid_align_ptr(src->right_buf, FLUID_DEFAULT_ALIGNMENT); |
| 238 | + base_dst_right = fluid_align_ptr(dst->right_buf, FLUID_DEFAULT_ALIGNMENT); |
| 239 | + |
| 240 | + for(i = 0; i < minbuf; i++) |
| 241 | + { |
| 242 | + for(j = 0; j < scount; j+=4) |
| 243 | + { |
| 244 | + int dsp_i = i * FLUID_MIXER_MAX_BUFFERS_DEFAULT * FLUID_BUFSIZE + j; |
| 245 | + |
| 246 | + float32x4_t vleft = vld1q_f32(&base_dst_left[dsp_i]); |
| 247 | + float32x4_t vright = vld1q_f32(&base_dst_right[dsp_i]); |
| 248 | + vleft = vaddq_f32(vleft, vld1q_f32(&base_src_left[dsp_i])); |
| 249 | + vright = vaddq_f32(vright, vld1q_f32(&base_src_right[dsp_i])); |
| 250 | + vst1q_f32(&base_dst_left[dsp_i], vleft); |
| 251 | + vst1q_f32(&base_dst_right[dsp_i], vright); |
| 252 | + } |
| 253 | + } |
| 254 | +#else |
| 255 | base_src = fluid_align_ptr(src->left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 256 | base_dst = fluid_align_ptr(dst->left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 257 | |
| 258 | @@ -1090,6 +1120,7 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src, |
| 259 | base_dst[dsp_i] += base_src[dsp_i]; |
| 260 | } |
| 261 | } |
| 262 | +#endif |
| 263 | |
| 264 | minbuf = dst->fx_buf_count; |
| 265 | |
| 266 | @@ -1098,6 +1129,27 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src, |
| 267 | minbuf = src->fx_buf_count; |
| 268 | } |
| 269 | |
| 270 | +#if defined(__ARM_NEON__) && defined(WITH_FLOAT) |
| 271 | + base_src_left = fluid_align_ptr(src->fx_left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 272 | + base_dst_left = fluid_align_ptr(dst->fx_left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 273 | + base_src_right = fluid_align_ptr(src->fx_right_buf, FLUID_DEFAULT_ALIGNMENT); |
| 274 | + base_dst_right = fluid_align_ptr(dst->fx_right_buf, FLUID_DEFAULT_ALIGNMENT); |
| 275 | + |
| 276 | + for(i = 0; i < minbuf; i++) |
| 277 | + { |
| 278 | + for(j = 0; j < scount; j+=4) |
| 279 | + { |
| 280 | + int dsp_i = i * FLUID_MIXER_MAX_BUFFERS_DEFAULT * FLUID_BUFSIZE + j; |
| 281 | + |
| 282 | + float32x4_t vleft = vld1q_f32(&base_dst_left[dsp_i]); |
| 283 | + float32x4_t vright = vld1q_f32(&base_dst_right[dsp_i]); |
| 284 | + vleft = vaddq_f32(vleft, vld1q_f32(&base_src_left[dsp_i])); |
| 285 | + vright = vaddq_f32(vright, vld1q_f32(&base_src_right[dsp_i])); |
| 286 | + vst1q_f32(&base_dst_left[dsp_i], vleft); |
| 287 | + vst1q_f32(&base_dst_right[dsp_i], vright); |
| 288 | + } |
| 289 | + } |
| 290 | +#else |
| 291 | base_src = fluid_align_ptr(src->fx_left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 292 | base_dst = fluid_align_ptr(dst->fx_left_buf, FLUID_DEFAULT_ALIGNMENT); |
| 293 | |
| 294 | @@ -1125,6 +1177,7 @@ fluid_mixer_buffers_mix(fluid_mixer_buffers_t *dst, fluid_mixer_buffers_t *src, |
| 295 | base_dst[dsp_i] += base_src[dsp_i]; |
| 296 | } |
| 297 | } |
| 298 | +#endif |
| 299 | } |
| 300 | |
| 301 | |
| 302 | -- |
| 303 | 2.20.1 |
| 304 | |