diff options
| author | Andreas Müller <schnitzeltony@googlemail.com> | 2017-01-25 21:06:02 +0100 | 
|---|---|---|
| committer | Martin Jansa <Martin.Jansa@gmail.com> | 2017-02-13 18:43:23 +0100 | 
| commit | d307c4f59deb22cc8dfecb88720b5162f39d895c (patch) | |
| tree | 41bcc9e75f11dbf6133fd65d2c091fd76a51eaee | |
| parent | 0db057da47354a7a1184ebf1d0c3c6d0ecdc89aa (diff) | |
| download | meta-openembedded-d307c4f59deb22cc8dfecb88720b5162f39d895c.tar.gz | |
jack: add ARM NEON support for sample conversions
Add a test application checking accurracy and performance win of accelerated
code.
Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
3 files changed, 934 insertions, 1 deletions
| diff --git a/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch b/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch new file mode 100644 index 0000000000..76ec7136b3 --- /dev/null +++ b/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch | |||
| @@ -0,0 +1,496 @@ | |||
| 1 | From 99785aabc685a94415fcd445345c093488e10350 Mon Sep 17 00:00:00 2001 | ||
| 2 | From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com> | ||
| 3 | Date: Fri, 13 Jan 2017 22:42:11 +0100 | ||
| 4 | Subject: [PATCH 1/2] Add ARM-NEON acceleration for all non-dithering sample | ||
| 5 | conversion functions | ||
| 6 | MIME-Version: 1.0 | ||
| 7 | Content-Type: text/plain; charset=UTF-8 | ||
| 8 | Content-Transfer-Encoding: 8bit | ||
| 9 | |||
| 10 | Upstream-Status: Submitted [1] | ||
| 11 | |||
| 12 | [1] https://github.com/jackaudio/jack2/pull/250 | ||
| 13 | |||
| 14 | Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com> | ||
| 15 | --- | ||
| 16 | common/memops.c | 356 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- | ||
| 17 | 1 file changed, 351 insertions(+), 5 deletions(-) | ||
| 18 | |||
| 19 | diff --git a/common/memops.c b/common/memops.c | ||
| 20 | index 2ff0792..8f9ece2 100644 | ||
| 21 | --- a/common/memops.c | ||
| 22 | +++ b/common/memops.c | ||
| 23 | @@ -42,6 +42,10 @@ | ||
| 24 | #endif | ||
| 25 | #endif | ||
| 26 | |||
| 27 | +#ifdef __ARM_NEON__ | ||
| 28 | +#include <arm_neon.h> | ||
| 29 | +#endif | ||
| 30 | + | ||
| 31 | /* Notes about these *_SCALING values. | ||
| 32 | |||
| 33 | the MAX_<N>BIT values are floating point. when multiplied by | ||
| 34 | @@ -193,6 +197,35 @@ static inline __m128i float_24_sse(__m128 s) | ||
| 35 | } | ||
| 36 | #endif | ||
| 37 | |||
| 38 | + | ||
| 39 | +#ifdef __ARM_NEON__ | ||
| 40 | + | ||
| 41 | +static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max) | ||
| 42 | +{ | ||
| 43 | + return vminq_f32(max, vmaxq_f32(s, min)); | ||
| 44 | +} | ||
| 45 | + | ||
| 46 | +static inline int32x4_t float_24_neon(float32x4_t s) | ||
| 47 | +{ | ||
| 48 | + const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX); | ||
| 49 | + const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN); | ||
| 50 | + | ||
| 51 | + float32x4_t clipped = clip(s, lower_bound, upper_bound); | ||
| 52 | + float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING)); | ||
| 53 | + return vcvtq_s32_f32(scaled); | ||
| 54 | +} | ||
| 55 | + | ||
| 56 | +static inline int16x4_t float_16_neon(float32x4_t s) | ||
| 57 | +{ | ||
| 58 | + const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX); | ||
| 59 | + const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN); | ||
| 60 | + | ||
| 61 | + float32x4_t clipped = clip(s, lower_bound, upper_bound); | ||
| 62 | + float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING)); | ||
| 63 | + return vmovn_s32(vcvtq_s32_f32(scaled)); | ||
| 64 | +} | ||
| 65 | +#endif | ||
| 66 | + | ||
| 67 | /* Linear Congruential noise generator. From the music-dsp list | ||
| 68 | * less random than rand(), but good enough and 10x faster | ||
| 69 | */ | ||
| 70 | @@ -248,6 +281,32 @@ void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsign | ||
| 71 | |||
| 72 | void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) | ||
| 73 | { | ||
| 74 | +#ifdef __ARM_NEON__ | ||
| 75 | + unsigned long unrolled = nsamples / 4; | ||
| 76 | + nsamples = nsamples & 3; | ||
| 77 | + | ||
| 78 | + while (unrolled--) { | ||
| 79 | + float32x4_t samples = vld1q_f32(src); | ||
| 80 | + int32x4_t converted = float_24_neon(samples); | ||
| 81 | + int32x4_t shifted = vshlq_n_s32(converted, 8); | ||
| 82 | + shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted))); | ||
| 83 | + | ||
| 84 | + switch(dst_skip) { | ||
| 85 | + case 4: | ||
| 86 | + vst1q_s32((int32_t*)dst, shifted); | ||
| 87 | + break; | ||
| 88 | + default: | ||
| 89 | + vst1q_lane_s32((int32_t*)(dst), shifted, 0); | ||
| 90 | + vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1); | ||
| 91 | + vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2); | ||
| 92 | + vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3); | ||
| 93 | + break; | ||
| 94 | + } | ||
| 95 | + dst += 4*dst_skip; | ||
| 96 | + src+= 4; | ||
| 97 | + } | ||
| 98 | +#endif | ||
| 99 | + | ||
| 100 | int32_t z; | ||
| 101 | |||
| 102 | while (nsamples--) { | ||
| 103 | @@ -321,7 +380,33 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne | ||
| 104 | src++; | ||
| 105 | } | ||
| 106 | |||
| 107 | -#else | ||
| 108 | +#elif defined(__ARM_NEON__) | ||
| 109 | + unsigned long unrolled = nsamples / 4; | ||
| 110 | + nsamples = nsamples & 3; | ||
| 111 | + | ||
| 112 | + while (unrolled--) { | ||
| 113 | + float32x4_t samples = vld1q_f32(src); | ||
| 114 | + int32x4_t converted = float_24_neon(samples); | ||
| 115 | + int32x4_t shifted = vshlq_n_s32(converted, 8); | ||
| 116 | + | ||
| 117 | + switch(dst_skip) { | ||
| 118 | + case 4: | ||
| 119 | + vst1q_s32((int32_t*)dst, shifted); | ||
| 120 | + break; | ||
| 121 | + default: | ||
| 122 | + vst1q_lane_s32((int32_t*)(dst), shifted, 0); | ||
| 123 | + vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1); | ||
| 124 | + vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2); | ||
| 125 | + vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3); | ||
| 126 | + break; | ||
| 127 | + } | ||
| 128 | + dst += 4*dst_skip; | ||
| 129 | + | ||
| 130 | + src+= 4; | ||
| 131 | + } | ||
| 132 | +#endif | ||
| 133 | + | ||
| 134 | +#if !defined (__SSE2__) | ||
| 135 | while (nsamples--) { | ||
| 136 | float_24u32 (*src, *((int32_t*) dst)); | ||
| 137 | dst += dst_skip; | ||
| 138 | @@ -332,6 +417,38 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne | ||
| 139 | |||
| 140 | void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) | ||
| 141 | { | ||
| 142 | +#ifdef __ARM_NEON__ | ||
| 143 | + float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING); | ||
| 144 | + unsigned long unrolled = nsamples / 4; | ||
| 145 | + while (unrolled--) { | ||
| 146 | + int32x4_t src128; | ||
| 147 | + switch(src_skip) | ||
| 148 | + { | ||
| 149 | + case 4: | ||
| 150 | + src128 = vld1q_s32((int32_t*)src); | ||
| 151 | + break; | ||
| 152 | + case 8: | ||
| 153 | + src128 = vld2q_s32((int32_t*)src).val[0]; | ||
| 154 | + break; | ||
| 155 | + default: | ||
| 156 | + src128 = vld1q_lane_s32((int32_t*)src, src128, 0); | ||
| 157 | + src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1); | ||
| 158 | + src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2); | ||
| 159 | + src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3); | ||
| 160 | + break; | ||
| 161 | + } | ||
| 162 | + src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128))); | ||
| 163 | + int32x4_t shifted = vshrq_n_s32(src128, 8); | ||
| 164 | + float32x4_t as_float = vcvtq_f32_s32(shifted); | ||
| 165 | + float32x4_t divided = vmulq_f32(as_float, factor); | ||
| 166 | + vst1q_f32(dst, divided); | ||
| 167 | + | ||
| 168 | + src += 4*src_skip; | ||
| 169 | + dst += 4; | ||
| 170 | + } | ||
| 171 | + nsamples = nsamples & 3; | ||
| 172 | +#endif | ||
| 173 | + | ||
| 174 | /* ALERT: signed sign-extension portability !!! */ | ||
| 175 | |||
| 176 | const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING; | ||
| 177 | @@ -389,6 +506,34 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne | ||
| 178 | dst += 4; | ||
| 179 | } | ||
| 180 | nsamples = nsamples & 3; | ||
| 181 | +#elif defined(__ARM_NEON__) | ||
| 182 | + unsigned long unrolled = nsamples / 4; | ||
| 183 | + float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING); | ||
| 184 | + while (unrolled--) { | ||
| 185 | + int32x4_t src128; | ||
| 186 | + switch(src_skip) { | ||
| 187 | + case 4: | ||
| 188 | + src128 = vld1q_s32((int32_t*)src); | ||
| 189 | + break; | ||
| 190 | + case 8: | ||
| 191 | + src128 = vld2q_s32((int32_t*)src).val[0]; | ||
| 192 | + break; | ||
| 193 | + default: | ||
| 194 | + src128 = vld1q_lane_s32((int32_t*)src, src128, 0); | ||
| 195 | + src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1); | ||
| 196 | + src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2); | ||
| 197 | + src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3); | ||
| 198 | + break; | ||
| 199 | + } | ||
| 200 | + int32x4_t shifted = vshrq_n_s32(src128, 8); | ||
| 201 | + float32x4_t as_float = vcvtq_f32_s32(shifted); | ||
| 202 | + float32x4_t divided = vmulq_f32(as_float, factor); | ||
| 203 | + vst1q_f32(dst, divided); | ||
| 204 | + | ||
| 205 | + src += 4*src_skip; | ||
| 206 | + dst += 4; | ||
| 207 | + } | ||
| 208 | + nsamples = nsamples & 3; | ||
| 209 | #endif | ||
| 210 | |||
| 211 | /* ALERT: signed sign-extension portability !!! */ | ||
| 212 | @@ -403,6 +548,24 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne | ||
| 213 | |||
| 214 | void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) | ||
| 215 | { | ||
| 216 | +#ifdef __ARM_NEON__ | ||
| 217 | + unsigned long unrolled = nsamples / 4; | ||
| 218 | + while (unrolled--) { | ||
| 219 | + int32_t z[4]; | ||
| 220 | + float32x4_t samples = vld1q_f32(src); | ||
| 221 | + int32x4_t converted = float_24_neon(samples); | ||
| 222 | + converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted))); | ||
| 223 | + vst1q_s32(z, converted); | ||
| 224 | + | ||
| 225 | + for (int i = 0; i != 4; ++i) { | ||
| 226 | + memcpy (dst, ((char*)(z+i))+1, 3); | ||
| 227 | + dst += dst_skip; | ||
| 228 | + } | ||
| 229 | + src += 4; | ||
| 230 | + } | ||
| 231 | + nsamples = nsamples & 3; | ||
| 232 | +#endif | ||
| 233 | + | ||
| 234 | int32_t z; | ||
| 235 | |||
| 236 | while (nsamples--) { | ||
| 237 | @@ -426,7 +589,6 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l | ||
| 238 | #if defined (__SSE2__) && !defined (__sun__) | ||
| 239 | _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); | ||
| 240 | while (nsamples >= 4) { | ||
| 241 | - int i; | ||
| 242 | int32_t z[4]; | ||
| 243 | __m128 samples = _mm_loadu_ps(src); | ||
| 244 | __m128i converted = float_24_sse(samples); | ||
| 245 | @@ -447,7 +609,7 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l | ||
| 246 | _mm_store_ss((float*)z+3, (__m128)shuffled3); | ||
| 247 | #endif | ||
| 248 | |||
| 249 | - for (i = 0; i != 4; ++i) { | ||
| 250 | + for (int i = 0; i != 4; ++i) { | ||
| 251 | memcpy (dst, z+i, 3); | ||
| 252 | dst += dst_skip; | ||
| 253 | } | ||
| 254 | @@ -455,6 +617,22 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l | ||
| 255 | nsamples -= 4; | ||
| 256 | src += 4; | ||
| 257 | } | ||
| 258 | +#elif defined(__ARM_NEON__) | ||
| 259 | + unsigned long unrolled = nsamples / 4; | ||
| 260 | + while (unrolled--) { | ||
| 261 | + int i; | ||
| 262 | + int32_t z[4]; | ||
| 263 | + float32x4_t samples = vld1q_f32(src); | ||
| 264 | + int32x4_t converted = float_24_neon(samples); | ||
| 265 | + vst1q_s32(z, converted); | ||
| 266 | + | ||
| 267 | + for (i = 0; i != 4; ++i) { | ||
| 268 | + memcpy (dst, z+i, 3); | ||
| 269 | + dst += dst_skip; | ||
| 270 | + } | ||
| 271 | + src += 4; | ||
| 272 | + } | ||
| 273 | + nsamples = nsamples & 3; | ||
| 274 | #endif | ||
| 275 | |||
| 276 | int32_t z; | ||
| 277 | @@ -473,9 +651,41 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l | ||
| 278 | |||
| 279 | void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) | ||
| 280 | { | ||
| 281 | + const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING; | ||
| 282 | + | ||
| 283 | +#ifdef __ARM_NEON__ | ||
| 284 | + // we shift 8 to the right by dividing by 256.0 -> no sign extra handling | ||
| 285 | + const float32x4_t vscaling = vdupq_n_f32(scaling/256.0); | ||
| 286 | + int32_t x[4]; | ||
| 287 | + memset(x, 0, sizeof(x)); | ||
| 288 | + unsigned long unrolled = nsamples / 4; | ||
| 289 | + while (unrolled--) { | ||
| 290 | +#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */ | ||
| 291 | + // right aligned / inverse sequence below -> *256 | ||
| 292 | + memcpy(((char*)&x[0])+1, src, 3); | ||
| 293 | + memcpy(((char*)&x[1])+1, src+src_skip, 3); | ||
| 294 | + memcpy(((char*)&x[2])+1, src+2*src_skip, 3); | ||
| 295 | + memcpy(((char*)&x[3])+1, src+3*src_skip, 3); | ||
| 296 | +#else | ||
| 297 | + memcpy(&x[0], src, 3); | ||
| 298 | + memcpy(&x[1], src+src_skip, 3); | ||
| 299 | + memcpy(&x[2], src+2*src_skip, 3); | ||
| 300 | + memcpy(&x[3], src+3*src_skip, 3); | ||
| 301 | +#endif | ||
| 302 | + src += 4 * src_skip; | ||
| 303 | + | ||
| 304 | + int32x4_t source = vld1q_s32(x); | ||
| 305 | + source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source))); | ||
| 306 | + float32x4_t converted = vcvtq_f32_s32(source); | ||
| 307 | + float32x4_t scaled = vmulq_f32(converted, vscaling); | ||
| 308 | + vst1q_f32(dst, scaled); | ||
| 309 | + dst += 4; | ||
| 310 | + } | ||
| 311 | + nsamples = nsamples & 3; | ||
| 312 | +#endif | ||
| 313 | + | ||
| 314 | /* ALERT: signed sign-extension portability !!! */ | ||
| 315 | |||
| 316 | - const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING; | ||
| 317 | while (nsamples--) { | ||
| 318 | int x; | ||
| 319 | #if __BYTE_ORDER == __LITTLE_ENDIAN | ||
| 320 | @@ -528,6 +738,34 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l | ||
| 321 | dst += 4; | ||
| 322 | nsamples -= 4; | ||
| 323 | } | ||
| 324 | +#elif defined(__ARM_NEON__) | ||
| 325 | + // we shift 8 to the right by dividing by 256.0 -> no sign extra handling | ||
| 326 | + const float32x4_t vscaling = vdupq_n_f32(scaling/256.0); | ||
| 327 | + int32_t x[4]; | ||
| 328 | + memset(x, 0, sizeof(x)); | ||
| 329 | + unsigned long unrolled = nsamples / 4; | ||
| 330 | + while (unrolled--) { | ||
| 331 | +#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */ | ||
| 332 | + // left aligned -> *256 | ||
| 333 | + memcpy(&x[0], src, 3); | ||
| 334 | + memcpy(&x[1], src+src_skip, 3); | ||
| 335 | + memcpy(&x[2], src+2*src_skip, 3); | ||
| 336 | + memcpy(&x[3], src+3*src_skip, 3); | ||
| 337 | +#else | ||
| 338 | + memcpy(((char*)&x[0])+1, src, 3); | ||
| 339 | + memcpy(((char*)&x[1])+1, src+src_skip, 3); | ||
| 340 | + memcpy(((char*)&x[2])+1, src+2*src_skip, 3); | ||
| 341 | + memcpy(((char*)&x[3])+1, src+3*src_skip, 3); | ||
| 342 | +#endif | ||
| 343 | + src += 4 * src_skip; | ||
| 344 | + | ||
| 345 | + int32x4_t source = vld1q_s32(x); | ||
| 346 | + float32x4_t converted = vcvtq_f32_s32(source); | ||
| 347 | + float32x4_t scaled = vmulq_f32(converted, vscaling); | ||
| 348 | + vst1q_f32(dst, scaled); | ||
| 349 | + dst += 4; | ||
| 350 | + } | ||
| 351 | + nsamples = nsamples & 3; | ||
| 352 | #endif | ||
| 353 | |||
| 354 | while (nsamples--) { | ||
| 355 | @@ -547,6 +785,30 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l | ||
| 356 | |||
| 357 | void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) | ||
| 358 | { | ||
| 359 | +#ifdef __ARM_NEON__ | ||
| 360 | + unsigned long unrolled = nsamples / 4; | ||
| 361 | + nsamples = nsamples & 3; | ||
| 362 | + | ||
| 363 | + while (unrolled--) { | ||
| 364 | + float32x4_t samples = vld1q_f32(src); | ||
| 365 | + int16x4_t converted = float_16_neon(samples); | ||
| 366 | + converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted))); | ||
| 367 | + | ||
| 368 | + switch(dst_skip) { | ||
| 369 | + case 2: | ||
| 370 | + vst1_s16((int16_t*)dst, converted); | ||
| 371 | + break; | ||
| 372 | + default: | ||
| 373 | + vst1_lane_s16((int16_t*)(dst), converted, 0); | ||
| 374 | + vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1); | ||
| 375 | + vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2); | ||
| 376 | + vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3); | ||
| 377 | + break; | ||
| 378 | + } | ||
| 379 | + dst += 4*dst_skip; | ||
| 380 | + src+= 4; | ||
| 381 | + } | ||
| 382 | +#endif | ||
| 383 | int16_t tmp; | ||
| 384 | |||
| 385 | while (nsamples--) { | ||
| 386 | @@ -574,6 +836,29 @@ void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned | ||
| 387 | |||
| 388 | void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) | ||
| 389 | { | ||
| 390 | +#ifdef __ARM_NEON__ | ||
| 391 | + unsigned long unrolled = nsamples / 4; | ||
| 392 | + nsamples = nsamples & 3; | ||
| 393 | + | ||
| 394 | + while (unrolled--) { | ||
| 395 | + float32x4_t samples = vld1q_f32(src); | ||
| 396 | + int16x4_t converted = float_16_neon(samples); | ||
| 397 | + | ||
| 398 | + switch(dst_skip) { | ||
| 399 | + case 2: | ||
| 400 | + vst1_s16((int16_t*)dst, converted); | ||
| 401 | + break; | ||
| 402 | + default: | ||
| 403 | + vst1_lane_s16((int16_t*)(dst), converted, 0); | ||
| 404 | + vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1); | ||
| 405 | + vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2); | ||
| 406 | + vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3); | ||
| 407 | + break; | ||
| 408 | + } | ||
| 409 | + dst += 4*dst_skip; | ||
| 410 | + src+= 4; | ||
| 411 | + } | ||
| 412 | +#endif | ||
| 413 | while (nsamples--) { | ||
| 414 | float_16 (*src, *((int16_t*) dst)); | ||
| 415 | dst += dst_skip; | ||
| 416 | @@ -728,8 +1013,39 @@ void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t * | ||
| 417 | |||
| 418 | void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) | ||
| 419 | { | ||
| 420 | - short z; | ||
| 421 | const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING; | ||
| 422 | +#ifdef __ARM_NEON__ | ||
| 423 | + const float32x4_t vscaling = vdupq_n_f32(scaling); | ||
| 424 | + unsigned long unrolled = nsamples / 4; | ||
| 425 | + while (unrolled--) { | ||
| 426 | + int16x4_t source16x4; | ||
| 427 | + switch(src_skip) { | ||
| 428 | + case 2: | ||
| 429 | + source16x4 = vld1_s16((int16_t*)src); | ||
| 430 | + break; | ||
| 431 | + case 4: | ||
| 432 | + source16x4 = vld2_s16((int16_t*)src).val[0]; | ||
| 433 | + break; | ||
| 434 | + default: | ||
| 435 | + source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0); | ||
| 436 | + source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1); | ||
| 437 | + source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2); | ||
| 438 | + source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3); | ||
| 439 | + break; | ||
| 440 | + } | ||
| 441 | + source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4))); | ||
| 442 | + int32x4_t source32x4 = vmovl_s16(source16x4); | ||
| 443 | + src += 4 * src_skip; | ||
| 444 | + | ||
| 445 | + float32x4_t converted = vcvtq_f32_s32(source32x4); | ||
| 446 | + float32x4_t scaled = vmulq_f32(converted, vscaling); | ||
| 447 | + vst1q_f32(dst, scaled); | ||
| 448 | + dst += 4; | ||
| 449 | + } | ||
| 450 | + nsamples = nsamples & 3; | ||
| 451 | +#endif | ||
| 452 | + | ||
| 453 | + short z; | ||
| 454 | |||
| 455 | /* ALERT: signed sign-extension portability !!! */ | ||
| 456 | while (nsamples--) { | ||
| 457 | @@ -752,6 +1068,36 @@ void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned l | ||
| 458 | { | ||
| 459 | /* ALERT: signed sign-extension portability !!! */ | ||
| 460 | const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING; | ||
| 461 | +#ifdef __ARM_NEON__ | ||
| 462 | + const float32x4_t vscaling = vdupq_n_f32(scaling); | ||
| 463 | + unsigned long unrolled = nsamples / 4; | ||
| 464 | + while (unrolled--) { | ||
| 465 | + int16x4_t source16x4; | ||
| 466 | + switch(src_skip) { | ||
| 467 | + case 2: | ||
| 468 | + source16x4 = vld1_s16((int16_t*)src); | ||
| 469 | + break; | ||
| 470 | + case 4: | ||
| 471 | + source16x4 = vld2_s16((int16_t*)src).val[0]; | ||
| 472 | + break; | ||
| 473 | + default: | ||
| 474 | + source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0); | ||
| 475 | + source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1); | ||
| 476 | + source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2); | ||
| 477 | + source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3); | ||
| 478 | + break; | ||
| 479 | + } | ||
| 480 | + int32x4_t source32x4 = vmovl_s16(source16x4); | ||
| 481 | + src += 4 * src_skip; | ||
| 482 | + | ||
| 483 | + float32x4_t converted = vcvtq_f32_s32(source32x4); | ||
| 484 | + float32x4_t scaled = vmulq_f32(converted, vscaling); | ||
| 485 | + vst1q_f32(dst, scaled); | ||
| 486 | + dst += 4; | ||
| 487 | + } | ||
| 488 | + nsamples = nsamples & 3; | ||
| 489 | +#endif | ||
| 490 | + | ||
| 491 | while (nsamples--) { | ||
| 492 | *dst = (*((short *) src)) * scaling; | ||
| 493 | dst++; | ||
| 494 | -- | ||
| 495 | 2.5.5 | ||
| 496 | |||
| diff --git a/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch b/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch new file mode 100644 index 0000000000..e0c9e8ca87 --- /dev/null +++ b/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch | |||
| @@ -0,0 +1,433 @@ | |||
| 1 | From d0543c0628d2c0a6d898c694003e941fa189b393 Mon Sep 17 00:00:00 2001 | ||
| 2 | From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com> | ||
| 3 | Date: Sun, 15 Jan 2017 20:52:20 +0100 | ||
| 4 | Subject: [PATCH 2/2] jack_simdtests: add application checking accurracy and | ||
| 5 | performance of SIMD optimizations | ||
| 6 | MIME-Version: 1.0 | ||
| 7 | Content-Type: text/plain; charset=UTF-8 | ||
| 8 | Content-Transfer-Encoding: 8bit | ||
| 9 | |||
| 10 | Upstream-Status: Submitted [1] | ||
| 11 | |||
| 12 | [1] https://github.com/jackaudio/jack2/pull/250 | ||
| 13 | |||
| 14 | Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com> | ||
| 15 | --- | ||
| 16 | example-clients/simdtests.cpp | 390 ++++++++++++++++++++++++++++++++++++++++++ | ||
| 17 | example-clients/wscript | 3 +- | ||
| 18 | 2 files changed, 392 insertions(+), 1 deletion(-) | ||
| 19 | create mode 100644 example-clients/simdtests.cpp | ||
| 20 | |||
| 21 | diff --git a/example-clients/simdtests.cpp b/example-clients/simdtests.cpp | ||
| 22 | new file mode 100644 | ||
| 23 | index 0000000..b74d50a | ||
| 24 | --- /dev/null | ||
| 25 | +++ b/example-clients/simdtests.cpp | ||
| 26 | @@ -0,0 +1,390 @@ | ||
| 27 | +/* | ||
| 28 | + * simdtests.c -- test accuraccy and performance of simd optimizations | ||
| 29 | + * | ||
| 30 | + * Copyright (C) 2017 Andreas Mueller. | ||
| 31 | + * | ||
| 32 | + * This program is free software; you can redistribute it and/or modify | ||
| 33 | + * it under the terms of the GNU General Public License as published by | ||
| 34 | + * the Free Software Foundation; either version 2 of the License, or | ||
| 35 | + * (at your option) any later version. | ||
| 36 | + * | ||
| 37 | + * This program is distributed in the hope that it will be useful, | ||
| 38 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 39 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 40 | + * GNU General Public License for more details. | ||
| 41 | + * | ||
| 42 | + * You should have received a copy of the GNU General Public License | ||
| 43 | + * along with this program; if not, write to the Free Software | ||
| 44 | + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 45 | + */ | ||
| 46 | + | ||
| 47 | +/* We must include all headers memops.c includes to avoid trouble with | ||
| 48 | + * out namespace game below. | ||
| 49 | + */ | ||
| 50 | +#include <stdio.h> | ||
| 51 | +#include <string.h> | ||
| 52 | +#include <math.h> | ||
| 53 | +#include <memory.h> | ||
| 54 | +#include <stdlib.h> | ||
| 55 | +#include <stdint.h> | ||
| 56 | +#include <limits.h> | ||
| 57 | +#ifdef __linux__ | ||
| 58 | +#include <endian.h> | ||
| 59 | +#endif | ||
| 60 | +#include "memops.h" | ||
| 61 | + | ||
| 62 | +#if defined (__SSE2__) && !defined (__sun__) | ||
| 63 | +#include <emmintrin.h> | ||
| 64 | +#ifdef __SSE4_1__ | ||
| 65 | +#include <smmintrin.h> | ||
| 66 | +#endif | ||
| 67 | +#endif | ||
| 68 | + | ||
| 69 | +#ifdef __ARM_NEON__ | ||
| 70 | +#include <arm_neon.h> | ||
| 71 | +#endif | ||
| 72 | + | ||
| 73 | +// our additional headers | ||
| 74 | +#include <time.h> | ||
| 75 | + | ||
| 76 | +/* Dirty: include mempos.c twice the second time with SIMD disabled | ||
| 77 | + * so we can compare aceelerated non accelerated | ||
| 78 | + */ | ||
| 79 | +namespace accelerated { | ||
| 80 | +#include "../common/memops.c" | ||
| 81 | +} | ||
| 82 | + | ||
| 83 | +namespace origerated { | ||
| 84 | +#ifdef __SSE2__ | ||
| 85 | +#undef __SSE2__ | ||
| 86 | +#endif | ||
| 87 | + | ||
| 88 | +#ifdef __ARM_NEON__ | ||
| 89 | +#undef __ARM_NEON__ | ||
| 90 | +#endif | ||
| 91 | + | ||
| 92 | +#include "../common/memops.c" | ||
| 93 | +} | ||
| 94 | + | ||
| 95 | +// define conversion function types | ||
| 96 | +typedef void (*t_jack_to_integer)( | ||
| 97 | + char *dst, | ||
| 98 | + jack_default_audio_sample_t *src, | ||
| 99 | + unsigned long nsamples, | ||
| 100 | + unsigned long dst_skip, | ||
| 101 | + dither_state_t *state); | ||
| 102 | + | ||
| 103 | +typedef void (*t_integer_to_jack)( | ||
| 104 | + jack_default_audio_sample_t *dst, | ||
| 105 | + char *src, | ||
| 106 | + unsigned long nsamples, | ||
| 107 | + unsigned long src_skip); | ||
| 108 | + | ||
| 109 | +// define/setup test case data | ||
| 110 | +typedef struct test_case_data { | ||
| 111 | + uint32_t frame_size; | ||
| 112 | + uint32_t sample_size; | ||
| 113 | + bool reverse; | ||
| 114 | + t_jack_to_integer jack_to_integer_accel; | ||
| 115 | + t_jack_to_integer jack_to_integer_orig; | ||
| 116 | + t_integer_to_jack integer_to_jack_accel; | ||
| 117 | + t_integer_to_jack integer_to_jack_orig; | ||
| 118 | + dither_state_t *ditherstate; | ||
| 119 | + const char *name; | ||
| 120 | +} test_case_data_t; | ||
| 121 | + | ||
| 122 | +test_case_data_t test_cases[] = { | ||
| 123 | + { | ||
| 124 | + 4, | ||
| 125 | + 3, | ||
| 126 | + true, | ||
| 127 | + accelerated::sample_move_d32u24_sSs, | ||
| 128 | + origerated::sample_move_d32u24_sSs, | ||
| 129 | + accelerated::sample_move_dS_s32u24s, | ||
| 130 | + origerated::sample_move_dS_s32u24s, | ||
| 131 | + NULL, | ||
| 132 | + "32u24s" }, | ||
| 133 | + { | ||
| 134 | + 4, | ||
| 135 | + 3, | ||
| 136 | + false, | ||
| 137 | + accelerated::sample_move_d32u24_sS, | ||
| 138 | + origerated::sample_move_d32u24_sS, | ||
| 139 | + accelerated::sample_move_dS_s32u24, | ||
| 140 | + origerated::sample_move_dS_s32u24, | ||
| 141 | + NULL, | ||
| 142 | + "32u24" }, | ||
| 143 | + { | ||
| 144 | + 3, | ||
| 145 | + 3, | ||
| 146 | + true, | ||
| 147 | + accelerated::sample_move_d24_sSs, | ||
| 148 | + origerated::sample_move_d24_sSs, | ||
| 149 | + accelerated::sample_move_dS_s24s, | ||
| 150 | + origerated::sample_move_dS_s24s, | ||
| 151 | + NULL, | ||
| 152 | + "24s" }, | ||
| 153 | + { | ||
| 154 | + 3, | ||
| 155 | + 3, | ||
| 156 | + false, | ||
| 157 | + accelerated::sample_move_d24_sS, | ||
| 158 | + origerated::sample_move_d24_sS, | ||
| 159 | + accelerated::sample_move_dS_s24, | ||
| 160 | + origerated::sample_move_dS_s24, | ||
| 161 | + NULL, | ||
| 162 | + "24" }, | ||
| 163 | + { | ||
| 164 | + 2, | ||
| 165 | + 2, | ||
| 166 | + true, | ||
| 167 | + accelerated::sample_move_d16_sSs, | ||
| 168 | + origerated::sample_move_d16_sSs, | ||
| 169 | + accelerated::sample_move_dS_s16s, | ||
| 170 | + origerated::sample_move_dS_s16s, | ||
| 171 | + NULL, | ||
| 172 | + "16s" }, | ||
| 173 | + { | ||
| 174 | + 2, | ||
| 175 | + 2, | ||
| 176 | + false, | ||
| 177 | + accelerated::sample_move_d16_sS, | ||
| 178 | + origerated::sample_move_d16_sS, | ||
| 179 | + accelerated::sample_move_dS_s16, | ||
| 180 | + origerated::sample_move_dS_s16, | ||
| 181 | + NULL, | ||
| 182 | + "16" }, | ||
| 183 | +}; | ||
| 184 | + | ||
| 185 | +// we need to repeat for better accuracy at time measurement | ||
| 186 | +const uint32_t retry_per_case = 1000; | ||
| 187 | + | ||
| 188 | +// setup test buffers | ||
| 189 | +#define TESTBUFF_SIZE 1024 | ||
| 190 | +jack_default_audio_sample_t jackbuffer_source[TESTBUFF_SIZE]; | ||
| 191 | +// integer buffers: max 4 bytes per value / * 2 for stereo | ||
| 192 | +char integerbuffer_accel[TESTBUFF_SIZE*4*2]; | ||
| 193 | +char integerbuffer_orig[TESTBUFF_SIZE*4*2]; | ||
| 194 | +// float buffers | ||
| 195 | +jack_default_audio_sample_t jackfloatbuffer_accel[TESTBUFF_SIZE]; | ||
| 196 | +jack_default_audio_sample_t jackfloatbuffer_orig[TESTBUFF_SIZE]; | ||
| 197 | + | ||
| 198 | +// comparing unsigned makes life easier | ||
| 199 | +uint32_t extract_integer( | ||
| 200 | + char* buff, | ||
| 201 | + uint32_t offset, | ||
| 202 | + uint32_t frame_size, | ||
| 203 | + uint32_t sample_size, | ||
| 204 | + bool big_endian) | ||
| 205 | +{ | ||
| 206 | + uint32_t retval = 0; | ||
| 207 | + unsigned char* curr; | ||
| 208 | + uint32_t mult = 1; | ||
| 209 | + if(big_endian) { | ||
| 210 | + curr = (unsigned char*)buff + offset + sample_size-1; | ||
| 211 | + for(uint32_t i=0; i<sample_size; i++) { | ||
| 212 | + retval += *(curr--) * mult; | ||
| 213 | + mult*=256; | ||
| 214 | + } | ||
| 215 | + } | ||
| 216 | + else { | ||
| 217 | + curr = (unsigned char*)buff + offset + frame_size-sample_size; | ||
| 218 | + for(uint32_t i=0; i<sample_size; i++) { | ||
| 219 | + retval += *(curr++) * mult; | ||
| 220 | + mult*=256; | ||
| 221 | + } | ||
| 222 | + } | ||
| 223 | + return retval; | ||
| 224 | +} | ||
| 225 | + | ||
| 226 | +int main(int argc, char *argv[]) | ||
| 227 | +{ | ||
| 228 | +// parse_arguments(argc, argv); | ||
| 229 | + uint32_t maxerr_displayed = 10; | ||
| 230 | + | ||
| 231 | + // fill jackbuffer | ||
| 232 | + for(int i=0; i<TESTBUFF_SIZE; i++) { | ||
| 233 | + // ramp | ||
| 234 | + jack_default_audio_sample_t value = | ||
| 235 | + ((jack_default_audio_sample_t)((i % TESTBUFF_SIZE) - TESTBUFF_SIZE/2)) / (TESTBUFF_SIZE/2); | ||
| 236 | + // force clipping | ||
| 237 | + value *= 1.02; | ||
| 238 | + jackbuffer_source[i] = value; | ||
| 239 | + } | ||
| 240 | + | ||
| 241 | + for(uint32_t testcase=0; testcase<sizeof(test_cases)/sizeof(test_case_data_t); testcase++) { | ||
| 242 | + // test mono/stereo | ||
| 243 | + for(uint32_t channels=1; channels<=2; channels++) { | ||
| 244 | + ////////////////////////////////////////////////////////////////////////////// | ||
| 245 | + // jackfloat -> integer | ||
| 246 | + | ||
| 247 | + // clean target buffers | ||
| 248 | + memset(integerbuffer_accel, 0, sizeof(integerbuffer_accel)); | ||
| 249 | + memset(integerbuffer_orig, 0, sizeof(integerbuffer_orig)); | ||
| 250 | + // accel | ||
| 251 | + clock_t time_to_integer_accel = clock(); | ||
| 252 | + for(uint32_t repetition=0; repetition<retry_per_case; repetition++) | ||
| 253 | + { | ||
| 254 | + test_cases[testcase].jack_to_integer_accel( | ||
| 255 | + integerbuffer_accel, | ||
| 256 | + jackbuffer_source, | ||
| 257 | + TESTBUFF_SIZE, | ||
| 258 | + test_cases[testcase].frame_size*channels, | ||
| 259 | + test_cases[testcase].ditherstate); | ||
| 260 | + } | ||
| 261 | + float timediff_to_integer_accel = ((float)(clock() - time_to_integer_accel)) / CLOCKS_PER_SEC; | ||
| 262 | + // orig | ||
| 263 | + clock_t time_to_integer_orig = clock(); | ||
| 264 | + for(uint32_t repetition=0; repetition<retry_per_case; repetition++) | ||
| 265 | + { | ||
| 266 | + test_cases[testcase].jack_to_integer_orig( | ||
| 267 | + integerbuffer_orig, | ||
| 268 | + jackbuffer_source, | ||
| 269 | + TESTBUFF_SIZE, | ||
| 270 | + test_cases[testcase].frame_size*channels, | ||
| 271 | + test_cases[testcase].ditherstate); | ||
| 272 | + } | ||
| 273 | + float timediff_to_integer_orig = ((float)(clock() - time_to_integer_orig)) / CLOCKS_PER_SEC; | ||
| 274 | + // output performance results | ||
| 275 | + printf( | ||
| 276 | + "JackFloat->Integer @%7.7s/%u: Orig %7.6f sec / Accel %7.6f sec -> Win: %5.2f %%\n", | ||
| 277 | + test_cases[testcase].name, | ||
| 278 | + channels, | ||
| 279 | + timediff_to_integer_orig, | ||
| 280 | + timediff_to_integer_accel, | ||
| 281 | + (timediff_to_integer_orig/timediff_to_integer_accel-1)*100.0); | ||
| 282 | + uint32_t int_deviation_max = 0; | ||
| 283 | + uint32_t int_error_count = 0; | ||
| 284 | + // output error (avoid spam -> limit error lines per test case) | ||
| 285 | + for(uint32_t sample=0; sample<TESTBUFF_SIZE; sample++) { | ||
| 286 | + uint32_t sample_offset = sample*test_cases[testcase].frame_size*channels; | ||
| 287 | + // compare both results | ||
| 288 | + uint32_t intval_accel=extract_integer( | ||
| 289 | + integerbuffer_accel, | ||
| 290 | + sample_offset, | ||
| 291 | + test_cases[testcase].frame_size, | ||
| 292 | + test_cases[testcase].sample_size, | ||
| 293 | +#if __BYTE_ORDER == __BIG_ENDIAN | ||
| 294 | + !test_cases[testcase].reverse); | ||
| 295 | +#else | ||
| 296 | + test_cases[testcase].reverse); | ||
| 297 | +#endif | ||
| 298 | + uint32_t intval_orig=extract_integer( | ||
| 299 | + integerbuffer_orig, | ||
| 300 | + sample_offset, | ||
| 301 | + test_cases[testcase].frame_size, | ||
| 302 | + test_cases[testcase].sample_size, | ||
| 303 | +#if __BYTE_ORDER == __BIG_ENDIAN | ||
| 304 | + !test_cases[testcase].reverse); | ||
| 305 | +#else | ||
| 306 | + test_cases[testcase].reverse); | ||
| 307 | +#endif | ||
| 308 | + if(intval_accel != intval_orig) { | ||
| 309 | + if(int_error_count<maxerr_displayed) { | ||
| 310 | + printf("Value error sample %u:", sample); | ||
| 311 | + printf(" Orig 0x"); | ||
| 312 | + char formatstr[10]; | ||
| 313 | + sprintf(formatstr, "%%0%uX", test_cases[testcase].sample_size*2); | ||
| 314 | + printf(formatstr, intval_orig); | ||
| 315 | + printf(" Accel 0x"); | ||
| 316 | + printf(formatstr, intval_accel); | ||
| 317 | + printf("\n"); | ||
| 318 | + } | ||
| 319 | + int_error_count++; | ||
| 320 | + uint32_t int_deviation; | ||
| 321 | + if(intval_accel > intval_orig) | ||
| 322 | + int_deviation = intval_accel-intval_orig; | ||
| 323 | + else | ||
| 324 | + int_deviation = intval_orig-intval_accel; | ||
| 325 | + if(int_deviation > int_deviation_max) | ||
| 326 | + int_deviation_max = int_deviation; | ||
| 327 | + } | ||
| 328 | + } | ||
| 329 | + printf( | ||
| 330 | + "JackFloat->Integer @%7.7s/%u: Errors: %u Max deviation %u\n", | ||
| 331 | + test_cases[testcase].name, | ||
| 332 | + channels, | ||
| 333 | + int_error_count, | ||
| 334 | + int_deviation_max); | ||
| 335 | + | ||
| 336 | + ////////////////////////////////////////////////////////////////////////////// | ||
| 337 | + // integer -> jackfloat | ||
| 338 | + | ||
| 339 | + // clean target buffers | ||
| 340 | + memset(jackfloatbuffer_accel, 0, sizeof(jackfloatbuffer_accel)); | ||
| 341 | + memset(jackfloatbuffer_orig, 0, sizeof(jackfloatbuffer_orig)); | ||
| 342 | + // accel | ||
| 343 | + clock_t time_to_float_accel = clock(); | ||
| 344 | + for(uint32_t repetition=0; repetition<retry_per_case; repetition++) | ||
| 345 | + { | ||
| 346 | + test_cases[testcase].integer_to_jack_accel( | ||
| 347 | + jackfloatbuffer_accel, | ||
| 348 | + integerbuffer_orig, | ||
| 349 | + TESTBUFF_SIZE, | ||
| 350 | + test_cases[testcase].frame_size*channels); | ||
| 351 | + } | ||
| 352 | + float timediff_to_float_accel = ((float)(clock() - time_to_float_accel)) / CLOCKS_PER_SEC; | ||
| 353 | + // orig | ||
| 354 | + clock_t time_to_float_orig = clock(); | ||
| 355 | + for(uint32_t repetition=0; repetition<retry_per_case; repetition++) | ||
| 356 | + { | ||
| 357 | + test_cases[testcase].integer_to_jack_orig( | ||
| 358 | + jackfloatbuffer_orig, | ||
| 359 | + integerbuffer_orig, | ||
| 360 | + TESTBUFF_SIZE, | ||
| 361 | + test_cases[testcase].frame_size*channels); | ||
| 362 | + } | ||
| 363 | + float timediff_to_float_orig = ((float)(clock() - time_to_float_orig)) / CLOCKS_PER_SEC; | ||
| 364 | + // output performance results | ||
| 365 | + printf( | ||
| 366 | + "Integer->JackFloat @%7.7s/%u: Orig %7.6f sec / Accel %7.6f sec -> Win: %5.2f %%\n", | ||
| 367 | + test_cases[testcase].name, | ||
| 368 | + channels, | ||
| 369 | + timediff_to_float_orig, | ||
| 370 | + timediff_to_float_accel, | ||
| 371 | + (timediff_to_float_orig/timediff_to_float_accel-1)*100.0); | ||
| 372 | + jack_default_audio_sample_t float_deviation_max = 0.0; | ||
| 373 | + uint32_t float_error_count = 0; | ||
| 374 | + // output error (avoid spam -> limit error lines per test case) | ||
| 375 | + for(uint32_t sample=0; sample<TESTBUFF_SIZE; sample++) { | ||
| 376 | + // For easier estimation/readabilty we scale floats back to integer | ||
| 377 | + jack_default_audio_sample_t sample_scaling; | ||
| 378 | + switch(test_cases[testcase].sample_size) { | ||
| 379 | + case 2: | ||
| 380 | + sample_scaling = SAMPLE_16BIT_SCALING; | ||
| 381 | + break; | ||
| 382 | + default: | ||
| 383 | + sample_scaling = SAMPLE_24BIT_SCALING; | ||
| 384 | + break; | ||
| 385 | + } | ||
| 386 | + jack_default_audio_sample_t floatval_accel = jackfloatbuffer_accel[sample] * sample_scaling; | ||
| 387 | + jack_default_audio_sample_t floatval_orig = jackfloatbuffer_orig[sample] * sample_scaling; | ||
| 388 | + // compare both results | ||
| 389 | + jack_default_audio_sample_t float_deviation; | ||
| 390 | + if(floatval_accel > floatval_orig) | ||
| 391 | + float_deviation = floatval_accel-floatval_orig; | ||
| 392 | + else | ||
| 393 | + float_deviation = floatval_orig-floatval_accel; | ||
| 394 | + if(float_deviation > float_deviation_max) | ||
| 395 | + float_deviation_max = float_deviation; | ||
| 396 | + // deviation > half bit => error | ||
| 397 | + if(float_deviation > 0.5) { | ||
| 398 | + if(float_error_count<maxerr_displayed) { | ||
| 399 | + printf("Value error sample %u:", sample); | ||
| 400 | + printf(" Orig %8.1f Accel %8.1f\n", floatval_orig, floatval_accel); | ||
| 401 | + } | ||
| 402 | + float_error_count++; | ||
| 403 | + } | ||
| 404 | + } | ||
| 405 | + printf( | ||
| 406 | + "Integer->JackFloat @%7.7s/%u: Errors: %u Max deviation %f\n", | ||
| 407 | + test_cases[testcase].name, | ||
| 408 | + channels, | ||
| 409 | + float_error_count, | ||
| 410 | + float_deviation_max); | ||
| 411 | + | ||
| 412 | + printf("\n"); | ||
| 413 | + } | ||
| 414 | + } | ||
| 415 | + return 0; | ||
| 416 | +} | ||
| 417 | diff --git a/example-clients/wscript b/example-clients/wscript | ||
| 418 | index ba67614..1b2f674 100644 | ||
| 419 | --- a/example-clients/wscript | ||
| 420 | +++ b/example-clients/wscript | ||
| 421 | @@ -28,7 +28,8 @@ example_programs = { | ||
| 422 | 'jack_net_master' : 'netmaster.c', | ||
| 423 | 'jack_latent_client' : 'latent_client.c', | ||
| 424 | 'jack_midi_dump' : 'midi_dump.c', | ||
| 425 | - 'jack_midi_latency_test' : 'midi_latency_test.c' | ||
| 426 | + 'jack_midi_latency_test' : 'midi_latency_test.c', | ||
| 427 | + 'jack_simdtests' : 'simdtests.cpp' | ||
| 428 | } | ||
| 429 | |||
| 430 | example_libs = { | ||
| 431 | -- | ||
| 432 | 2.5.5 | ||
| 433 | |||
| diff --git a/meta-oe/recipes-multimedia/jack/jack_git.bb b/meta-oe/recipes-multimedia/jack/jack_git.bb index 89fd638cbe..be5f7bbd97 100644 --- a/meta-oe/recipes-multimedia/jack/jack_git.bb +++ b/meta-oe/recipes-multimedia/jack/jack_git.bb | |||
| @@ -14,7 +14,11 @@ LIC_FILES_CHKSUM = " \ | |||
| 14 | 14 | ||
| 15 | DEPENDS = "libsamplerate0 libsndfile1 readline" | 15 | DEPENDS = "libsamplerate0 libsndfile1 readline" | 
| 16 | 16 | ||
| 17 | SRC_URI = "git://github.com/jackaudio/jack2.git" | 17 | SRC_URI = " \ | 
| 18 | git://github.com/jackaudio/jack2.git \ | ||
| 19 | file://0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch \ | ||
| 20 | file://0002-jack_simdtests-add-application-checking-accurracy-an.patch \ | ||
| 21 | " | ||
| 18 | SRCREV = "0279a2d65a36d1378f5bab56d95bf9e99cc8cefb" | 22 | SRCREV = "0279a2d65a36d1378f5bab56d95bf9e99cc8cefb" | 
| 19 | PV = "1.9.10+git${SRCPV}" | 23 | PV = "1.9.10+git${SRCPV}" | 
| 20 | S = "${WORKDIR}/git" | 24 | S = "${WORKDIR}/git" | 
