#ifndef PHASE_SHIFTER_H
#define PHASE_SHIFTER_H

#ifdef HAVE_SSE_INTRINSICS
#include <xmmintrin.h>
#elif defined(HAVE_NEON)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <array>
#include <complex>
#include <memory>
#include <stddef.h>

#include "alcomplex.h"
#include "alspan.h"

/* Implements a wide-band +90 degree phase-shift. Note that this should be
 * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
 * signal delay (FilterSize/2) to properly align.
 */
template<size_t FilterSize>
struct PhaseShifterT {
    static_assert(FilterSize >= 16, "FilterSize needs to be at least 16");
    static_assert((FilterSize&(FilterSize-1)) == 0, "FilterSize needs to be a power of two");

    alignas(16) std::array<float,FilterSize/2> mCoeffs{};

    /* Some notes on this filter construction.
     *
     * A wide-band phase-shift filter needs a delay to maintain linearity. A
     * dirac impulse in the center of a time-domain buffer represents a filter
     * passing all frequencies through as-is with a pure delay. Converting
     * that to the frequency domain, adjusting the phase of each frequency bin
     * by +90 degrees, then converting back to the time domain, results in a
     * FIR filter that applies a +90 degree wide-band phase-shift.
     *
     * A particularly notable aspect of the time-domain filter response is
     * that every other coefficient is 0. This allows doubling the effective
     * size of the filter, by storing only the non-0 coefficients and
     * double-stepping over the input to apply it.
     *
     * Additionally, the resulting filter is independent of the sample rate.
     * The same filter can be applied regardless of the device's sample rate
     * and achieve the same effect.
     */
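    /* For reference, the frequency response the constructor below builds
     * numerically (with N = fft_size, and the negative-frequency bins set to
     * the conjugate mirror so the time-domain result stays real) is
     *
     *   H[k] = e^(i*pi/2) * e^(-i*2*pi*k*(N/2)/N) = i * (-1)^k,  0 <= k <= N/2
     *
     * i.e. the pure N/2-sample delay rotated by +90 degrees.
     */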
    PhaseShifterT()
    {
        using complex_d = std::complex<double>;
        constexpr size_t fft_size{FilterSize};
        constexpr size_t half_size{fft_size / 2};

        /* Start with a dirac impulse in the center of the buffer, a pure
         * half_size-sample delay.
         */
        auto fftBuffer = std::make_unique<complex_d[]>(fft_size);
        std::fill_n(fftBuffer.get(), fft_size, complex_d{});
        fftBuffer[half_size] = 1.0;

        /* Convert to the frequency domain, rotate each bin by +90 degrees
         * (multiply by i), and mirror the conjugate into the negative
         * frequencies so the inverse FFT produces a real response.
         */
        forward_fft({fftBuffer.get(), fft_size});
        for(size_t i{0};i < half_size+1;++i)
            fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
        for(size_t i{half_size+1};i < fft_size;++i)
            fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
        inverse_fft({fftBuffer.get(), fft_size});

        /* Only every other coefficient is non-0, so store just those,
         * stepping backward from the end of the response (time-reversed for
         * the convolution loops below) and normalizing by the FFT length.
         */
        auto fftiter = fftBuffer.get() + half_size + (FilterSize/2 - 1);
        for(float &coeff : mCoeffs)
        {
            coeff = static_cast<float>(fftiter->real() / double{fft_size});
            fftiter -= 2;
        }
    }

    /* Writes the phase-shifted result of src to dst. */
    void process(al::span<float> dst, const float *RESTRICT src) const;
    /* As process, but adds the result onto the existing contents of dst. */
    void processAccum(al::span<float> dst, const float *RESTRICT src) const;

private:
#if defined(HAVE_NEON)
    /* There don't seem to be NEON intrinsics to do this kind of stipple
     * shuffling, so there are two custom methods for it.
     */
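    /* shuffle_2020(a, b) -> { a[0], a[2], b[0], b[2] }
     * shuffle_3131(a, b) -> { a[1], a[3], b[1], b[3] }
     */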
    static auto shuffle_2020(float32x4_t a, float32x4_t b)
    {
        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
        ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
        ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
        ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
        return ret;
    }
    static auto shuffle_3131(float32x4_t a, float32x4_t b)
    {
        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
        ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
        ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
        ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
        return ret;
    }
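    /* unpacklo(a, b) -> { a[0], b[0], a[1], b[1] }
     * unpackhi(a, b) -> { a[2], b[2], a[3], b[3] }
     * (NEON equivalents of SSE's _mm_unpacklo_ps/_mm_unpackhi_ps.)
     */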
    static auto unpacklo(float32x4_t a, float32x4_t b)
    {
        float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
        return vcombine_f32(result.val[0], result.val[1]);
    }
    static auto unpackhi(float32x4_t a, float32x4_t b)
    {
        float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
        return vcombine_f32(result.val[0], result.val[1]);
    }
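    /* load4(a, b, c, d) -> { a, b, c, d } */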
    static auto load4(float32_t a, float32_t b, float32_t c, float32_t d)
    {
        float32x4_t ret{vmovq_n_f32(a)};
        ret = vsetq_lane_f32(b, ret, 1);
        ret = vsetq_lane_f32(c, ret, 2);
        ret = vsetq_lane_f32(d, ret, 3);
        return ret;
    }
#endif
};

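/* A minimal usage sketch (illustrative only; the buffer names and the delay
 * handling here are hypothetical, not part of this header):
 *
 *   PhaseShifterT<128> shifter;  // 128-tap filter
 *   // direct path: delay the dry signal by FilterSize/2 = 64 samples
 *   // shifted path: delay the input by FilterSize/2 - 1 = 63 samples, then
 *   shifter.process(al::span<float>{output, count}, delayedInput);
 *   // or accumulate into an existing mix:
 *   shifter.processAccum(al::span<float>{output, count}, delayedInput);
 */
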
template<size_t S>
inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT src) const
{
#ifdef HAVE_SSE_INTRINSICS
    /* Process two output samples per iteration, sharing the coefficient
     * loads: even input samples feed the first accumulator, odd samples the
     * second.
     */
    if(size_t todo{dst.size()>>1})
    {
        auto *out = reinterpret_cast<__m64*>(dst.data());
        do {
            __m128 r04{_mm_setzero_ps()};
            __m128 r14{_mm_setzero_ps()};
            for(size_t j{0};j < mCoeffs.size();j+=4)
            {
                const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
                const __m128 s0{_mm_loadu_ps(&src[j*2])};
                const __m128 s1{_mm_loadu_ps(&src[j*2 + 4])};

                __m128 s{_mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0))};
                r04 = _mm_add_ps(r04, _mm_mul_ps(s, coeffs));

                s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
                r14 = _mm_add_ps(r14, _mm_mul_ps(s, coeffs));
            }
            src += 2;

            /* Horizontally sum the two accumulators and store the pair. */
            __m128 r4{_mm_add_ps(_mm_unpackhi_ps(r04, r14), _mm_unpacklo_ps(r04, r14))};
            r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
            _mm_storel_pi(out, r4);
            ++out;
        } while(--todo);
    }
    if((dst.size()&1))
    {
        /* Handle the last sample of an odd-length output. */
        __m128 r4{_mm_setzero_ps()};
        for(size_t j{0};j < mCoeffs.size();j+=4)
        {
            const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
            const __m128 s{_mm_setr_ps(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
            r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
        }
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        dst.back() = _mm_cvtss_f32(r4);
    }

#elif defined(HAVE_NEON)

    size_t pos{0};
    if(size_t todo{dst.size()>>1})
    {
        do {
            float32x4_t r04{vdupq_n_f32(0.0f)};
            float32x4_t r14{vdupq_n_f32(0.0f)};
            for(size_t j{0};j < mCoeffs.size();j+=4)
            {
                const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                const float32x4_t s0{vld1q_f32(&src[j*2])};
                const float32x4_t s1{vld1q_f32(&src[j*2 + 4])};

                r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs);
                r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs);
            }
            src += 2;

            float32x4_t r4{vaddq_f32(unpackhi(r04, r14), unpacklo(r04, r14))};
            float32x2_t r2{vadd_f32(vget_low_f32(r4), vget_high_f32(r4))};
            vst1_f32(&dst[pos], r2);
            pos += 2;
        } while(--todo);
    }
    if((dst.size()&1))
    {
        float32x4_t r4{vdupq_n_f32(0.0f)};
        for(size_t j{0};j < mCoeffs.size();j+=4)
        {
            const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
            const float32x4_t s{load4(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
            r4 = vmlaq_f32(r4, s, coeffs);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        dst[pos] = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
    }

#else

    /* Plain C fallback: for each output sample, convolve the stored
     * coefficients against every other input sample.
     */
    for(float &output : dst)
    {
        float ret{0.0f};
        for(size_t j{0};j < mCoeffs.size();++j)
            ret += src[j*2] * mCoeffs[j];

        output = ret;
        ++src;
    }
#endif
}

template<size_t S>
inline void PhaseShifterT<S>::processAccum(al::span<float> dst, const float *RESTRICT src) const
{
#ifdef HAVE_SSE_INTRINSICS
    if(size_t todo{dst.size()>>1})
    {
        auto *out = reinterpret_cast<__m64*>(dst.data());
        do {
            __m128 r04{_mm_setzero_ps()};
            __m128 r14{_mm_setzero_ps()};
            for(size_t j{0};j < mCoeffs.size();j+=4)
            {
                const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
                const __m128 s0{_mm_loadu_ps(&src[j*2])};
                const __m128 s1{_mm_loadu_ps(&src[j*2 + 4])};

                __m128 s{_mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0))};
                r04 = _mm_add_ps(r04, _mm_mul_ps(s, coeffs));

                s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
                r14 = _mm_add_ps(r14, _mm_mul_ps(s, coeffs));
            }
            src += 2;

            /* As in process, but add the result pair onto the existing
             * output instead of overwriting it.
             */
            __m128 r4{_mm_add_ps(_mm_unpackhi_ps(r04, r14), _mm_unpacklo_ps(r04, r14))};
            r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
            _mm_storel_pi(out, _mm_add_ps(_mm_loadl_pi(_mm_undefined_ps(), out), r4));
            ++out;
        } while(--todo);
    }
    if((dst.size()&1))
    {
        __m128 r4{_mm_setzero_ps()};
        for(size_t j{0};j < mCoeffs.size();j+=4)
        {
            const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
            const __m128 s{_mm_setr_ps(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
            r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
        }
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        dst.back() += _mm_cvtss_f32(r4);
    }

#elif defined(HAVE_NEON)

    size_t pos{0};
    if(size_t todo{dst.size()>>1})
    {
        do {
            float32x4_t r04{vdupq_n_f32(0.0f)};
            float32x4_t r14{vdupq_n_f32(0.0f)};
            for(size_t j{0};j < mCoeffs.size();j+=4)
            {
                const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                const float32x4_t s0{vld1q_f32(&src[j*2])};
                const float32x4_t s1{vld1q_f32(&src[j*2 + 4])};

                r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs);
                r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs);
            }
            src += 2;

            float32x4_t r4{vaddq_f32(unpackhi(r04, r14), unpacklo(r04, r14))};
            float32x2_t r2{vadd_f32(vget_low_f32(r4), vget_high_f32(r4))};
            vst1_f32(&dst[pos], vadd_f32(vld1_f32(&dst[pos]), r2));
            pos += 2;
        } while(--todo);
    }
    if((dst.size()&1))
    {
        float32x4_t r4{vdupq_n_f32(0.0f)};
        for(size_t j{0};j < mCoeffs.size();j+=4)
        {
            const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
            const float32x4_t s{load4(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
            r4 = vmlaq_f32(r4, s, coeffs);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        dst[pos] += vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
    }

#else

    for(float &output : dst)
    {
        float ret{0.0f};
        for(size_t j{0};j < mCoeffs.size();++j)
            ret += src[j*2] * mCoeffs[j];

        output += ret;
        ++src;
    }
#endif
}

#endif /* PHASE_SHIFTER_H */