
Commit a2d0790

SIMD audio muxing: initial work

1 parent: 6a13dee

10 files changed: +538 −18 lines

compile_flags.txt (+11 lines)

@@ -0,0 +1,11 @@
+-D
+SIMD
+-I
+src/include
+-I
+src/include/private
+-I
+libs/teletone/src
+-mavx
+-mavx2
+-march=native
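For context: compile_flags.txt is the one-argument-per-line compilation database read by clang-based tooling such as clangd. The `-D` / `SIMD` pair enables the SIMD code path in the new header below, the `-I` entries locate the in-tree headers, and `-mavx` / `-mavx2` / `-march=native` let the intrinsics compile down to native AVX. A minimal sketch of what the gate changes (hypothetical translation unit, not part of the commit):

	/* align_check.c -- hypothetical: SWITCH_ALIGN is a no-op unless SIMD is defined. */
	#include <stdint.h>
	#include "switch_simd.h"

	SWITCH_ALIGN static int16_t frame[160];

	int main(void)
	{
		/* 0 when built with -D SIMD (32-byte alignment); possibly 1 otherwise */
		return (uintptr_t)frame % 32 == 0 ? 0 : 1;
	}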

src/include/switch_simd.h (+337 lines)

@@ -0,0 +1,337 @@
/*
 * (c) 2025 Stéphane Alnet
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * Contributor(s):
 * Stéphane Alnet <stephane@shimaore.net>
 *
 * switch_simd.h -- SIMD definitions
 *
 */

#ifndef SWITCH_SIMD_H
#define SWITCH_SIMD_H

#ifdef SIMD

/* The initial goal of this module is to provide noticeable speed improvements for audio muxing.
 * It could probably be extended to video processing, but I haven't tried yet.
 * For larger speed improvements you generally want your data aligned on the SIMD data size.
 * (See e.g. https://www.agner.org/optimize/instruction_tables.pdf for speed measurements.)
 * Here we focus on 256 bits (32 octets) since
 * - as of 2025 it is available on essentially all x86_64 hardware via AVX and AVX2,
 * - and it is an appropriate size for e.g. PCMU or PCMA at 8kHz (512 bits would be too much for 160 bytes).
 * For easy alignment, use the SWITCH_ALIGN macro. It can be used in struct/union definitions and on stack-allocated variables.
 * Pointers might or might not be aligned. For example, glibc malloc returns blocks aligned for any built-in type (16 octets on x86_64), but an arbitrary pointer inside such a block will not necessarily be aligned!
 * Alignment results in faster loads and stores: instead of sequencing the load or store, the microcode can use a 128-bit or 256-bit lane to move the data between cache and register in a smaller number of steps.
 */

#include <stdalign.h>
#include <stdint.h>
#include <string.h>
#include <simde/x86/sse2.h>
#include <simde/x86/avx.h>
#include <simde/x86/avx2.h>
/* SIMDE will provide substitutes for AVX512 functions on lower platforms. */
#include <simde/x86/avx512.h>

enum {
	int16_per_m256i = sizeof(simde__m256i) / sizeof(int16_t),
	mask_int16_per_m256i = int16_per_m256i - 1,
	int16_per_m128i = sizeof(simde__m128i) / sizeof(int16_t),
	mask_int16_per_m128i = int16_per_m128i - 1,
	int32_per_m256i = sizeof(simde__m256i) / sizeof(int32_t),
};
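/* For reference, with 256-bit vectors these constants evaluate to:
 * int16_per_m256i == 16, mask_int16_per_m256i == 15,
 * int16_per_m128i == 8, mask_int16_per_m128i == 7,
 * int32_per_m256i == 8.
 */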
/* Apply the `SWITCH_ALIGN` prefix to:
 * - function variables,
 * - struct/union fields.
 * e.g.
 *
 *   SWITCH_ALIGN int16_t data[SWITCH_RECOMMENDED_BUFFER_SIZE/sizeof(int16_t)];
 *
 * Then `data` can be used safely as destination or source for SIMD_mux_sln_m256i_m256i_unbound, for example.
 */
#define SWITCH_ALIGN alignas(sizeof(simde__m256i))

/* SIMD-optimized int16_t saturated addition.
 * - aligned: both int16_t pointers must be aligned on a 256-bit boundary.
 * - unbound: the underlying buffer must end on an m256i (256 bits / 16 int16_t) boundary;
 *   data outside the requested range may be touched when samples % 16 != 0, and this might
 *   SIGSEGV if the underlying buffer is too short.
 * It is safe to use with buffers defined as
 *
 *   SWITCH_ALIGN int16_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
 *
 * for example.
 */
inline static void SIMD_mux_sln_m256i_m256i_unbound(simde__m256i *dst, const simde__m256i *add, int samples)
{
	int x;
	const int blocks = samples / int16_per_m256i;
	for (x = 0; x < blocks; x++) {
		/* AVX: must be aligned on a 32-byte (256-bit) boundary */
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_adds_epi16(
				/* AVX: must be aligned on a 32-byte (256-bit) boundary */
				simde_mm256_load_si256(dst+x),
				simde_mm256_load_si256(add+x)
			));
	}
}

/* SIMD-optimized int16_t saturated addition.
 * - only the first parameter must be aligned.
 * - unbound: the underlying buffer must end on an m256i (256 bits / 16 int16_t) boundary.
 */
inline static void SIMD_mux_sln_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples)
{
	int x;
	const int blocks = samples / int16_per_m256i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_adds_epi16(
				simde_mm256_load_si256(dst+x),
				simde_mm256_loadu_si256(add + x*int16_per_m256i)
			));
	}
}

/* SIMD-optimized int16_t saturated addition.
 * - unbound: the underlying buffer must end on an m256i (256 bits / 16 int16_t) boundary.
 */
inline static void SIMD_mux_sln_int16_int16_unbound(int16_t *dst, const int16_t *add, int samples)
{
	int x;
	const int blocks = samples / int16_per_m256i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_storeu_si256(
			dst + x*int16_per_m256i,
			simde_mm256_adds_epi16(
				simde_mm256_loadu_si256(dst + x*int16_per_m256i),
				simde_mm256_loadu_si256(add + x*int16_per_m256i)
			));
	}
}

inline static int SIMD_is_aligned256(const void *p) {
	return (uintptr_t)p % sizeof(simde__m256i) == 0;
}

inline static int SIMD_is_aligned128(const void *p) {
	return (uintptr_t)p % sizeof(simde__m128i) == 0;
}

inline static void SIMD_mux_sln(int16_t *dst, const int16_t *add, int samples)
{
	/* Round down to the nearest 256-bit block */
	int bound_len = samples & ~mask_int16_per_m256i;
	int extra = samples & mask_int16_per_m256i;

	const int dst_aligned = SIMD_is_aligned256(dst);
	const int src_aligned = SIMD_is_aligned256(add);

	/* Process as much as we can from the original buffers */
	if (dst_aligned && src_aligned) {
		SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)dst, (const simde__m256i *)add, bound_len);
	} else if (dst_aligned) {
		SIMD_mux_sln_m256i_int16_unbound((simde__m256i *)dst, add, bound_len);
	} else {
		SIMD_mux_sln_int16_int16_unbound(dst, add, bound_len);
	}

	if (extra > 0) {
		/* Since the original buffers might not extend all the way to the next 256-bit boundary,
		 * copy the remaining samples into zero-initialized local buffers large enough for one
		 * full block, do the maths in SIMD, and copy back only the valid samples.
		 */
		SWITCH_ALIGN int16_t _dst[int16_per_m256i] = { 0 };
		SWITCH_ALIGN int16_t _add[int16_per_m256i] = { 0 };
		memcpy(_dst, dst+bound_len, sizeof(int16_t) * extra);
		memcpy(_add, add+bound_len, sizeof(int16_t) * extra);
		SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)_dst, (const simde__m256i *)_add, int16_per_m256i);
		memcpy(dst+bound_len, _dst, sizeof(int16_t) * extra);
	}
}
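/* Example (hypothetical usage): mixing a decoded file frame into a live frame.
 *
 *   SWITCH_ALIGN int16_t frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
 *   int16_t *file_frame = ...;                  // any alignment is fine
 *   SIMD_mux_sln(frame, file_frame, samples);   // saturating add into frame
 *
 * The dispatcher picks the aligned/unaligned variants at runtime, so callers
 * only need SWITCH_ALIGN on the buffers they control.
 */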
/* In mod_conference we do 16-to-32 bit conversions to avoid overflow. */

/* Convert unaligned int16_t to unaligned int32_t.
 * - unbound: might overflow the input and output buffer boundaries if samples is not a multiple of 8.
 */
inline static void SIMD_convert32_int16_unbound(int32_t *dst, const int16_t *src, int samples)
{
	int x;
	const int blocks = samples / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		/* Store 8 int32 at once.
		 * Apparently SIMDE doesn't define an _aligned_ store operation, but this is fine.
		 */
		simde_mm256_storeu_epi32(dst + x*int32_per_m256i,
			/* Sign-extend from 16 bits to 32 bits */
			simde_mm256_cvtepi16_epi32(
				/* Load 8 int16 at once */
				simde_mm_loadu_epi16(src + x*int16_per_m128i)));
	}
}

/* Convert aligned int32_t (in batches of 8) to int16_t (in batches of 8), with signed saturation.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_convert16_m256i_unbound(simde__m128i *dst, const simde__m256i *src, int samples)
{
	int x;
	const int blocks = samples / int32_per_m256i;
	for (x = 0; x < blocks; x++) {
		simde_mm_store_si128(
			dst+x,
			simde_mm256_cvtsepi32_epi16(
				simde_mm256_load_si256(src+x)
			));
	}
}
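/* Note: simde_mm256_cvtsepi32_epi16 narrows with signed saturation, so an
 * accumulated int32 value of e.g. 40000 comes back as 32767 rather than wrapping.
 */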
/* Add packed, aligned int16_t samples to packed, aligned int32_t values.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_mux32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *add, int samples)
{
	int x;
	const int blocks = samples / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		/* AVX: must be aligned on a 32-byte (256-bit) boundary */
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_add_epi32(
				/* AVX: must be aligned on a 32-byte (256-bit) boundary */
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_load_si128(add+x)
				)));
	}
}

/* Add int16_t samples to packed, aligned int32_t values.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_mux32_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples)
{
	int x;
	const int blocks = samples / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_add_epi32(
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_loadu_epi16(add + x*int16_per_m128i)
				)));
	}
}

/* Add int16_t samples to packed int32_t values. */
inline static void SIMD_mux32_sln(simde__m256i *dst, const int16_t *add, int samples)
{
	/* Round down to the nearest block of 8 samples (128 bits of int16_t, 256 bits of int32_t) */
	int bound_len = samples & ~mask_int16_per_m128i;
	int extra = samples & mask_int16_per_m128i;

	const int src_aligned = SIMD_is_aligned128(add);

	/* Process as much as we can from the original buffer */
	if (src_aligned) {
		SIMD_mux32_m256i_m128i_unbound(dst, (const simde__m128i *)add, bound_len);
	} else {
		SIMD_mux32_m256i_int16_unbound(dst, add, bound_len);
	}

	if (extra > 0) {
		/* Since the original buffer might not extend all the way to the next 128-bit boundary,
		 * copy the remaining samples into a zero-initialized local buffer and process one full
		 * block in SIMD; the zeroed tail leaves the trailing int32 slots unchanged.
		 */
		SWITCH_ALIGN int16_t _add[int16_per_m128i] = { 0 };
		memcpy(_add, add+bound_len, sizeof(int16_t) * extra);
		SIMD_mux32_m256i_m128i_unbound(dst + bound_len/int16_per_m128i, (const simde__m128i *)_add, int16_per_m128i);
	}
}
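/* Example (hypothetical usage): conference-style mixing at int32 precision,
 * narrowing back to int16 only at the end.
 *
 *   SWITCH_ALIGN int32_t mix[SAMPLES] = { 0 };              // accumulator, SAMPLES % 8 == 0
 *   SIMD_mux32_sln((simde__m256i *)mix, member_a, SAMPLES);
 *   SIMD_mux32_sln((simde__m256i *)mix, member_b, SAMPLES);
 *   SIMD_sub32_sln((simde__m256i *)mix, member_a, SAMPLES); // drop a member's own audio
 *   SIMD_convert16_m256i_unbound((simde__m128i *)out, (const simde__m256i *)mix, SAMPLES);
 */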
/* Subtract packed, aligned int16_t values from packed, aligned int32_t values.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_sub32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *sub, int samples)
{
	int x;
	const int blocks = samples / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		/* AVX: must be aligned on a 32-byte (256-bit) boundary */
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_sub_epi32(
				/* AVX: must be aligned on a 32-byte (256-bit) boundary */
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_load_si128(sub+x)
				)));
	}
}

/* Subtract int16_t values from packed, aligned int32_t values.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_sub32_m256i_int16_unbound(simde__m256i *dst, const int16_t *sub, int samples)
{
	int x;
	const int blocks = samples / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_sub_epi32(
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_loadu_epi16(sub + x*int16_per_m128i)
				)));
	}
}

/* Subtract int16_t values from packed, aligned int32_t values. */
inline static void SIMD_sub32_sln(simde__m256i *dst, const int16_t *sub, int samples)
{
	/* Round down to the nearest block of 8 samples */
	int bound_len = samples & ~mask_int16_per_m128i;
	int extra = samples & mask_int16_per_m128i;

	const int src_aligned = SIMD_is_aligned128(sub);

	/* Process as much as we can from the original buffer */
	if (src_aligned) {
		SIMD_sub32_m256i_m128i_unbound(dst, (const simde__m128i *)sub, bound_len);
	} else {
		SIMD_sub32_m256i_int16_unbound(dst, sub, bound_len);
	}

	if (extra > 0) {
		/* Since the original buffer might not extend all the way to the next 128-bit boundary,
		 * copy the remaining samples into a zero-initialized local buffer and process one full
		 * block in SIMD; the zeroed tail leaves the trailing int32 slots unchanged.
		 */
		SWITCH_ALIGN int16_t _sub[int16_per_m128i] = { 0 };
		memcpy(_sub, sub+bound_len, sizeof(int16_t) * extra);
		SIMD_sub32_m256i_m128i_unbound(dst + bound_len/int16_per_m128i, (const simde__m128i *)_sub, int16_per_m128i);
	}
}

#else /* SIMD */

#define SWITCH_ALIGN

#endif /* SIMD */

#endif /* SWITCH_SIMD_H */
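Taken together, a minimal standalone sketch of the header in use. This is an illustration only: it assumes the SIMDE headers are installed and the file is built with the flags from compile_flags.txt (in particular `-D SIMD` and `-mavx2`):

	/* mux_demo.c -- hypothetical demo of SIMD_mux_sln; not part of the commit. */
	#include <stdio.h>
	#include <stdint.h>
	#include "switch_simd.h"

	#define SAMPLES 160 /* 20ms of 8kHz audio = 10 full 256-bit blocks of int16_t */

	int main(void)
	{
		SWITCH_ALIGN int16_t frame[SAMPLES];
		SWITCH_ALIGN int16_t file_frame[SAMPLES];
		int i;

		for (i = 0; i < SAMPLES; i++) {
			frame[i] = 30000;      /* near the int16_t ceiling */
			file_frame[i] = 10000; /* plain addition would overflow */
		}

		SIMD_mux_sln(frame, file_frame, SAMPLES);

		printf("frame[0] = %d\n", frame[0]); /* prints 32767: adds_epi16 saturates */
		return 0;
	}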

src/mod/applications/mod_conference/conference_member.c (+11 −5 lines)

@@ -35,11 +35,13 @@
  * Seven Du <dujinfang@gmail.com>
  * Emmanuel Schmidbauer <e.schmidbauer@gmail.com>
  * William King <william.king@quentustech.com>
+ * Stephane Alnet <stephane@shimaore.net>
  *
  * mod_conference.c -- Software Conference Bridge
  *
  */
 #include <mod_conference.h>
+#include <switch_simd.h>
 
 int conference_member_noise_gate_check(conference_member_t *member)
 {
@@ -550,7 +552,7 @@ void conference_member_check_channels(switch_frame_t *frame, conference_member_t
 void conference_member_add_file_data(conference_member_t *member, int16_t *data, switch_size_t file_data_len)
 {
 	switch_size_t file_sample_len;
-	int16_t file_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
+	SWITCH_ALIGN int16_t file_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
 
 
 	switch_mutex_lock(member->fnode_mutex);
@@ -618,14 +620,18 @@ void conference_member_add_file_data(conference_member_t *member, int16_t *data,
 		conference_al_process(member->fnode->al, file_frame, file_sample_len * 2, member->conference->rate);
 	}
 
-	for (i = 0; i < (int)file_sample_len * member->conference->channels; i++) {
-		if (member->fnode->mux) {
+	if (member->fnode->mux) {
+#ifdef SIMD
+		SIMD_mux_sln(data, file_frame, (int)file_sample_len * member->conference->channels);
+#else
+		for (i = 0; i < (int)file_sample_len * member->conference->channels; i++) {
 			sample = data[i] + file_frame[i];
 			switch_normalize_to_16bit(sample);
 			data[i] = (int16_t)sample;
-		} else {
-			data[i] = file_frame[i];
 		}
+#endif
+	} else {
+		memcpy(data, file_frame, (int)file_sample_len * member->conference->channels * sizeof(int16_t));
 	}
 
 }
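Worth noting in this hunk: the mux test is hoisted out of the per-sample loop, and the non-mux branch becomes a single memcpy instead of a per-sample copy. The `#else` path keeps the original saturating loop; a reference sketch of what both paths compute (assuming switch_normalize_to_16bit clamps to the int16_t range, as in FreeSWITCH):

	/* Hypothetical scalar reference for SIMD_mux_sln / the #else path above. */
	#include <stdint.h>

	static void mux_sln_scalar(int16_t *dst, const int16_t *add, int samples)
	{
		int i;
		for (i = 0; i < samples; i++) {
			int32_t sample = (int32_t)dst[i] + add[i];
			if (sample > INT16_MAX) sample = INT16_MAX; /* switch_normalize_to_16bit */
			if (sample < INT16_MIN) sample = INT16_MIN;
			dst[i] = (int16_t)sample;
		}
	}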
