/*
* (c) 2025 Stéphane Alnet
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Contributor(s):
* Stéphane Alnet <stephane@shimaore.net>
*
* switch_simd.h -- SIMD definitions
*
*/
#ifndef SWITCH_SIMD_H
#define SWITCH_SIMD_H
#ifdef SIMD
/* The initial goal of this module is to provide noticeable speed improvements for audio muxing.
 * It could probably be extended to video processing, but I haven't tried yet.
 * For higher speed improvements you generally want your data to be aligned on the SIMD register size.
 * (See e.g. https://www.agner.org/optimize/instruction_tables.pdf for speed measurements.)
 * Here we focus on 256 bits (32 octets) since
 * - as of 2025 it is available on most x86_64 hardware via AVX and AVX2,
 * - and it is an appropriate size for e.g. PCMU or PCMA at 8kHz (512 bits would be too much for a 160-octet payload).
 * For easy alignment, use the SWITCH_ALIGN macro. It can be used in struct/union definitions, and for stack-allocated variables.
 * Pointers might or might not be aligned. For example, glibc malloc returns memory blocks aligned on 16 octets (on 64-bit platforms), but an arbitrary pointer inside such a block will not necessarily be aligned!
 * Alignment results in faster loads and stores: instead of sequencing the load or store, the microcode can use a 128-bit or 256-bit lane to move the data between cache and register in a smaller number of steps.
 */
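/* Illustrative arithmetic only (8kHz audio with a 20ms ptime assumed): one frame is 160 samples;
 * decoded to L16 that is 160 x 2 = 320 octets, i.e. exactly ten 256-bit (32-octet) blocks, and the
 * 160-octet PCMU/PCMA payload itself is exactly five such blocks, whereas a 512-bit block size
 * would not divide it evenly (160 / 64 = 2.5).
 */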
#include <stdalign.h>
#include <stdint.h>
#include <string.h>
#include <simde/x86/sse2.h>
#include <simde/x86/avx.h>
#include <simde/x86/avx2.h>
/* SIMDE provides substitutes for the AVX512 functions on platforms that lack them. */
#include <simde/x86/avx512.h>
enum {
	int16_per_m256i = sizeof(simde__m256i)/sizeof(int16_t),
	mask_int16_per_m256i = int16_per_m256i-1,
	int16_per_m128i = sizeof(simde__m128i)/sizeof(int16_t),
	mask_int16_per_m128i = int16_per_m128i-1,
	int32_per_m256i = sizeof(simde__m256i)/sizeof(int32_t),
};
/* Apply the `SWITCH_ALIGN` prefix to:
 * - function variables
 * - struct/union fields
 * e.g.
 *
 *   SWITCH_ALIGN int16_t data[SWITCH_RECOMMENDED_BUFFER_SIZE/sizeof(int16_t)];
 *
 * Then `data` can be used safely as destination or source for SIMD_mux_sln_m256i_m256i_unbound (with the appropriate casts), for example.
 */
#define SWITCH_ALIGN alignas(sizeof(simde__m256i))
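/* Illustrative sketch only (hypothetical struct and field names): the macro works the same way
 * inside a struct or union definition, e.g.
 *
 *   struct mix_state {
 *       SWITCH_ALIGN int16_t frame_data[SWITCH_RECOMMENDED_BUFFER_SIZE / sizeof(int16_t)];
 *       uint32_t samples;
 *   };
 *
 * alignas() on a member raises the alignment requirement of the whole struct, so a stack- or
 * statically-allocated instance will have `frame_data` correctly aligned; a heap-allocated
 * instance still needs an allocator that honours 32-octet alignment.
 */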
/* SIMD-optimized int16_t saturated addition
 * - aligned: both pointers must be aligned on a 256-bit boundary
 * - unbound: the underlying buffers must end on an m256i (256 bits / 16 int16_t) boundary;
 *   data outside of the requested range will be modified if samples % 16 != 0, and this might
 *   SIGSEGV if the underlying buffer is too short.
 * It is safe to use with buffers defined as
 *
 *   SWITCH_ALIGN int16_t data[SWITCH_RECOMMENDED_BUFFER_SIZE/sizeof(int16_t)];
 *
 * for example.
 */
inline static void SIMD_mux_sln_m256i_m256i_unbound(simde__m256i *dst, const simde__m256i *add, int samples)
{
	int x;
	/* Round up so the trailing partial block (if any) is processed, per the "unbound" contract above. */
	const int blocks = (samples + mask_int16_per_m256i) / int16_per_m256i;
	for (x = 0; x < blocks; x++) {
		/* AVX: must be aligned on a 32-byte (256-bit) boundary */
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_adds_epi16(
				/* AVX: must be aligned on a 32-byte (256-bit) boundary */
				simde_mm256_load_si256(dst+x),
				simde_mm256_load_si256(add+x)
			));
	}
}
/* SIMD-optimized int16_t saturated addition
 * - only the first parameter must be aligned
 * - unbound: the underlying buffers must end on an m256i (256 bits / 16 int16_t) boundary.
 */
inline static void SIMD_mux_sln_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + mask_int16_per_m256i) / int16_per_m256i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_adds_epi16(
				simde_mm256_load_si256(dst+x),
				simde_mm256_loadu_si256(add+x*int16_per_m256i)
			));
	}
}
/* SIMD-optimized int16_t saturated addition
 * - neither pointer needs to be aligned
 * - unbound: the underlying buffers must end on an m256i (256 bits / 16 int16_t) boundary.
 */
inline static void SIMD_mux_sln_int16_int16_unbound(int16_t *dst, const int16_t *add, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + mask_int16_per_m256i) / int16_per_m256i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_storeu_si256(
			dst+x*int16_per_m256i,
			simde_mm256_adds_epi16(
				simde_mm256_loadu_si256(dst+x*int16_per_m256i),
				simde_mm256_loadu_si256(add+x*int16_per_m256i)
			));
	}
}
inline static int SIMD_is_aligned256(const void *p) {
	return (uintptr_t)p % sizeof(simde__m256i) == 0;
}
inline static int SIMD_is_aligned128(const void *p) {
	return (uintptr_t)p % sizeof(simde__m128i) == 0;
}
inline static void SIMD_mux_sln(int16_t *dst, const int16_t *add, int samples)
{
	/* Round down to the nearest 256-bit block */
	uint bound_len = samples & ~mask_int16_per_m256i;
	uint extra = samples & mask_int16_per_m256i;
	const int dst_aligned = SIMD_is_aligned256(dst);
	const int src_aligned = SIMD_is_aligned256(add);
	/* Process as much as we can from the original buffers */
	if (dst_aligned && src_aligned) {
		SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)dst, (const simde__m256i *)add, bound_len);
	} else if (dst_aligned) {
		SIMD_mux_sln_m256i_int16_unbound((simde__m256i *)dst, add, bound_len);
	} else {
		SIMD_mux_sln_int16_int16_unbound(dst, add, bound_len);
	}
	if (extra > 0) {
		/* Since the original buffers might not go all the way up to the next 256-bit boundary, we copy
		 * the remaining samples into local buffers large enough to hold a full block, then do the maths in SIMD.
		 */
		SWITCH_ALIGN int16_t _dst[int16_per_m256i];
		SWITCH_ALIGN int16_t _add[int16_per_m256i];
		memcpy(_dst, dst+bound_len, sizeof(int16_t) * extra);
		memcpy(_add, add+bound_len, sizeof(int16_t) * extra);
		SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)_dst, (const simde__m256i *)_add, extra);
		memcpy(dst+bound_len, _dst, sizeof(int16_t) * extra);
	}
}
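/* Usage sketch (hypothetical variable names, assuming 16-bit signed linear audio): mux one
 * decoded frame into another without worrying about either pointer's alignment, e.g.
 *
 *   int16_t *write_buf;        // destination samples, already holding audio
 *   const int16_t *other_buf;  // samples to add in
 *   int samples;               // number of int16_t samples in the frame
 *   ...
 *   SIMD_mux_sln(write_buf, other_buf, samples);
 *
 * The wrapper picks the aligned or unaligned variant for each pointer and handles the trailing
 * samples that do not fill a complete 256-bit block.
 */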
/* In mod_conference we do 16-to-32 bit conversions to avoid overflow (see the usage sketch after SIMD_sub32_sln below). */
/* Convert from unaligned int16_t to unaligned int32_t.
 * - unbound: might overflow the input and output buffer boundaries if samples is not a multiple of 8.
 */
inline static void SIMD_convert32_int16_unbound(int32_t *dst, const int16_t *src, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + mask_int16_per_m128i) / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		/* Store 8 int32 at once.
		 * Apparently SIMDE doesn't define an _aligned_ store operation, but this is fine.
		 */
		simde_mm256_storeu_epi32(dst+x*int32_per_m256i,
			/* Sign-extend from 16 bits to 32 bits */
			simde_mm256_cvtepi16_epi32(
				/* Load 8 int16 at once */
				simde_mm_loadu_epi16(src+x*int16_per_m128i)));
	}
}
/* Convert from packed, aligned int32_t (in bunches of 8) to packed, aligned int16_t (in bunches of 8), with saturation.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_convert16_m256i_unbound(simde__m128i *dst, const simde__m256i *src, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + int32_per_m256i - 1) / int32_per_m256i;
	for (x = 0; x < blocks; x++) {
		simde_mm_store_si128(
			dst+x,
			simde_mm256_cvtsepi32_epi16(
				simde_mm256_load_si256(src+x)
			));
	}
}
/* Add packed, aligned int16_t samples to packed, aligned int32_t values.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_mux32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *add, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + mask_int16_per_m128i) / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		/* AVX: must be aligned on a 32-byte (256-bit) boundary */
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_add_epi32(
				/* AVX: must be aligned on a 32-byte (256-bit) boundary */
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_load_si128(add+x)
				)));
	}
}
/* Add int16_t samples to packed, aligned int32_t values.
 * - only the first parameter must be aligned
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_mux32_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + mask_int16_per_m128i) / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_add_epi32(
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_loadu_epi16(add+x*int16_per_m128i)
				)));
	}
}
/* Add int16_t samples to packed, aligned int32_t values. */
inline static void SIMD_mux32_sln(simde__m256i *dst, const int16_t *add, int samples)
{
	/* Round down to the nearest 128-bit block (8 int16_t samples, i.e. one 256-bit block of int32_t) */
	uint bound_len = samples & ~mask_int16_per_m128i;
	uint extra = samples & mask_int16_per_m128i;
	const int src_aligned = SIMD_is_aligned128(add);
	/* Process as much as we can from the original buffer */
	if (src_aligned) {
		SIMD_mux32_m256i_m128i_unbound(dst, (const simde__m128i *)add, bound_len);
	} else {
		SIMD_mux32_m256i_int16_unbound(dst, add, bound_len);
	}
	if (extra > 0) {
		/* Since the source buffer might not go all the way up to the next 128-bit boundary, we copy
		 * the remaining samples into a zero-padded local buffer, then do the maths in SIMD, continuing
		 * in dst right after the blocks processed above.
		 */
		SWITCH_ALIGN int16_t _add[int16_per_m128i] = {0};
		memcpy(_add, add+bound_len, sizeof(int16_t) * extra);
		SIMD_mux32_m256i_m128i_unbound(dst + bound_len / int16_per_m128i, (const simde__m128i *)_add, extra);
	}
}
/* Subtract packed, aligned int16_t values from packed, aligned int32_t values.
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_sub32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *sub, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + mask_int16_per_m128i) / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		/* AVX: must be aligned on a 32-byte (256-bit) boundary */
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_sub_epi32(
				/* AVX: must be aligned on a 32-byte (256-bit) boundary */
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_load_si128(sub+x)
				)));
	}
}
/* Subtract int16_t values from packed, aligned int32_t values.
 * - only the first parameter must be aligned
 * - unbound: might overflow the input and output buffer boundaries.
 */
inline static void SIMD_sub32_m256i_int16_unbound(simde__m256i *dst, const int16_t *sub, int samples)
{
	uint x;
	/* Round up so the trailing partial block (if any) is processed. */
	const uint blocks = (samples + mask_int16_per_m128i) / int16_per_m128i;
	for (x = 0; x < blocks; x++) {
		simde_mm256_store_si256(
			dst+x,
			simde_mm256_sub_epi32(
				simde_mm256_load_si256(dst+x),
				simde_mm256_cvtepi16_epi32(
					simde_mm_loadu_epi16(sub+x*int16_per_m128i)
				)));
	}
}
/* Subtract int16_t values from packed, aligned int32_t values.
 */
inline static void SIMD_sub32_sln(simde__m256i *dst, const int16_t *sub, int samples)
{
	/* Round down to the nearest 128-bit block (8 int16_t samples, i.e. one 256-bit block of int32_t) */
	uint bound_len = samples & ~mask_int16_per_m128i;
	uint extra = samples & mask_int16_per_m128i;
	const int src_aligned = SIMD_is_aligned128(sub);
	/* Process as much as we can from the original buffer */
	if (src_aligned) {
		SIMD_sub32_m256i_m128i_unbound(dst, (const simde__m128i *)sub, bound_len);
	} else {
		SIMD_sub32_m256i_int16_unbound(dst, sub, bound_len);
	}
	if (extra > 0) {
		/* Since the source buffer might not go all the way up to the next 128-bit boundary, we copy
		 * the remaining samples into a zero-padded local buffer, then do the maths in SIMD, continuing
		 * in dst right after the blocks processed above.
		 */
		SWITCH_ALIGN int16_t _sub[int16_per_m128i] = {0};
		memcpy(_sub, sub+bound_len, sizeof(int16_t) * extra);
		SIMD_sub32_m256i_m128i_unbound(dst + bound_len / int16_per_m128i, (const simde__m128i *)_sub, extra);
	}
}
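/* Usage sketch (hypothetical names; `contributors`, `data`, and `samples` are placeholders, not part
 * of this API): the 32-bit helpers above are meant to be combined roughly as follows when mixing N
 * contributors, accumulating in int32_t to avoid overflow and narrowing back to int16_t with
 * saturation only at the end.
 *
 *   SWITCH_ALIGN int32_t mix32[SWITCH_RECOMMENDED_BUFFER_SIZE / sizeof(int16_t)];
 *   SWITCH_ALIGN int32_t tmp32[SWITCH_RECOMMENDED_BUFFER_SIZE / sizeof(int16_t)];
 *   SWITCH_ALIGN int16_t out16[SWITCH_RECOMMENDED_BUFFER_SIZE / sizeof(int16_t)];
 *   int i;
 *
 *   memset(mix32, 0, sizeof(mix32));
 *   for (i = 0; i < contributors; i++)
 *       SIMD_mux32_sln((simde__m256i *)mix32, data[i], samples);   // data[i]: int16_t samples
 *   for (i = 0; i < contributors; i++) {
 *       memcpy(tmp32, mix32, sizeof(tmp32));
 *       SIMD_sub32_sln((simde__m256i *)tmp32, data[i], samples);   // everyone except contributor i
 *       SIMD_convert16_m256i_unbound((simde__m128i *)out16, (const simde__m256i *)tmp32, samples);
 *       // out16 now holds contributor i's personalized mix, saturated to int16_t
 *   }
 */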
#else /* SIMD */
#define SWITCH_ALIGN
#endif /* SIMD */
#endif /* SWITCH_SIMD_H */