// core/stdarch/crates/core_arch/src/x86/avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
//!
//! AVX2 expands most AVX commands to 256-bit wide vector registers and
//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
//! overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

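// NOTE: illustrative usage sketch, not part of the upstream file; the helper
// name is made up. It shows that `vpabsd` has no wider result type, so the
// absolute value of `i32::MIN` wraps back to `i32::MIN`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_abs_epi32_wraps_at_min() {
    let a = _mm256_setr_epi32(0, 1, -1, i32::MIN, i32::MAX, -7, 42, -42);
    let r = _mm256_abs_epi32(a);
    let e = _mm256_setr_epi32(0, 1, 1, i32::MIN, i32::MAX, 7, 42, 42);
    // All 32 byte-wise comparisons are equal, so the movemask is all ones (-1).
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), -1);
}
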
/// Computes the absolute values of packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
}

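// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. It contrasts the wrapping behaviour of `_mm256_add_epi8` with the
// clamping behaviour of `_mm256_adds_epi8`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_wrapping_vs_saturating_add() {
    let a = _mm256_set1_epi8(120);
    let b = _mm256_set1_epi8(20);
    // 120 + 20 wraps to -116 for the plain add...
    let wrapped = _mm256_add_epi8(a, b);
    assert_eq!(
        _mm256_movemask_epi8(_mm256_cmpeq_epi8(wrapped, _mm256_set1_epi8(-116))),
        -1
    );
    // ...but clamps to i8::MAX (127) for the saturating add.
    let saturated = _mm256_adds_epi8(a, b);
    assert_eq!(
        _mm256_movemask_epi8(_mm256_cmpeq_epi8(saturated, _mm256_set1_epi8(127))),
        -1
    );
}
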
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    unsafe {
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    const fn mask(shift: u32, i: u32) -> u32 {
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            i + shift
        } else {
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}

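// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. It shows that `_mm256_alignr_epi8` operates on each 128-bit lane
// independently, concatenating the lane of `a` above the lane of `b` and
// shifting the pair right by `IMM8` bytes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_alignr_epi8() {
    let a = _mm256_set1_epi8(1);
    let b = _mm256_set1_epi8(2);
    // With a shift of 4, each 16-byte output lane is 12 bytes of `b`
    // followed by 4 bytes of `a`.
    let r = _mm256_alignr_epi8::<4>(a, b);
    let e = _mm256_setr_epi8(
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
    );
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), -1);
}
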
/// Computes the bitwise AND of 256 bits (representing integer data)
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
}

/// Computes the bitwise NOT of 256 bits (representing integer data)
/// in `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let all_ones = _mm256_set1_epi8(-1);
        transmute(simd_and(
            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
            b.as_i64x4(),
        ))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
        transmute(simd_cast::<_, u16x16>(r))
    }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u16x32>(a.as_u8x32());
        let b = simd_cast::<_, u16x32>(b.as_u8x32());
        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
        transmute(simd_cast::<_, u8x32>(r))
    }
}

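// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. It shows that `vpavgb` computes the rounding average
// `(a + b + 1) >> 1` in a widened type, so the intermediate sum cannot overflow.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_avg_epu8_rounds_up() {
    let a = _mm256_set1_epi8(-1); // 255 as an unsigned byte
    let b = _mm256_set1_epi8(0);
    // (255 + 0 + 1) >> 1 == 128, i.e. -128 when reinterpreted as i8.
    let r = _mm256_avg_epu8(a, b);
    assert_eq!(
        _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi8(-128))),
        -1
    );
}
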
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

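// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. Each bit of `IMM8` selects the corresponding 32-bit element: 0 keeps
// the element from `a`, 1 takes it from `b`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_blend_epi32() {
    let a = _mm256_set1_epi32(10);
    let b = _mm256_set1_epi32(20);
    // 0b1010_1010 takes `b` for the odd elements and keeps `a` for the even ones.
    let r = _mm256_blend_epi32::<0b1010_1010>(a, b);
    let e = _mm256_setr_epi32(10, 20, 10, 20, 10, 20, 10, 20);
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), -1);
}
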
/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 8-bit integers from `a` and `b` using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
    unsafe {
        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
    }
}

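// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. Only the most significant bit of each mask byte matters: a negative
// byte selects from `b`, otherwise the byte is taken from `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_blendv_epi8() {
    let a = _mm256_set1_epi8(1);
    let b = _mm256_set1_epi8(2);
    let mask = _mm256_set1_epi8(-1); // MSB set in every byte
    let r = _mm256_blendv_epi8(a, b, mask);
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, b)), -1);
}
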
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}

/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}

/// Compares packed 64-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
}

/// Compares packed 64-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
}

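// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. The comparison intrinsics return all-ones (-1) in every matching
// element and zero elsewhere, which combines naturally with
// `_mm256_blendv_epi8` or `_mm256_movemask_epi8`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_cmpgt_as_mask() {
    let a = _mm256_setr_epi32(5, -3, 7, 0, 1, 2, 3, 4);
    let b = _mm256_set1_epi32(0);
    let gt = _mm256_cmpgt_epi32(a, b);
    // Clamp negative elements of `a` to zero by blending on the mask.
    let clamped = _mm256_blendv_epi8(b, a, gt);
    let e = _mm256_setr_epi32(5, 0, 7, 0, 1, 2, 3, 4);
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(clamped, e)), -1);
}
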
/// Sign-extend 16-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
}

/// Sign-extend 16-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i16x8();
        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Sign-extend 32-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
}

/// Sign-extend 8-bit integers to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
}

/// Sign-extend 8-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Sign-extend 8-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit
/// integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
}

/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
/// integers. The upper four elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u16x8();
        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
}

/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
}

/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
/// integers. The upper eight elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
/// integers. The upper twelve elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

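// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. It contrasts sign extension with zero extension of the low eight
// bytes of a 128-bit vector.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_extend_epi8_to_epi32() {
    let a = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
    let signed = _mm256_cvtepi8_epi32(a);
    let unsigned = _mm256_cvtepu8_epi32(a);
    // Sign extension preserves the value; zero extension reinterprets the bytes
    // as unsigned, so -1 becomes 255, -3 becomes 253, and so on.
    let e_signed = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
    let e_unsigned = _mm256_setr_epi32(255, 2, 253, 4, 251, 6, 249, 8);
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(signed, e_signed)), -1);
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(unsigned, e_unsigned)), -1);
}
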
/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        let b = i64x4::ZERO;
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
}

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
}

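// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. The horizontal adds operate within each 128-bit lane, so the result
// interleaves pair sums from `a` and `b` lane by lane rather than across the
// full 256-bit vector.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_hadd_epi32_lanes() {
    let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    let b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
    let r = _mm256_hadd_epi32(a, b);
    // Low lane: pair sums from the low halves of `a` and `b`; high lane likewise.
    let e = _mm256_setr_epi32(1 + 2, 3 + 4, 10 + 20, 30 + 40, 5 + 6, 7 + 8, 50 + 60, 70 + 80);
    assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), -1);
}
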
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let zero = i32x8::ZERO;
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
    src: __m256i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x8();
    let mask = mask.as_i32x8();
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

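// NOTE: illustrative sketch, not part of the upstream file; the helper name is
// made up. It gathers 32-bit values from a slice at the given element offsets,
// using `SCALE = 4` because the offsets are counted in `i32` elements; elements
// whose mask bit is clear keep the value from `src`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn _example_mask_i32gather_epi32() {
    let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    let offsets = _mm_setr_epi32(0, 2, 4, 6);
    let src = _mm_set1_epi32(-1);
    // Gather only the first two elements; the last two keep the value from `src`.
    let mask = _mm_setr_epi32(-1, -1, 0, 0);
    // SAFETY: every gathered offset stays in bounds of `data`.
    let r = unsafe { _mm_mask_i32gather_epi32::<4>(src, data.as_ptr(), offsets, mask) };
    let e = _mm_setr_epi32(10, 12, -1, -1);
    assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xffff);
}
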
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdps(src, slice, offsets, mask, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_ps();
    let neg_one = _mm256_set1_ps(-1.0);
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
    src: __m256,
    slice: *const f32,
    offsets: __m256i,
    mask: __m256,
) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_pd();
    let neg_one = _mm_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m128i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_pd();
    let neg_one = _mm256_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest mask bit is
/// not set, the corresponding value from `src` is returned instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
}

1321/// Returns values from `slice` at offsets determined by `offsets * scale`,
1322/// where
1323/// `scale` should be 1, 2, 4 or 8.
1324///
1325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1326#[inline]
1327#[target_feature(enable = "avx2")]
1328#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1329#[rustc_legacy_const_generics(2)]
1330#[stable(feature = "simd_x86", since = "1.27.0")]
1331pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1332    slice: *const i32,
1333    offsets: __m128i,
1334) -> __m128i {
1335    static_assert_imm8_scale!(SCALE);
1336    let zero = i32x4::ZERO;
1337    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1338    let offsets = offsets.as_i64x2();
1339    let slice = slice as *const i8;
1340    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1341    transmute(r)
1342}
1343
1344/// Returns values from `slice` at offsets determined by `offsets * scale`,
1345/// where
1346/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1347/// that position instead.
1348///
1349/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1350#[inline]
1351#[target_feature(enable = "avx2")]
1352#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1353#[rustc_legacy_const_generics(4)]
1354#[stable(feature = "simd_x86", since = "1.27.0")]
1355pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1356    src: __m128i,
1357    slice: *const i32,
1358    offsets: __m128i,
1359    mask: __m128i,
1360) -> __m128i {
1361    static_assert_imm8_scale!(SCALE);
1362    let src = src.as_i32x4();
1363    let mask = mask.as_i32x4();
1364    let offsets = offsets.as_i64x2();
1365    let slice = slice as *const i8;
1366    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1367    transmute(r)
1368}
1369
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1373///
1374/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1375#[inline]
1376#[target_feature(enable = "avx2")]
1377#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1378#[rustc_legacy_const_generics(2)]
1379#[stable(feature = "simd_x86", since = "1.27.0")]
1380pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1381    slice: *const i32,
1382    offsets: __m256i,
1383) -> __m128i {
1384    static_assert_imm8_scale!(SCALE);
1385    let zero = i32x4::ZERO;
1386    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1387    let offsets = offsets.as_i64x4();
1388    let slice = slice as *const i8;
1389    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1390    transmute(r)
1391}
1392
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest bit is
/// not set in `mask`, the value is taken from `src` instead of being loaded
/// from memory.
1397///
1398/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1399#[inline]
1400#[target_feature(enable = "avx2")]
1401#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1402#[rustc_legacy_const_generics(4)]
1403#[stable(feature = "simd_x86", since = "1.27.0")]
1404pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1405    src: __m128i,
1406    slice: *const i32,
1407    offsets: __m256i,
1408    mask: __m128i,
1409) -> __m128i {
1410    static_assert_imm8_scale!(SCALE);
1411    let src = src.as_i32x4();
1412    let mask = mask.as_i32x4();
1413    let offsets = offsets.as_i64x4();
1414    let slice = slice as *const i8;
1415    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1416    transmute(r)
1417}
1418
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1422///
1423/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1424#[inline]
1425#[target_feature(enable = "avx2")]
1426#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1427#[rustc_legacy_const_generics(2)]
1428#[stable(feature = "simd_x86", since = "1.27.0")]
1429pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1430    static_assert_imm8_scale!(SCALE);
1431    let zero = _mm_setzero_ps();
1432    let neg_one = _mm_set1_ps(-1.0);
1433    let offsets = offsets.as_i64x2();
1434    let slice = slice as *const i8;
1435    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1436}
1437
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest bit is
/// not set in `mask`, the value is taken from `src` instead of being loaded
/// from memory.
1442///
1443/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1444#[inline]
1445#[target_feature(enable = "avx2")]
1446#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1447#[rustc_legacy_const_generics(4)]
1448#[stable(feature = "simd_x86", since = "1.27.0")]
1449pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1450    src: __m128,
1451    slice: *const f32,
1452    offsets: __m128i,
1453    mask: __m128,
1454) -> __m128 {
1455    static_assert_imm8_scale!(SCALE);
1456    let offsets = offsets.as_i64x2();
1457    let slice = slice as *const i8;
1458    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1459}
1460
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1464///
1465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1466#[inline]
1467#[target_feature(enable = "avx2")]
1468#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1469#[rustc_legacy_const_generics(2)]
1470#[stable(feature = "simd_x86", since = "1.27.0")]
1471pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1472    static_assert_imm8_scale!(SCALE);
1473    let zero = _mm_setzero_ps();
1474    let neg_one = _mm_set1_ps(-1.0);
1475    let offsets = offsets.as_i64x4();
1476    let slice = slice as *const i8;
1477    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1478}
1479
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest bit is
/// not set in `mask`, the value is taken from `src` instead of being loaded
/// from memory.
1484///
1485/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1486#[inline]
1487#[target_feature(enable = "avx2")]
1488#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1489#[rustc_legacy_const_generics(4)]
1490#[stable(feature = "simd_x86", since = "1.27.0")]
1491pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1492    src: __m128,
1493    slice: *const f32,
1494    offsets: __m256i,
1495    mask: __m128,
1496) -> __m128 {
1497    static_assert_imm8_scale!(SCALE);
1498    let offsets = offsets.as_i64x4();
1499    let slice = slice as *const i8;
1500    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1501}
1502
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1506///
1507/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1508#[inline]
1509#[target_feature(enable = "avx2")]
1510#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1511#[rustc_legacy_const_generics(2)]
1512#[stable(feature = "simd_x86", since = "1.27.0")]
1513pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1514    slice: *const i64,
1515    offsets: __m128i,
1516) -> __m128i {
1517    static_assert_imm8_scale!(SCALE);
1518    let zero = i64x2::ZERO;
1519    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1520    let slice = slice as *const i8;
1521    let offsets = offsets.as_i64x2();
1522    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1523    transmute(r)
1524}
1525
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest bit is
/// not set in `mask`, the value is taken from `src` instead of being loaded
/// from memory.
1530///
1531/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1532#[inline]
1533#[target_feature(enable = "avx2")]
1534#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1535#[rustc_legacy_const_generics(4)]
1536#[stable(feature = "simd_x86", since = "1.27.0")]
1537pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1538    src: __m128i,
1539    slice: *const i64,
1540    offsets: __m128i,
1541    mask: __m128i,
1542) -> __m128i {
1543    static_assert_imm8_scale!(SCALE);
1544    let src = src.as_i64x2();
1545    let mask = mask.as_i64x2();
1546    let offsets = offsets.as_i64x2();
1547    let slice = slice as *const i8;
1548    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1549    transmute(r)
1550}
1551
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1555///
1556/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1557#[inline]
1558#[target_feature(enable = "avx2")]
1559#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1560#[rustc_legacy_const_generics(2)]
1561#[stable(feature = "simd_x86", since = "1.27.0")]
1562pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1563    slice: *const i64,
1564    offsets: __m256i,
1565) -> __m256i {
1566    static_assert_imm8_scale!(SCALE);
1567    let zero = i64x4::ZERO;
1568    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1569    let slice = slice as *const i8;
1570    let offsets = offsets.as_i64x4();
1571    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1572    transmute(r)
1573}
1574
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest bit is
/// not set in `mask`, the value is taken from `src` instead of being loaded
/// from memory.
1579///
1580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1581#[inline]
1582#[target_feature(enable = "avx2")]
1583#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1584#[rustc_legacy_const_generics(4)]
1585#[stable(feature = "simd_x86", since = "1.27.0")]
1586pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1587    src: __m256i,
1588    slice: *const i64,
1589    offsets: __m256i,
1590    mask: __m256i,
1591) -> __m256i {
1592    static_assert_imm8_scale!(SCALE);
1593    let src = src.as_i64x4();
1594    let mask = mask.as_i64x4();
1595    let offsets = offsets.as_i64x4();
1596    let slice = slice as *const i8;
1597    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1598    transmute(r)
1599}
1600
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1604///
1605/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1606#[inline]
1607#[target_feature(enable = "avx2")]
1608#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1609#[rustc_legacy_const_generics(2)]
1610#[stable(feature = "simd_x86", since = "1.27.0")]
1611pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1612    static_assert_imm8_scale!(SCALE);
1613    let zero = _mm_setzero_pd();
1614    let neg_one = _mm_set1_pd(-1.0);
1615    let slice = slice as *const i8;
1616    let offsets = offsets.as_i64x2();
1617    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1618}
1619
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest bit is
/// not set in `mask`, the value is taken from `src` instead of being loaded
/// from memory.
1624///
1625/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1626#[inline]
1627#[target_feature(enable = "avx2")]
1628#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1629#[rustc_legacy_const_generics(4)]
1630#[stable(feature = "simd_x86", since = "1.27.0")]
1631pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1632    src: __m128d,
1633    slice: *const f64,
1634    offsets: __m128i,
1635    mask: __m128d,
1636) -> __m128d {
1637    static_assert_imm8_scale!(SCALE);
1638    let slice = slice as *const i8;
1639    let offsets = offsets.as_i64x2();
1640    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1641}
1642
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1646///
1647/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1648#[inline]
1649#[target_feature(enable = "avx2")]
1650#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1651#[rustc_legacy_const_generics(2)]
1652#[stable(feature = "simd_x86", since = "1.27.0")]
1653pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1654    slice: *const f64,
1655    offsets: __m256i,
1656) -> __m256d {
1657    static_assert_imm8_scale!(SCALE);
1658    let zero = _mm256_setzero_pd();
1659    let neg_one = _mm256_set1_pd(-1.0);
1660    let slice = slice as *const i8;
1661    let offsets = offsets.as_i64x4();
1662    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1663}
1664
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For elements whose highest bit is
/// not set in `mask`, the value is taken from `src` instead of being loaded
/// from memory.
1669///
1670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1671#[inline]
1672#[target_feature(enable = "avx2")]
1673#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1674#[rustc_legacy_const_generics(4)]
1675#[stable(feature = "simd_x86", since = "1.27.0")]
1676pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1677    src: __m256d,
1678    slice: *const f64,
1679    offsets: __m256i,
1680    mask: __m256d,
1681) -> __m256d {
1682    static_assert_imm8_scale!(SCALE);
1683    let slice = slice as *const i8;
1684    let offsets = offsets.as_i64x4();
1685    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1686}
1687
/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at
/// the location specified by `IMM1`.
1690///
1691/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
1692#[inline]
1693#[target_feature(enable = "avx2")]
1694#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1695#[rustc_legacy_const_generics(2)]
1696#[stable(feature = "simd_x86", since = "1.27.0")]
1697pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1698    static_assert_uimm_bits!(IMM1, 1);
1699    unsafe {
1700        let a = a.as_i64x4();
1701        let b = _mm256_castsi128_si256(b).as_i64x4();
1702        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1703        transmute(dst)
1704    }
1705}
1706
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
/// of intermediate 32-bit integers and returns the results.
1710///
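/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(3);
/// let b = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
/// // Each 32-bit result is a[2i] * b[2i] + a[2i+1] * b[2i+1].
/// let r = _mm256_madd_epi16(a, b);
/// let e = _mm256_setr_epi32(9, 21, 33, 45, 57, 69, 81, 93);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///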
1711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
1712#[inline]
1713#[target_feature(enable = "avx2")]
1714#[cfg_attr(test, assert_instr(vpmaddwd))]
1715#[stable(feature = "simd_x86", since = "1.27.0")]
1716pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1717    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1718}
1719
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
/// signed 16-bit integers using signed saturation, and returns the results.
1724///
1725/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
1726#[inline]
1727#[target_feature(enable = "avx2")]
1728#[cfg_attr(test, assert_instr(vpmaddubsw))]
1729#[stable(feature = "simd_x86", since = "1.27.0")]
1730pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1731    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) }
1732}
1733
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1735/// (elements are zeroed out when the highest bit is not set in the
1736/// corresponding element).
1737///
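/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4];
/// // Only elements whose mask element has its highest bit set are loaded;
/// // the remaining elements are zeroed.
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
/// let e = _mm_setr_epi32(1, 0, 3, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xffff);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///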
1738/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
1739#[inline]
1740#[target_feature(enable = "avx2")]
1741#[cfg_attr(test, assert_instr(vpmaskmovd))]
1742#[stable(feature = "simd_x86", since = "1.27.0")]
1743pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1744    transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1745}
1746
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1748/// (elements are zeroed out when the highest bit is not set in the
1749/// corresponding element).
1750///
1751/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
1752#[inline]
1753#[target_feature(enable = "avx2")]
1754#[cfg_attr(test, assert_instr(vpmaskmovd))]
1755#[stable(feature = "simd_x86", since = "1.27.0")]
1756pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1757    transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
1758}
1759
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1761/// (elements are zeroed out when the highest bit is not set in the
1762/// corresponding element).
1763///
1764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1765#[inline]
1766#[target_feature(enable = "avx2")]
1767#[cfg_attr(test, assert_instr(vpmaskmovq))]
1768#[stable(feature = "simd_x86", since = "1.27.0")]
1769pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1770    transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
1771}
1772
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1774/// (elements are zeroed out when the highest bit is not set in the
1775/// corresponding element).
1776///
1777/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1778#[inline]
1779#[target_feature(enable = "avx2")]
1780#[cfg_attr(test, assert_instr(vpmaskmovq))]
1781#[stable(feature = "simd_x86", since = "1.27.0")]
1782pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1783    transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
1784}
1785
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1787/// using `mask` (elements are not stored when the highest bit is not set
1788/// in the corresponding element).
1789///
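/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let mut out = [0_i32; 4];
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// // Only elements whose mask element has its highest bit set are stored.
/// let mask = _mm_setr_epi32(-1, 0, 0, -1);
/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 0, 0, 4]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///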
1790/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
1791#[inline]
1792#[target_feature(enable = "avx2")]
1793#[cfg_attr(test, assert_instr(vpmaskmovd))]
1794#[stable(feature = "simd_x86", since = "1.27.0")]
1795pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1796    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
1797}
1798
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1800/// using `mask` (elements are not stored when the highest bit is not set
1801/// in the corresponding element).
1802///
1803/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
1804#[inline]
1805#[target_feature(enable = "avx2")]
1806#[cfg_attr(test, assert_instr(vpmaskmovd))]
1807#[stable(feature = "simd_x86", since = "1.27.0")]
1808pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1809    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
1810}
1811
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1813/// using `mask` (elements are not stored when the highest bit is not set
1814/// in the corresponding element).
1815///
1816/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1817#[inline]
1818#[target_feature(enable = "avx2")]
1819#[cfg_attr(test, assert_instr(vpmaskmovq))]
1820#[stable(feature = "simd_x86", since = "1.27.0")]
1821pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1822    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
1823}
1824
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1826/// using `mask` (elements are not stored when the highest bit is not set
1827/// in the corresponding element).
1828///
1829/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1830#[inline]
1831#[target_feature(enable = "avx2")]
1832#[cfg_attr(test, assert_instr(vpmaskmovq))]
1833#[stable(feature = "simd_x86", since = "1.27.0")]
1834pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1835    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
1836}
1837
1838/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1839/// maximum values.
1840///
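/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi16(0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15);
/// let b = _mm256_set1_epi16(1);
/// let r = _mm256_max_epi16(a, b);
/// let e = _mm256_setr_epi16(1, 1, 2, 1, 4, 1, 6, 1, 8, 1, 10, 1, 12, 1, 14, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///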
1841/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1842#[inline]
1843#[target_feature(enable = "avx2")]
1844#[cfg_attr(test, assert_instr(vpmaxsw))]
1845#[stable(feature = "simd_x86", since = "1.27.0")]
1846pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1847    unsafe {
1848        let a = a.as_i16x16();
1849        let b = b.as_i16x16();
1850        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1851    }
1852}
1853
1854/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1855/// maximum values.
1856///
1857/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1858#[inline]
1859#[target_feature(enable = "avx2")]
1860#[cfg_attr(test, assert_instr(vpmaxsd))]
1861#[stable(feature = "simd_x86", since = "1.27.0")]
1862pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1863    unsafe {
1864        let a = a.as_i32x8();
1865        let b = b.as_i32x8();
1866        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1867    }
1868}
1869
1870/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1871/// maximum values.
1872///
1873/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1874#[inline]
1875#[target_feature(enable = "avx2")]
1876#[cfg_attr(test, assert_instr(vpmaxsb))]
1877#[stable(feature = "simd_x86", since = "1.27.0")]
1878pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
1879    unsafe {
1880        let a = a.as_i8x32();
1881        let b = b.as_i8x32();
1882        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1883    }
1884}
1885
1886/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
1887/// the packed maximum values.
1888///
1889/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
1890#[inline]
1891#[target_feature(enable = "avx2")]
1892#[cfg_attr(test, assert_instr(vpmaxuw))]
1893#[stable(feature = "simd_x86", since = "1.27.0")]
1894pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
1895    unsafe {
1896        let a = a.as_u16x16();
1897        let b = b.as_u16x16();
1898        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1899    }
1900}
1901
1902/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
1903/// the packed maximum values.
1904///
1905/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
1906#[inline]
1907#[target_feature(enable = "avx2")]
1908#[cfg_attr(test, assert_instr(vpmaxud))]
1909#[stable(feature = "simd_x86", since = "1.27.0")]
1910pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
1911    unsafe {
1912        let a = a.as_u32x8();
1913        let b = b.as_u32x8();
1914        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1915    }
1916}
1917
1918/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
1919/// the packed maximum values.
1920///
1921/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
1922#[inline]
1923#[target_feature(enable = "avx2")]
1924#[cfg_attr(test, assert_instr(vpmaxub))]
1925#[stable(feature = "simd_x86", since = "1.27.0")]
1926pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
1927    unsafe {
1928        let a = a.as_u8x32();
1929        let b = b.as_u8x32();
1930        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1931    }
1932}
1933
1934/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1935/// minimum values.
1936///
1937/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
1938#[inline]
1939#[target_feature(enable = "avx2")]
1940#[cfg_attr(test, assert_instr(vpminsw))]
1941#[stable(feature = "simd_x86", since = "1.27.0")]
1942pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
1943    unsafe {
1944        let a = a.as_i16x16();
1945        let b = b.as_i16x16();
1946        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
1947    }
1948}
1949
1950/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1951/// minimum values.
1952///
1953/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
1954#[inline]
1955#[target_feature(enable = "avx2")]
1956#[cfg_attr(test, assert_instr(vpminsd))]
1957#[stable(feature = "simd_x86", since = "1.27.0")]
1958pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
1959    unsafe {
1960        let a = a.as_i32x8();
1961        let b = b.as_i32x8();
1962        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
1963    }
1964}
1965
1966/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1967/// minimum values.
1968///
1969/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
1970#[inline]
1971#[target_feature(enable = "avx2")]
1972#[cfg_attr(test, assert_instr(vpminsb))]
1973#[stable(feature = "simd_x86", since = "1.27.0")]
1974pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
1975    unsafe {
1976        let a = a.as_i8x32();
1977        let b = b.as_i8x32();
1978        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
1979    }
1980}
1981
1982/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
1983/// the packed minimum values.
1984///
1985/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
1986#[inline]
1987#[target_feature(enable = "avx2")]
1988#[cfg_attr(test, assert_instr(vpminuw))]
1989#[stable(feature = "simd_x86", since = "1.27.0")]
1990pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
1991    unsafe {
1992        let a = a.as_u16x16();
1993        let b = b.as_u16x16();
1994        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
1995    }
1996}
1997
1998/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
1999/// the packed minimum values.
2000///
2001/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2002#[inline]
2003#[target_feature(enable = "avx2")]
2004#[cfg_attr(test, assert_instr(vpminud))]
2005#[stable(feature = "simd_x86", since = "1.27.0")]
2006pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2007    unsafe {
2008        let a = a.as_u32x8();
2009        let b = b.as_u32x8();
2010        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2011    }
2012}
2013
2014/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2015/// the packed minimum values.
2016///
2017/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2018#[inline]
2019#[target_feature(enable = "avx2")]
2020#[cfg_attr(test, assert_instr(vpminub))]
2021#[stable(feature = "simd_x86", since = "1.27.0")]
2022pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2023    unsafe {
2024        let a = a.as_u8x32();
2025        let b = b.as_u8x32();
2026        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2027    }
2028}
2029
/// Creates a mask from the most significant bit of each 8-bit element in `a`,
/// and returns the result.
2032///
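/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // Only bytes 0 and 31 have their most significant bit set.
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
/// );
/// // Bit i of the result is the most significant bit of byte i of `a`.
/// assert_eq!(_mm256_movemask_epi8(a) as u32, 0x8000_0001);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///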
2033/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
2034#[inline]
2035#[target_feature(enable = "avx2")]
2036#[cfg_attr(test, assert_instr(vpmovmskb))]
2037#[stable(feature = "simd_x86", since = "1.27.0")]
2038pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2039    unsafe {
2040        let z = i8x32::ZERO;
2041        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2042        simd_bitmask::<_, u32>(m) as i32
2043    }
2044}
2045
2046/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2047/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2048/// results in dst. Eight SADs are performed for each 128-bit lane using one
2049/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at the offset specified in `imm8`. Eight
2051/// quadruplets are formed from sequential 8-bit integers selected from `a`
2052/// starting at the offset specified in `imm8`.
2053///
2054/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
2055#[inline]
2056#[target_feature(enable = "avx2")]
2057#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2058#[rustc_legacy_const_generics(2)]
2059#[stable(feature = "simd_x86", since = "1.27.0")]
2060pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2061    static_assert_uimm_bits!(IMM8, 8);
2062    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
2063}
2064
2065/// Multiplies the low 32-bit integers from each packed 64-bit element in
2066/// `a` and `b`
2067///
2068/// Returns the 64-bit results.
2069///
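/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // Only the low 32 bits of each 64-bit element (the even-indexed 32-bit
/// // values below) take part in the multiplication.
/// let a = _mm256_setr_epi32(-2, 99, 3, 99, 4, 99, -5, 99);
/// let b = _mm256_setr_epi32(3, 77, 5, 77, -6, 77, 7, 77);
/// let r = _mm256_mul_epi32(a, b);
/// let e = _mm256_setr_epi64x(-6, 15, -24, -35);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///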
2070/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
2071#[inline]
2072#[target_feature(enable = "avx2")]
2073#[cfg_attr(test, assert_instr(vpmuldq))]
2074#[stable(feature = "simd_x86", since = "1.27.0")]
2075pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2076    unsafe {
2077        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2078        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2079        transmute(simd_mul(a, b))
2080    }
2081}
2082
2083/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2084/// element in `a` and `b`
2085///
2086/// Returns the unsigned 64-bit results.
2087///
2088/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
2089#[inline]
2090#[target_feature(enable = "avx2")]
2091#[cfg_attr(test, assert_instr(vpmuludq))]
2092#[stable(feature = "simd_x86", since = "1.27.0")]
2093pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2094    unsafe {
2095        let a = a.as_u64x4();
2096        let b = b.as_u64x4();
2097        let mask = u64x4::splat(u32::MAX.into());
2098        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2099    }
2100}
2101
2102/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2103/// intermediate 32-bit integers and returning the high 16 bits of the
2104/// intermediate integers.
2105///
2106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
2107#[inline]
2108#[target_feature(enable = "avx2")]
2109#[cfg_attr(test, assert_instr(vpmulhw))]
2110#[stable(feature = "simd_x86", since = "1.27.0")]
2111pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2112    unsafe {
2113        let a = simd_cast::<_, i32x16>(a.as_i16x16());
2114        let b = simd_cast::<_, i32x16>(b.as_i16x16());
2115        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2116        transmute(simd_cast::<i32x16, i16x16>(r))
2117    }
2118}
2119
2120/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2121/// intermediate 32-bit integers and returning the high 16 bits of the
2122/// intermediate integers.
2123///
2124/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2125#[inline]
2126#[target_feature(enable = "avx2")]
2127#[cfg_attr(test, assert_instr(vpmulhuw))]
2128#[stable(feature = "simd_x86", since = "1.27.0")]
2129pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2130    unsafe {
2131        let a = simd_cast::<_, u32x16>(a.as_u16x16());
2132        let b = simd_cast::<_, u32x16>(b.as_u16x16());
2133        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2134        transmute(simd_cast::<u32x16, u16x16>(r))
2135    }
2136}
2137
2138/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2139/// intermediate 32-bit integers, and returns the low 16 bits of the
2140/// intermediate integers
2141///
2142/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2143#[inline]
2144#[target_feature(enable = "avx2")]
2145#[cfg_attr(test, assert_instr(vpmullw))]
2146#[stable(feature = "simd_x86", since = "1.27.0")]
2147pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2148    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2149}
2150
2151/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2152/// intermediate 64-bit integers, and returns the low 32 bits of the
2153/// intermediate integers
2154///
2155/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2156#[inline]
2157#[target_feature(enable = "avx2")]
2158#[cfg_attr(test, assert_instr(vpmulld))]
2159#[stable(feature = "simd_x86", since = "1.27.0")]
2160pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2161    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2162}
2163
2164/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncates each intermediate
/// integer to the 18 most significant bits, rounds by adding 1, and
/// returns bits `[16:1]`.
2168///
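/// A worked example with illustrative values (0.5 * 0.5 in Q15 fixed point):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // 16384 * 16384 = 0x1000_0000; keeping the 18 most significant bits
/// // gives 0x4000, adding 1 and dropping the lowest bit yields 8192.
/// let a = _mm256_set1_epi16(16384);
/// let r = _mm256_mulhrs_epi16(a, a);
/// let e = _mm256_set1_epi16(8192);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///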
2169/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
2170#[inline]
2171#[target_feature(enable = "avx2")]
2172#[cfg_attr(test, assert_instr(vpmulhrsw))]
2173#[stable(feature = "simd_x86", since = "1.27.0")]
2174pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2175    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
2176}
2177
2178/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2179/// and `b`
2180///
2181/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2182#[inline]
2183#[target_feature(enable = "avx2")]
2184#[cfg_attr(test, assert_instr(vorps))]
2185#[stable(feature = "simd_x86", since = "1.27.0")]
2186pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2187    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2188}
2189
2190/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2191/// using signed saturation
2192///
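/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(300);
/// let b = _mm256_set1_epi16(-300);
/// // 300 saturates to 127 and -300 saturates to -128; within each 128-bit
/// // lane the first 8 bytes come from `a` and the next 8 bytes from `b`.
/// let r = _mm256_packs_epi16(a, b);
/// let q0 = 0x7f7f_7f7f_7f7f_7f7fu64 as i64;
/// let q1 = 0x8080_8080_8080_8080u64 as i64;
/// let e = _mm256_setr_epi64x(q0, q1, q0, q1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///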
2193/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
2194#[inline]
2195#[target_feature(enable = "avx2")]
2196#[cfg_attr(test, assert_instr(vpacksswb))]
2197#[stable(feature = "simd_x86", since = "1.27.0")]
2198pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2199    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
2200}
2201
2202/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2203/// using signed saturation
2204///
2205/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2206#[inline]
2207#[target_feature(enable = "avx2")]
2208#[cfg_attr(test, assert_instr(vpackssdw))]
2209#[stable(feature = "simd_x86", since = "1.27.0")]
2210pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2211    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
2212}
2213
2214/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2215/// using unsigned saturation
2216///
2217/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2218#[inline]
2219#[target_feature(enable = "avx2")]
2220#[cfg_attr(test, assert_instr(vpackuswb))]
2221#[stable(feature = "simd_x86", since = "1.27.0")]
2222pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2223    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
2224}
2225
2226/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2227/// using unsigned saturation
2228///
2229/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2230#[inline]
2231#[target_feature(enable = "avx2")]
2232#[cfg_attr(test, assert_instr(vpackusdw))]
2233#[stable(feature = "simd_x86", since = "1.27.0")]
2234pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2235    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
2236}
2237
2238/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2239///
/// The lowest 3 bits of each integer of `b` are used as addresses into the 8
/// integers of `a`.
2242///
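/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
/// // Each index selects one of the eight elements of `a`, across lanes.
/// let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
/// let r = _mm256_permutevar8x32_epi32(a, idx);
/// let e = _mm256_setr_epi32(17, 16, 15, 14, 13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///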
2243/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
2244#[inline]
2245#[target_feature(enable = "avx2")]
2246#[cfg_attr(test, assert_instr(vpermps))]
2247#[stable(feature = "simd_x86", since = "1.27.0")]
2248pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2249    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
2250}
2251
2252/// Permutes 64-bit integers from `a` using control mask `imm8`.
2253///
2254/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
2255#[inline]
2256#[target_feature(enable = "avx2")]
2257#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2258#[rustc_legacy_const_generics(1)]
2259#[stable(feature = "simd_x86", since = "1.27.0")]
2260pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2261    static_assert_uimm_bits!(IMM8, 8);
2262    unsafe {
2263        let zero = i64x4::ZERO;
2264        let r: i64x4 = simd_shuffle!(
2265            a.as_i64x4(),
2266            zero,
2267            [
2268                IMM8 as u32 & 0b11,
2269                (IMM8 as u32 >> 2) & 0b11,
2270                (IMM8 as u32 >> 4) & 0b11,
2271                (IMM8 as u32 >> 6) & 0b11,
2272            ],
2273        );
2274        transmute(r)
2275    }
2276}
2277
/// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2279///
2280/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2281#[inline]
2282#[target_feature(enable = "avx2")]
2283#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2284#[rustc_legacy_const_generics(2)]
2285#[stable(feature = "simd_x86", since = "1.27.0")]
2286pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2287    static_assert_uimm_bits!(IMM8, 8);
2288    unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) }
2289}
2290
2291/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2292/// control in `imm8`.
2293///
2294/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2295#[inline]
2296#[target_feature(enable = "avx2")]
2297#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2298#[rustc_legacy_const_generics(1)]
2299#[stable(feature = "simd_x86", since = "1.27.0")]
2300pub fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2301    static_assert_uimm_bits!(IMM8, 8);
2302    unsafe {
2303        simd_shuffle!(
2304            a,
2305            _mm256_undefined_pd(),
2306            [
2307                IMM8 as u32 & 0b11,
2308                (IMM8 as u32 >> 2) & 0b11,
2309                (IMM8 as u32 >> 4) & 0b11,
2310                (IMM8 as u32 >> 6) & 0b11,
2311            ],
2312        )
2313    }
2314}
2315
2316/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2317/// the corresponding 32-bit integer index in `idx`.
2318///
2319/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2320#[inline]
2321#[target_feature(enable = "avx2")]
2322#[cfg_attr(test, assert_instr(vpermps))]
2323#[stable(feature = "simd_x86", since = "1.27.0")]
2324pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2325    unsafe { permps(a, idx.as_i32x8()) }
2326}
2327
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive group of 8 differences to
/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers in the low 16 bits of the four 64-bit elements of the result.
2332///
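/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(3);
/// let b = _mm256_setzero_si256();
/// // Each group of eight absolute differences sums to 8 * 3 = 24.
/// let r = _mm256_sad_epu8(a, b);
/// let e = _mm256_set1_epi64x(24);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///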
2333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
2334#[inline]
2335#[target_feature(enable = "avx2")]
2336#[cfg_attr(test, assert_instr(vpsadbw))]
2337#[stable(feature = "simd_x86", since = "1.27.0")]
2338pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2339    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
2340}
2341
2342/// Shuffles bytes from `a` according to the content of `b`.
2343///
/// For each of the 128-bit low and high halves of the vectors, the low
/// 4 bits of each byte of `b` are used as addresses into the respective
/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
///
/// In addition, if the most significant bit of a byte of `b` is set, the
/// respective destination byte is set to 0.
2350///
2351/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2352/// equivalent to:
2353///
2354/// ```
2355/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2356///     let mut r = [0; 32];
2357///     for i in 0..16 {
2358///         // if the most significant bit of b is set,
2359///         // then the destination byte is set to 0.
2360///         if b[i] & 0x80 == 0u8 {
2361///             r[i] = a[(b[i] % 16) as usize];
2362///         }
2363///         if b[i + 16] & 0x80 == 0u8 {
2364///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2365///         }
2366///     }
2367///     r
2368/// }
2369/// ```
2370///
2371/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2372#[inline]
2373#[target_feature(enable = "avx2")]
2374#[cfg_attr(test, assert_instr(vpshufb))]
2375#[stable(feature = "simd_x86", since = "1.27.0")]
2376pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2377    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
2378}
2379
2380/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2381/// `imm8`.
2382///
2383/// ```rust
2384/// #[cfg(target_arch = "x86")]
2385/// use std::arch::x86::*;
2386/// #[cfg(target_arch = "x86_64")]
2387/// use std::arch::x86_64::*;
2388///
2389/// # fn main() {
2390/// #     if is_x86_feature_detected!("avx2") {
2391/// #         #[target_feature(enable = "avx2")]
2392/// #         unsafe fn worker() {
2393/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2394///
2395/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2396/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2397///
2398/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2399/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2400///
2401/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2402/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2403/// #         }
2404/// #         unsafe { worker(); }
2405/// #     }
2406/// # }
2407/// ```
2408///
2409/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2410#[inline]
2411#[target_feature(enable = "avx2")]
2412#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2413#[rustc_legacy_const_generics(1)]
2414#[stable(feature = "simd_x86", since = "1.27.0")]
2415pub fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2416    static_assert_uimm_bits!(MASK, 8);
2417    unsafe {
2418        let r: i32x8 = simd_shuffle!(
2419            a.as_i32x8(),
2420            a.as_i32x8(),
2421            [
2422                MASK as u32 & 0b11,
2423                (MASK as u32 >> 2) & 0b11,
2424                (MASK as u32 >> 4) & 0b11,
2425                (MASK as u32 >> 6) & 0b11,
2426                (MASK as u32 & 0b11) + 4,
2427                ((MASK as u32 >> 2) & 0b11) + 4,
2428                ((MASK as u32 >> 4) & 0b11) + 4,
2429                ((MASK as u32 >> 6) & 0b11) + 4,
2430            ],
2431        );
2432        transmute(r)
2433    }
2434}
2435
2436/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2437/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2438/// to the output.
2439///
2440/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2441#[inline]
2442#[target_feature(enable = "avx2")]
2443#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2444#[rustc_legacy_const_generics(1)]
2445#[stable(feature = "simd_x86", since = "1.27.0")]
2446pub fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2447    static_assert_uimm_bits!(IMM8, 8);
2448    unsafe {
2449        let a = a.as_i16x16();
2450        let r: i16x16 = simd_shuffle!(
2451            a,
2452            a,
2453            [
2454                0,
2455                1,
2456                2,
2457                3,
2458                4 + (IMM8 as u32 & 0b11),
2459                4 + ((IMM8 as u32 >> 2) & 0b11),
2460                4 + ((IMM8 as u32 >> 4) & 0b11),
2461                4 + ((IMM8 as u32 >> 6) & 0b11),
2462                8,
2463                9,
2464                10,
2465                11,
2466                12 + (IMM8 as u32 & 0b11),
2467                12 + ((IMM8 as u32 >> 2) & 0b11),
2468                12 + ((IMM8 as u32 >> 4) & 0b11),
2469                12 + ((IMM8 as u32 >> 6) & 0b11),
2470            ],
2471        );
2472        transmute(r)
2473    }
2474}
2475
2476/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2477/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2478/// to the output.
2479///
2480/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2481#[inline]
2482#[target_feature(enable = "avx2")]
2483#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2484#[rustc_legacy_const_generics(1)]
2485#[stable(feature = "simd_x86", since = "1.27.0")]
2486pub fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2487    static_assert_uimm_bits!(IMM8, 8);
2488    unsafe {
2489        let a = a.as_i16x16();
2490        let r: i16x16 = simd_shuffle!(
2491            a,
2492            a,
2493            [
2494                0 + (IMM8 as u32 & 0b11),
2495                0 + ((IMM8 as u32 >> 2) & 0b11),
2496                0 + ((IMM8 as u32 >> 4) & 0b11),
2497                0 + ((IMM8 as u32 >> 6) & 0b11),
2498                4,
2499                5,
2500                6,
2501                7,
2502                8 + (IMM8 as u32 & 0b11),
2503                8 + ((IMM8 as u32 >> 2) & 0b11),
2504                8 + ((IMM8 as u32 >> 4) & 0b11),
2505                8 + ((IMM8 as u32 >> 6) & 0b11),
2506                12,
2507                13,
2508                14,
2509                15,
2510            ],
2511        );
2512        transmute(r)
2513    }
2514}
2515
2516/// Negates packed 16-bit integers in `a` when the corresponding signed
2517/// 16-bit integer in `b` is negative, and returns the results.
2518/// Results are zeroed out when the corresponding element in `b` is zero.
2519///
2520/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
2521#[inline]
2522#[target_feature(enable = "avx2")]
2523#[cfg_attr(test, assert_instr(vpsignw))]
2524#[stable(feature = "simd_x86", since = "1.27.0")]
2525pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2526    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
2527}
2528
2529/// Negates packed 32-bit integers in `a` when the corresponding signed
2530/// 32-bit integer in `b` is negative, and returns the results.
2531/// Results are zeroed out when the corresponding element in `b` is zero.
2532///
2533/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2534#[inline]
2535#[target_feature(enable = "avx2")]
2536#[cfg_attr(test, assert_instr(vpsignd))]
2537#[stable(feature = "simd_x86", since = "1.27.0")]
2538pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2539    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
2540}
2541
2542/// Negates packed 8-bit integers in `a` when the corresponding signed
2543/// 8-bit integer in `b` is negative, and returns the results.
2544/// Results are zeroed out when the corresponding element in `b` is zero.
2545///
2546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2547#[inline]
2548#[target_feature(enable = "avx2")]
2549#[cfg_attr(test, assert_instr(vpsignb))]
2550#[stable(feature = "simd_x86", since = "1.27.0")]
2551pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2552    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
2553}
2554
2555/// Shifts packed 16-bit integers in `a` left by `count` while
2556/// shifting in zeros, and returns the result
2557///
2558/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
2559#[inline]
2560#[target_feature(enable = "avx2")]
2561#[cfg_attr(test, assert_instr(vpsllw))]
2562#[stable(feature = "simd_x86", since = "1.27.0")]
2563pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2564    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
2565}
2566
2567/// Shifts packed 32-bit integers in `a` left by `count` while
2568/// shifting in zeros, and returns the result
2569///
2570/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2571#[inline]
2572#[target_feature(enable = "avx2")]
2573#[cfg_attr(test, assert_instr(vpslld))]
2574#[stable(feature = "simd_x86", since = "1.27.0")]
2575pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2576    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
2577}
2578
2579/// Shifts packed 64-bit integers in `a` left by `count` while
2580/// shifting in zeros, and returns the result
2581///
2582/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2583#[inline]
2584#[target_feature(enable = "avx2")]
2585#[cfg_attr(test, assert_instr(vpsllq))]
2586#[stable(feature = "simd_x86", since = "1.27.0")]
2587pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2588    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
2589}
2590
2591/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2593///
2594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2595#[inline]
2596#[target_feature(enable = "avx2")]
2597#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2598#[rustc_legacy_const_generics(1)]
2599#[stable(feature = "simd_x86", since = "1.27.0")]
2600pub fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2601    static_assert_uimm_bits!(IMM8, 8);
2602    unsafe {
2603        if IMM8 >= 16 {
2604            _mm256_setzero_si256()
2605        } else {
2606            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2607        }
2608    }
2609}
2610
2611/// Shifts packed 32-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2613///
2614/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2615#[inline]
2616#[target_feature(enable = "avx2")]
2617#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2618#[rustc_legacy_const_generics(1)]
2619#[stable(feature = "simd_x86", since = "1.27.0")]
2620pub fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
2623        if IMM8 >= 32 {
2624            _mm256_setzero_si256()
2625        } else {
2626            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2627        }
2628    }
2629}
2630
2631/// Shifts packed 64-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2633///
2634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2635#[inline]
2636#[target_feature(enable = "avx2")]
2637#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2638#[rustc_legacy_const_generics(1)]
2639#[stable(feature = "simd_x86", since = "1.27.0")]
2640pub fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
2643        if IMM8 >= 64 {
2644            _mm256_setzero_si256()
2645        } else {
2646            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2647        }
2648    }
2649}
2650
2651/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2652///
2653/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2654#[inline]
2655#[target_feature(enable = "avx2")]
2656#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2657#[rustc_legacy_const_generics(1)]
2658#[stable(feature = "simd_x86", since = "1.27.0")]
2659pub fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2660    static_assert_uimm_bits!(IMM8, 8);
2661    _mm256_bslli_epi128::<IMM8>(a)
2662}
2663
2664/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2665///
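/// A minimal usage sketch (illustrative values):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(1);
/// // Each 128-bit lane is shifted independently, so one zero byte is
/// // shifted in at the low end of every lane.
/// let r = _mm256_bslli_epi128::<1>(a);
/// let q0 = 0x0101_0101_0101_0100u64 as i64;
/// let q1 = 0x0101_0101_0101_0101u64 as i64;
/// let e = _mm256_setr_epi64x(q0, q1, q0, q1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///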
2666/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2667#[inline]
2668#[target_feature(enable = "avx2")]
2669#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2670#[rustc_legacy_const_generics(1)]
2671#[stable(feature = "simd_x86", since = "1.27.0")]
2672pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2673    static_assert_uimm_bits!(IMM8, 8);
2674    const fn mask(shift: i32, i: u32) -> u32 {
2675        let shift = shift as u32 & 0xff;
2676        if shift > 15 || i % 16 < shift {
2677            0
2678        } else {
2679            32 + (i - shift)
2680        }
2681    }
2682    unsafe {
2683        let a = a.as_i8x32();
2684        let r: i8x32 = simd_shuffle!(
2685            i8x32::ZERO,
2686            a,
2687            [
2688                mask(IMM8, 0),
2689                mask(IMM8, 1),
2690                mask(IMM8, 2),
2691                mask(IMM8, 3),
2692                mask(IMM8, 4),
2693                mask(IMM8, 5),
2694                mask(IMM8, 6),
2695                mask(IMM8, 7),
2696                mask(IMM8, 8),
2697                mask(IMM8, 9),
2698                mask(IMM8, 10),
2699                mask(IMM8, 11),
2700                mask(IMM8, 12),
2701                mask(IMM8, 13),
2702                mask(IMM8, 14),
2703                mask(IMM8, 15),
2704                mask(IMM8, 16),
2705                mask(IMM8, 17),
2706                mask(IMM8, 18),
2707                mask(IMM8, 19),
2708                mask(IMM8, 20),
2709                mask(IMM8, 21),
2710                mask(IMM8, 22),
2711                mask(IMM8, 23),
2712                mask(IMM8, 24),
2713                mask(IMM8, 25),
2714                mask(IMM8, 26),
2715                mask(IMM8, 27),
2716                mask(IMM8, 28),
2717                mask(IMM8, 29),
2718                mask(IMM8, 30),
2719                mask(IMM8, 31),
2720            ],
2721        );
2722        transmute(r)
2723    }
2724}
2725
2726/// Shifts packed 32-bit integers in `a` left by the amount
2727/// specified by the corresponding element in `count` while
2728/// shifting in zeros, and returns the result.
2729///
2730/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2731#[inline]
2732#[target_feature(enable = "avx2")]
2733#[cfg_attr(test, assert_instr(vpsllvd))]
2734#[stable(feature = "simd_x86", since = "1.27.0")]
2735pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2736    unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
2737}
2738
2739/// Shifts packed 32-bit integers in `a` left by the amount
2740/// specified by the corresponding element in `count` while
2741/// shifting in zeros, and returns the result.
2742///
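/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module). Each element gets its own
/// shift amount:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let r = _mm256_sllv_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///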
2743/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2744#[inline]
2745#[target_feature(enable = "avx2")]
2746#[cfg_attr(test, assert_instr(vpsllvd))]
2747#[stable(feature = "simd_x86", since = "1.27.0")]
2748pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2749    unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
2750}
2751
2752/// Shifts packed 64-bit integers in `a` left by the amount
2753/// specified by the corresponding element in `count` while
2754/// shifting in zeros, and returns the result.
2755///
2756/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2757#[inline]
2758#[target_feature(enable = "avx2")]
2759#[cfg_attr(test, assert_instr(vpsllvq))]
2760#[stable(feature = "simd_x86", since = "1.27.0")]
2761pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2762    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
2763}
2764
2765/// Shifts packed 64-bit integers in `a` left by the amount
2766/// specified by the corresponding element in `count` while
2767/// shifting in zeros, and returns the result.
2768///
2769/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2770#[inline]
2771#[target_feature(enable = "avx2")]
2772#[cfg_attr(test, assert_instr(vpsllvq))]
2773#[stable(feature = "simd_x86", since = "1.27.0")]
2774pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2775    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
2776}
2777
2778/// Shifts packed 16-bit integers in `a` right by `count` while
2779/// shifting in sign bits.
2780///
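/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime). The shift amount comes from the low 64 bits of `count`:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(-8);
/// // The shift amount is taken from the low 64 bits of `count`.
/// let count = _mm_setr_epi64x(2, 0);
///
/// let r = _mm256_sra_epi16(a, count);
///
/// // The sign bit is replicated, so -8 >> 2 == -2.
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi16(-2))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///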
2781/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2782#[inline]
2783#[target_feature(enable = "avx2")]
2784#[cfg_attr(test, assert_instr(vpsraw))]
2785#[stable(feature = "simd_x86", since = "1.27.0")]
2786pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2787    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
2788}
2789
2790/// Shifts packed 32-bit integers in `a` right by `count` while
2791/// shifting in sign bits.
2792///
2793/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2794#[inline]
2795#[target_feature(enable = "avx2")]
2796#[cfg_attr(test, assert_instr(vpsrad))]
2797#[stable(feature = "simd_x86", since = "1.27.0")]
2798pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2799    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2800}
2801
2802/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2803/// shifting in sign bits.
2804///
2805/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2806#[inline]
2807#[target_feature(enable = "avx2")]
2808#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2809#[rustc_legacy_const_generics(1)]
2810#[stable(feature = "simd_x86", since = "1.27.0")]
2811pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2812    static_assert_uimm_bits!(IMM8, 8);
2813    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2814}
2815
2816/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2817/// shifting in sign bits.
2818///
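/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-32, -16, -8, -4, 4, 8, 16, 32);
///
/// // Arithmetic shift: the sign bit is shifted in from the left.
/// let r = _mm256_srai_epi32::<2>(a);
///
/// let expected = _mm256_setr_epi32(-8, -4, -2, -1, 1, 2, 4, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///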
2819/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2820#[inline]
2821#[target_feature(enable = "avx2")]
2822#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2823#[rustc_legacy_const_generics(1)]
2824#[stable(feature = "simd_x86", since = "1.27.0")]
2825pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2826    static_assert_uimm_bits!(IMM8, 8);
2827    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2828}
2829
2830/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2831/// corresponding element in `count` while shifting in sign bits.
2832///
2833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2834#[inline]
2835#[target_feature(enable = "avx2")]
2836#[cfg_attr(test, assert_instr(vpsravd))]
2837#[stable(feature = "simd_x86", since = "1.27.0")]
2838pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2839    unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
2840}
2841
2842/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2843/// corresponding element in `count` while shifting in sign bits.
2844///
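/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime). Each element is shifted by its own count, with sign bits shifted in:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(-64);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let r = _mm256_srav_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(-64, -32, -16, -8, -4, -2, -1, -1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///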
2845/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2846#[inline]
2847#[target_feature(enable = "avx2")]
2848#[cfg_attr(test, assert_instr(vpsravd))]
2849#[stable(feature = "simd_x86", since = "1.27.0")]
2850pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2851    unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
2852}
2853
2854/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2855///
2856/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
2857#[inline]
2858#[target_feature(enable = "avx2")]
2859#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2860#[rustc_legacy_const_generics(1)]
2861#[stable(feature = "simd_x86", since = "1.27.0")]
2862pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2863    static_assert_uimm_bits!(IMM8, 8);
2864    _mm256_bsrli_epi128::<IMM8>(a)
2865}
2866
2867/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2868///
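/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(1);
///
/// // Shift every 128-bit lane right by 15 bytes; only the lowest byte
/// // of each lane keeps a value.
/// let r = _mm256_bsrli_epi128::<15>(a);
///
/// let expected = _mm256_setr_epi8(
///     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///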
2869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
2870#[inline]
2871#[target_feature(enable = "avx2")]
2872#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2873#[rustc_legacy_const_generics(1)]
2874#[stable(feature = "simd_x86", since = "1.27.0")]
2875pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2876    static_assert_uimm_bits!(IMM8, 8);
2877    const fn mask(shift: i32, i: u32) -> u32 {
2878        let shift = shift as u32 & 0xff;
2879        if shift > 15 || (15 - (i % 16)) < shift {
2880            0
2881        } else {
2882            32 + (i + shift)
2883        }
2884    }
2885    unsafe {
2886        let a = a.as_i8x32();
2887        let r: i8x32 = simd_shuffle!(
2888            i8x32::ZERO,
2889            a,
2890            [
2891                mask(IMM8, 0),
2892                mask(IMM8, 1),
2893                mask(IMM8, 2),
2894                mask(IMM8, 3),
2895                mask(IMM8, 4),
2896                mask(IMM8, 5),
2897                mask(IMM8, 6),
2898                mask(IMM8, 7),
2899                mask(IMM8, 8),
2900                mask(IMM8, 9),
2901                mask(IMM8, 10),
2902                mask(IMM8, 11),
2903                mask(IMM8, 12),
2904                mask(IMM8, 13),
2905                mask(IMM8, 14),
2906                mask(IMM8, 15),
2907                mask(IMM8, 16),
2908                mask(IMM8, 17),
2909                mask(IMM8, 18),
2910                mask(IMM8, 19),
2911                mask(IMM8, 20),
2912                mask(IMM8, 21),
2913                mask(IMM8, 22),
2914                mask(IMM8, 23),
2915                mask(IMM8, 24),
2916                mask(IMM8, 25),
2917                mask(IMM8, 26),
2918                mask(IMM8, 27),
2919                mask(IMM8, 28),
2920                mask(IMM8, 29),
2921                mask(IMM8, 30),
2922                mask(IMM8, 31),
2923            ],
2924        );
2925        transmute(r)
2926    }
2927}
2928
2929/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
2930/// zeros.
2931///
2932/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
2933#[inline]
2934#[target_feature(enable = "avx2")]
2935#[cfg_attr(test, assert_instr(vpsrlw))]
2936#[stable(feature = "simd_x86", since = "1.27.0")]
2937pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
2938    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
2939}
2940
2941/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
2942/// zeros.
2943///
2944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
2945#[inline]
2946#[target_feature(enable = "avx2")]
2947#[cfg_attr(test, assert_instr(vpsrld))]
2948#[stable(feature = "simd_x86", since = "1.27.0")]
2949pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
2950    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
2951}
2952
2953/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
2954/// zeros.
2955///
2956/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
2957#[inline]
2958#[target_feature(enable = "avx2")]
2959#[cfg_attr(test, assert_instr(vpsrlq))]
2960#[stable(feature = "simd_x86", since = "1.27.0")]
2961pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
2962    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
2963}
2964
2965/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
2966/// zeros.
2967///
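/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(-1); // 0xFFFF in every element
///
/// // Logical shift: zeros are shifted in, so the result is 0x00FF.
/// let r = _mm256_srli_epi16::<8>(a);
///
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi16(0x00FF))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///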
2968/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
2969#[inline]
2970#[target_feature(enable = "avx2")]
2971#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
2972#[rustc_legacy_const_generics(1)]
2973#[stable(feature = "simd_x86", since = "1.27.0")]
2974pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2975    static_assert_uimm_bits!(IMM8, 8);
2976    unsafe {
2977        if IMM8 >= 16 {
2978            _mm256_setzero_si256()
2979        } else {
2980            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2981        }
2982    }
2983}
2984
2985/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
2986/// zeros.
2987///
2988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
2989#[inline]
2990#[target_feature(enable = "avx2")]
2991#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
2992#[rustc_legacy_const_generics(1)]
2993#[stable(feature = "simd_x86", since = "1.27.0")]
2994pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2995    static_assert_uimm_bits!(IMM8, 8);
2996    unsafe {
2997        if IMM8 >= 32 {
2998            _mm256_setzero_si256()
2999        } else {
3000            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3001        }
3002    }
3003}
3004
3005/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3006/// zeros.
3007///
3008/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3009#[inline]
3010#[target_feature(enable = "avx2")]
3011#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3012#[rustc_legacy_const_generics(1)]
3013#[stable(feature = "simd_x86", since = "1.27.0")]
3014pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3015    static_assert_uimm_bits!(IMM8, 8);
3016    unsafe {
3017        if IMM8 >= 64 {
3018            _mm256_setzero_si256()
3019        } else {
3020            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3021        }
3022    }
3023}
3024
3025/// Shifts packed 32-bit integers in `a` right by the amount specified by
3026/// the corresponding element in `count` while shifting in zeros.
3027///
3028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3029#[inline]
3030#[target_feature(enable = "avx2")]
3031#[cfg_attr(test, assert_instr(vpsrlvd))]
3032#[stable(feature = "simd_x86", since = "1.27.0")]
3033pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3034    unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
3035}
3036
3037/// Shifts packed 32-bit integers in `a` right by the amount specified by
3038/// the corresponding element in `count` while shifting in zeros.
3039///
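/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(256);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let r = _mm256_srlv_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(256, 128, 64, 32, 16, 8, 4, 2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///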
3040/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3041#[inline]
3042#[target_feature(enable = "avx2")]
3043#[cfg_attr(test, assert_instr(vpsrlvd))]
3044#[stable(feature = "simd_x86", since = "1.27.0")]
3045pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3046    unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
3047}
3048
3049/// Shifts packed 64-bit integers in `a` right by the amount specified by
3050/// the corresponding element in `count` while shifting in zeros.
3051///
3052/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3053#[inline]
3054#[target_feature(enable = "avx2")]
3055#[cfg_attr(test, assert_instr(vpsrlvq))]
3056#[stable(feature = "simd_x86", since = "1.27.0")]
3057pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3058    unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
3059}
3060
3061/// Shifts packed 64-bit integers in `a` right by the amount specified by
3062/// the corresponding element in `count` while shifting in zeros.
3063///
3064/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3065#[inline]
3066#[target_feature(enable = "avx2")]
3067#[cfg_attr(test, assert_instr(vpsrlvq))]
3068#[stable(feature = "simd_x86", since = "1.27.0")]
3069pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3070    unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
3071}
3072
3073/// Loads 256 bits of integer data from memory into the returned vector using a non-temporal
3074/// memory hint. `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception
3075/// may be generated. To minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
3076///
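/// A short illustrative sketch (assumes `avx2` is detected at runtime; a
/// `__m256i` local is used as the source because its type already guarantees
/// the required 32-byte alignment):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// // `__m256i` is 32-byte aligned, so `&a` satisfies the alignment requirement.
/// let r = unsafe { _mm256_stream_load_si256(&a) };
///
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, a)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///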
3077/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3078#[inline]
3079#[target_feature(enable = "avx2")]
3080#[cfg_attr(test, assert_instr(vmovntdqa))]
3081#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3082pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3083    let dst: __m256i;
3084    crate::arch::asm!(
3085        vpl!("vmovntdqa {a}"),
3086        a = out(ymm_reg) dst,
3087        p = in(reg) mem_addr,
3088        options(pure, readonly, nostack, preserves_flags),
3089    );
3090    dst
3091}
3092
3093/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3094///
3095/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3096#[inline]
3097#[target_feature(enable = "avx2")]
3098#[cfg_attr(test, assert_instr(vpsubw))]
3099#[stable(feature = "simd_x86", since = "1.27.0")]
3100pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3101    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3102}
3103
3104/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3105///
3106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3107#[inline]
3108#[target_feature(enable = "avx2")]
3109#[cfg_attr(test, assert_instr(vpsubd))]
3110#[stable(feature = "simd_x86", since = "1.27.0")]
3111pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3112    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3113}
3114
3115/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3116///
3117/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3118#[inline]
3119#[target_feature(enable = "avx2")]
3120#[cfg_attr(test, assert_instr(vpsubq))]
3121#[stable(feature = "simd_x86", since = "1.27.0")]
3122pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3123    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3124}
3125
3126/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3127///
3128/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3129#[inline]
3130#[target_feature(enable = "avx2")]
3131#[cfg_attr(test, assert_instr(vpsubb))]
3132#[stable(feature = "simd_x86", since = "1.27.0")]
3133pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3134    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3135}
3136
3137/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3138/// `a` using saturation.
3139///
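/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(i16::MIN);
/// let b = _mm256_set1_epi16(1);
///
/// // Signed saturation clamps at i16::MIN instead of wrapping around.
/// let r = _mm256_subs_epi16(a, b);
///
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, a)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///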
3140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3141#[inline]
3142#[target_feature(enable = "avx2")]
3143#[cfg_attr(test, assert_instr(vpsubsw))]
3144#[stable(feature = "simd_x86", since = "1.27.0")]
3145pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3146    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3147}
3148
3149/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3150/// `a` using saturation.
3151///
3152/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3153#[inline]
3154#[target_feature(enable = "avx2")]
3155#[cfg_attr(test, assert_instr(vpsubsb))]
3156#[stable(feature = "simd_x86", since = "1.27.0")]
3157pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3158    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3159}
3160
3161/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3162/// 16-bit integers in `a` using saturation.
3163///
3164/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3165#[inline]
3166#[target_feature(enable = "avx2")]
3167#[cfg_attr(test, assert_instr(vpsubusw))]
3168#[stable(feature = "simd_x86", since = "1.27.0")]
3169pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3170    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3171}
3172
3173/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3174/// 8-bit integers in `a` using saturation.
3175///
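/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(100);
///
/// // Unsigned saturation clamps the result at zero instead of wrapping.
/// let r = _mm256_subs_epu8(a, b);
///
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi8(0))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///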
3176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3177#[inline]
3178#[target_feature(enable = "avx2")]
3179#[cfg_attr(test, assert_instr(vpsubusb))]
3180#[stable(feature = "simd_x86", since = "1.27.0")]
3181pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3182    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3183}
3184
3185/// Unpacks and interleaves 8-bit integers from the high half of each
3186/// 128-bit lane of `a` and `b`.
3187///
3188/// ```rust
3189/// #[cfg(target_arch = "x86")]
3190/// use std::arch::x86::*;
3191/// #[cfg(target_arch = "x86_64")]
3192/// use std::arch::x86_64::*;
3193///
3194/// # fn main() {
3195/// #     if is_x86_feature_detected!("avx2") {
3196/// #         #[target_feature(enable = "avx2")]
3197/// #         unsafe fn worker() {
3198/// let a = _mm256_setr_epi8(
3199///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3200///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3201/// );
3202/// let b = _mm256_setr_epi8(
3203///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3204///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3205///     -30, -31,
3206/// );
3207///
3208/// let c = _mm256_unpackhi_epi8(a, b);
3209///
3210/// let expected = _mm256_setr_epi8(
3211///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3212///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3213///     -31,
3214/// );
3215/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3216///
3217/// #         }
3218/// #         unsafe { worker(); }
3219/// #     }
3220/// # }
3221/// ```
3222///
3223/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3224#[inline]
3225#[target_feature(enable = "avx2")]
3226#[cfg_attr(test, assert_instr(vpunpckhbw))]
3227#[stable(feature = "simd_x86", since = "1.27.0")]
3228pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3229    unsafe {
3230        #[rustfmt::skip]
3231        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3232                8, 40, 9, 41, 10, 42, 11, 43,
3233                12, 44, 13, 45, 14, 46, 15, 47,
3234                24, 56, 25, 57, 26, 58, 27, 59,
3235                28, 60, 29, 61, 30, 62, 31, 63,
3236        ]);
3237        transmute(r)
3238    }
3239}
3240
3241/// Unpacks and interleaves 8-bit integers from the low half of each
3242/// 128-bit lane of `a` and `b`.
3243///
3244/// ```rust
3245/// #[cfg(target_arch = "x86")]
3246/// use std::arch::x86::*;
3247/// #[cfg(target_arch = "x86_64")]
3248/// use std::arch::x86_64::*;
3249///
3250/// # fn main() {
3251/// #     if is_x86_feature_detected!("avx2") {
3252/// #         #[target_feature(enable = "avx2")]
3253/// #         unsafe fn worker() {
3254/// let a = _mm256_setr_epi8(
3255///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3256///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3257/// );
3258/// let b = _mm256_setr_epi8(
3259///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3260///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3261///     -30, -31,
3262/// );
3263///
3264/// let c = _mm256_unpacklo_epi8(a, b);
3265///
3266/// let expected = _mm256_setr_epi8(
3267///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3268///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3269/// );
3270/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3271///
3272/// #         }
3273/// #         unsafe { worker(); }
3274/// #     }
3275/// # }
3276/// ```
3277///
3278/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3279#[inline]
3280#[target_feature(enable = "avx2")]
3281#[cfg_attr(test, assert_instr(vpunpcklbw))]
3282#[stable(feature = "simd_x86", since = "1.27.0")]
3283pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3284    unsafe {
3285        #[rustfmt::skip]
3286        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3287            0, 32, 1, 33, 2, 34, 3, 35,
3288            4, 36, 5, 37, 6, 38, 7, 39,
3289            16, 48, 17, 49, 18, 50, 19, 51,
3290            20, 52, 21, 53, 22, 54, 23, 55,
3291        ]);
3292        transmute(r)
3293    }
3294}
3295
3296/// Unpacks and interleaves 16-bit integers from the high half of each
3297/// 128-bit lane of `a` and `b`.
3298///
3299/// ```rust
3300/// #[cfg(target_arch = "x86")]
3301/// use std::arch::x86::*;
3302/// #[cfg(target_arch = "x86_64")]
3303/// use std::arch::x86_64::*;
3304///
3305/// # fn main() {
3306/// #     if is_x86_feature_detected!("avx2") {
3307/// #         #[target_feature(enable = "avx2")]
3308/// #         unsafe fn worker() {
3309/// let a = _mm256_setr_epi16(
3310///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3311/// );
3312/// let b = _mm256_setr_epi16(
3313///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3314/// );
3315///
3316/// let c = _mm256_unpackhi_epi16(a, b);
3317///
3318/// let expected = _mm256_setr_epi16(
3319///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3320/// );
3321/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3322///
3323/// #         }
3324/// #         unsafe { worker(); }
3325/// #     }
3326/// # }
3327/// ```
3328///
3329/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3330#[inline]
3331#[target_feature(enable = "avx2")]
3332#[cfg_attr(test, assert_instr(vpunpckhwd))]
3333#[stable(feature = "simd_x86", since = "1.27.0")]
3334pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3335    unsafe {
3336        let r: i16x16 = simd_shuffle!(
3337            a.as_i16x16(),
3338            b.as_i16x16(),
3339            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3340        );
3341        transmute(r)
3342    }
3343}
3344
3345/// Unpacks and interleaves 16-bit integers from the low half of each
3346/// 128-bit lane of `a` and `b`.
3347///
3348/// ```rust
3349/// #[cfg(target_arch = "x86")]
3350/// use std::arch::x86::*;
3351/// #[cfg(target_arch = "x86_64")]
3352/// use std::arch::x86_64::*;
3353///
3354/// # fn main() {
3355/// #     if is_x86_feature_detected!("avx2") {
3356/// #         #[target_feature(enable = "avx2")]
3357/// #         unsafe fn worker() {
3358///
3359/// let a = _mm256_setr_epi16(
3360///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3361/// );
3362/// let b = _mm256_setr_epi16(
3363///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3364/// );
3365///
3366/// let c = _mm256_unpacklo_epi16(a, b);
3367///
3368/// let expected = _mm256_setr_epi16(
3369///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3370/// );
3371/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3372///
3373/// #         }
3374/// #         unsafe { worker(); }
3375/// #     }
3376/// # }
3377/// ```
3378///
3379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3380#[inline]
3381#[target_feature(enable = "avx2")]
3382#[cfg_attr(test, assert_instr(vpunpcklwd))]
3383#[stable(feature = "simd_x86", since = "1.27.0")]
3384pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3385    unsafe {
3386        let r: i16x16 = simd_shuffle!(
3387            a.as_i16x16(),
3388            b.as_i16x16(),
3389            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3390        );
3391        transmute(r)
3392    }
3393}
3394
3395/// Unpacks and interleaves 32-bit integers from the high half of each
3396/// 128-bit lane of `a` and `b`.
3397///
3398/// ```rust
3399/// #[cfg(target_arch = "x86")]
3400/// use std::arch::x86::*;
3401/// #[cfg(target_arch = "x86_64")]
3402/// use std::arch::x86_64::*;
3403///
3404/// # fn main() {
3405/// #     if is_x86_feature_detected!("avx2") {
3406/// #         #[target_feature(enable = "avx2")]
3407/// #         unsafe fn worker() {
3408/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3409/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3410///
3411/// let c = _mm256_unpackhi_epi32(a, b);
3412///
3413/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3414/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3415///
3416/// #         }
3417/// #         unsafe { worker(); }
3418/// #     }
3419/// # }
3420/// ```
3421///
3422/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3423#[inline]
3424#[target_feature(enable = "avx2")]
3425#[cfg_attr(test, assert_instr(vunpckhps))]
3426#[stable(feature = "simd_x86", since = "1.27.0")]
3427pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3428    unsafe {
3429        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3430        transmute(r)
3431    }
3432}
3433
3434/// Unpacks and interleaves 32-bit integers from the low half of each
3435/// 128-bit lane of `a` and `b`.
3436///
3437/// ```rust
3438/// #[cfg(target_arch = "x86")]
3439/// use std::arch::x86::*;
3440/// #[cfg(target_arch = "x86_64")]
3441/// use std::arch::x86_64::*;
3442///
3443/// # fn main() {
3444/// #     if is_x86_feature_detected!("avx2") {
3445/// #         #[target_feature(enable = "avx2")]
3446/// #         unsafe fn worker() {
3447/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3448/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3449///
3450/// let c = _mm256_unpacklo_epi32(a, b);
3451///
3452/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3453/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3454///
3455/// #         }
3456/// #         unsafe { worker(); }
3457/// #     }
3458/// # }
3459/// ```
3460///
3461/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3462#[inline]
3463#[target_feature(enable = "avx2")]
3464#[cfg_attr(test, assert_instr(vunpcklps))]
3465#[stable(feature = "simd_x86", since = "1.27.0")]
3466pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3467    unsafe {
3468        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3469        transmute(r)
3470    }
3471}
3472
3473/// Unpacks and interleaves 64-bit integers from the high half of each
3474/// 128-bit lane of `a` and `b`.
3475///
3476/// ```rust
3477/// #[cfg(target_arch = "x86")]
3478/// use std::arch::x86::*;
3479/// #[cfg(target_arch = "x86_64")]
3480/// use std::arch::x86_64::*;
3481///
3482/// # fn main() {
3483/// #     if is_x86_feature_detected!("avx2") {
3484/// #         #[target_feature(enable = "avx2")]
3485/// #         unsafe fn worker() {
3486/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3487/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3488///
3489/// let c = _mm256_unpackhi_epi64(a, b);
3490///
3491/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3492/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3493///
3494/// #         }
3495/// #         unsafe { worker(); }
3496/// #     }
3497/// # }
3498/// ```
3499///
3500/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3501#[inline]
3502#[target_feature(enable = "avx2")]
3503#[cfg_attr(test, assert_instr(vunpckhpd))]
3504#[stable(feature = "simd_x86", since = "1.27.0")]
3505pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3506    unsafe {
3507        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3508        transmute(r)
3509    }
3510}
3511
3512/// Unpacks and interleaves 64-bit integers from the low half of each
3513/// 128-bit lane of `a` and `b`.
3514///
3515/// ```rust
3516/// #[cfg(target_arch = "x86")]
3517/// use std::arch::x86::*;
3518/// #[cfg(target_arch = "x86_64")]
3519/// use std::arch::x86_64::*;
3520///
3521/// # fn main() {
3522/// #     if is_x86_feature_detected!("avx2") {
3523/// #         #[target_feature(enable = "avx2")]
3524/// #         unsafe fn worker() {
3525/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3526/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3527///
3528/// let c = _mm256_unpacklo_epi64(a, b);
3529///
3530/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3531/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3532///
3533/// #         }
3534/// #         unsafe { worker(); }
3535/// #     }
3536/// # }
3537/// ```
3538///
3539/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3540#[inline]
3541#[target_feature(enable = "avx2")]
3542#[cfg_attr(test, assert_instr(vunpcklpd))]
3543#[stable(feature = "simd_x86", since = "1.27.0")]
3544pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3545    unsafe {
3546        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3547        transmute(r)
3548    }
3549}
3550
3551/// Computes the bitwise XOR of 256 bits (representing integer data)
3552/// in `a` and `b`.
3553///
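/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
///
/// let r = _mm256_xor_si256(a, b);
///
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi8(0b0110))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///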
3554/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3555#[inline]
3556#[target_feature(enable = "avx2")]
3557#[cfg_attr(test, assert_instr(vxorps))]
3558#[stable(feature = "simd_x86", since = "1.27.0")]
3559pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3560    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3561}
3562
3563/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3564/// integer containing the zero-extended integer data.
3565///
3566/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3567///
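/// A short illustrative sketch (arbitrary values; assumes `avx2` is detected at
/// runtime, like the other examples in this module):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
///
/// // The selected byte is zero-extended, so -1 comes back as 255.
/// assert_eq!(_mm256_extract_epi8::<15>(a), 255);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///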
3568/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3569#[inline]
3570#[target_feature(enable = "avx2")]
3571// This intrinsic has no corresponding instruction.
3572#[rustc_legacy_const_generics(1)]
3573#[stable(feature = "simd_x86", since = "1.27.0")]
3574pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3575    static_assert_uimm_bits!(INDEX, 5);
3576    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3577}
3578
3579/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3580/// integer containing the zero-extended integer data.
3581///
3582/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3583///
3584/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3585#[inline]
3586#[target_feature(enable = "avx2")]
3587// This intrinsic has no corresponding instruction.
3588#[rustc_legacy_const_generics(1)]
3589#[stable(feature = "simd_x86", since = "1.27.0")]
3590pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3591    static_assert_uimm_bits!(INDEX, 4);
3592    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3593}
3594
3595#[allow(improper_ctypes)]
3596unsafe extern "C" {
3597    #[link_name = "llvm.x86.avx2.phadd.w"]
3598    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3599    #[link_name = "llvm.x86.avx2.phadd.d"]
3600    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3601    #[link_name = "llvm.x86.avx2.phadd.sw"]
3602    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3603    #[link_name = "llvm.x86.avx2.phsub.w"]
3604    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3605    #[link_name = "llvm.x86.avx2.phsub.d"]
3606    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3607    #[link_name = "llvm.x86.avx2.phsub.sw"]
3608    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3609    #[link_name = "llvm.x86.avx2.pmadd.wd"]
3610    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3611    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3612    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3613    #[link_name = "llvm.x86.avx2.maskload.d"]
3614    fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3615    #[link_name = "llvm.x86.avx2.maskload.d.256"]
3616    fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3617    #[link_name = "llvm.x86.avx2.maskload.q"]
3618    fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3619    #[link_name = "llvm.x86.avx2.maskload.q.256"]
3620    fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3621    #[link_name = "llvm.x86.avx2.maskstore.d"]
3622    fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3623    #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3624    fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3625    #[link_name = "llvm.x86.avx2.maskstore.q"]
3626    fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3627    #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3628    fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3629    #[link_name = "llvm.x86.avx2.mpsadbw"]
3630    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
3631    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3632    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3633    #[link_name = "llvm.x86.avx2.packsswb"]
3634    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3635    #[link_name = "llvm.x86.avx2.packssdw"]
3636    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3637    #[link_name = "llvm.x86.avx2.packuswb"]
3638    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3639    #[link_name = "llvm.x86.avx2.packusdw"]
3640    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3641    #[link_name = "llvm.x86.avx2.psad.bw"]
3642    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3643    #[link_name = "llvm.x86.avx2.psign.b"]
3644    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3645    #[link_name = "llvm.x86.avx2.psign.w"]
3646    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3647    #[link_name = "llvm.x86.avx2.psign.d"]
3648    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3649    #[link_name = "llvm.x86.avx2.psll.w"]
3650    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3651    #[link_name = "llvm.x86.avx2.psll.d"]
3652    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3653    #[link_name = "llvm.x86.avx2.psll.q"]
3654    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3655    #[link_name = "llvm.x86.avx2.psllv.d"]
3656    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3657    #[link_name = "llvm.x86.avx2.psllv.d.256"]
3658    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3659    #[link_name = "llvm.x86.avx2.psllv.q"]
3660    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3661    #[link_name = "llvm.x86.avx2.psllv.q.256"]
3662    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3663    #[link_name = "llvm.x86.avx2.psra.w"]
3664    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3665    #[link_name = "llvm.x86.avx2.psra.d"]
3666    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3667    #[link_name = "llvm.x86.avx2.psrav.d"]
3668    fn psravd(a: i32x4, count: i32x4) -> i32x4;
3669    #[link_name = "llvm.x86.avx2.psrav.d.256"]
3670    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3671    #[link_name = "llvm.x86.avx2.psrl.w"]
3672    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3673    #[link_name = "llvm.x86.avx2.psrl.d"]
3674    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3675    #[link_name = "llvm.x86.avx2.psrl.q"]
3676    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3677    #[link_name = "llvm.x86.avx2.psrlv.d"]
3678    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3679    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3680    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3681    #[link_name = "llvm.x86.avx2.psrlv.q"]
3682    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3683    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3684    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3685    #[link_name = "llvm.x86.avx2.pshuf.b"]
3686    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3687    #[link_name = "llvm.x86.avx2.permd"]
3688    fn permd(a: u32x8, b: u32x8) -> u32x8;
3689    #[link_name = "llvm.x86.avx2.permps"]
3690    fn permps(a: __m256, b: i32x8) -> __m256;
3691    #[link_name = "llvm.x86.avx2.vperm2i128"]
3692    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3693    #[link_name = "llvm.x86.avx2.gather.d.d"]
3694    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3695    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3696    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3697    #[link_name = "llvm.x86.avx2.gather.d.q"]
3698    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3699    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3700    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3701    #[link_name = "llvm.x86.avx2.gather.q.d"]
3702    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3703    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3704    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3705    #[link_name = "llvm.x86.avx2.gather.q.q"]
3706    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3707    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3708    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3709    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3710    fn pgatherdpd(
3711        src: __m128d,
3712        slice: *const i8,
3713        offsets: i32x4,
3714        mask: __m128d,
3715        scale: i8,
3716    ) -> __m128d;
3717    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3718    fn vpgatherdpd(
3719        src: __m256d,
3720        slice: *const i8,
3721        offsets: i32x4,
3722        mask: __m256d,
3723        scale: i8,
3724    ) -> __m256d;
3725    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3726    fn pgatherqpd(
3727        src: __m128d,
3728        slice: *const i8,
3729        offsets: i64x2,
3730        mask: __m128d,
3731        scale: i8,
3732    ) -> __m128d;
3733    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3734    fn vpgatherqpd(
3735        src: __m256d,
3736        slice: *const i8,
3737        offsets: i64x4,
3738        mask: __m256d,
3739        scale: i8,
3740    ) -> __m256d;
3741    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3742    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3743    -> __m128;
3744    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3745    fn vpgatherdps(
3746        src: __m256,
3747        slice: *const i8,
3748        offsets: i32x8,
3749        mask: __m256,
3750        scale: i8,
3751    ) -> __m256;
3752    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3753    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3754    -> __m128;
3755    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3756    fn vpgatherqps(
3757        src: __m128,
3758        slice: *const i8,
3759        offsets: i64x4,
3760        mask: __m128,
3761        scale: i8,
3762    ) -> __m128;
3763}
3764
3765#[cfg(test)]
3766mod tests {
3767
3768    use stdarch_test::simd_test;
3769
3770    use crate::core_arch::x86::*;
3771
3772    #[simd_test(enable = "avx2")]
3773    unsafe fn test_mm256_abs_epi32() {
3774        #[rustfmt::skip]
3775        let a = _mm256_setr_epi32(
3776            0, 1, -1, i32::MAX,
3777            i32::MIN, 100, -100, -32,
3778        );
3779        let r = _mm256_abs_epi32(a);
3780        #[rustfmt::skip]
3781        let e = _mm256_setr_epi32(
3782            0, 1, 1, i32::MAX,
3783            i32::MAX.wrapping_add(1), 100, 100, 32,
3784        );
3785        assert_eq_m256i(r, e);
3786    }
3787
3788    #[simd_test(enable = "avx2")]
3789    unsafe fn test_mm256_abs_epi16() {
3790        #[rustfmt::skip]
3791        let a = _mm256_setr_epi16(
3792            0,  1, -1, 2, -2, 3, -3, 4,
3793            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3794        );
3795        let r = _mm256_abs_epi16(a);
3796        #[rustfmt::skip]
3797        let e = _mm256_setr_epi16(
3798            0, 1, 1, 2, 2, 3, 3, 4,
3799            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3800        );
3801        assert_eq_m256i(r, e);
3802    }
3803
3804    #[simd_test(enable = "avx2")]
3805    unsafe fn test_mm256_abs_epi8() {
3806        #[rustfmt::skip]
3807        let a = _mm256_setr_epi8(
3808            0, 1, -1, 2, -2, 3, -3, 4,
3809            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3810            0, 1, -1, 2, -2, 3, -3, 4,
3811            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3812        );
3813        let r = _mm256_abs_epi8(a);
3814        #[rustfmt::skip]
3815        let e = _mm256_setr_epi8(
3816            0, 1, 1, 2, 2, 3, 3, 4,
3817            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3818            0, 1, 1, 2, 2, 3, 3, 4,
3819            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3820        );
3821        assert_eq_m256i(r, e);
3822    }
3823
3824    #[simd_test(enable = "avx2")]
3825    unsafe fn test_mm256_add_epi64() {
3826        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
3827        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
3828        let r = _mm256_add_epi64(a, b);
3829        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
3830        assert_eq_m256i(r, e);
3831    }
3832
3833    #[simd_test(enable = "avx2")]
3834    unsafe fn test_mm256_add_epi32() {
3835        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
3836        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3837        let r = _mm256_add_epi32(a, b);
3838        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
3839        assert_eq_m256i(r, e);
3840    }
3841
3842    #[simd_test(enable = "avx2")]
3843    unsafe fn test_mm256_add_epi16() {
3844        #[rustfmt::skip]
3845        let a = _mm256_setr_epi16(
3846            0, 1, 2, 3, 4, 5, 6, 7,
3847            8, 9, 10, 11, 12, 13, 14, 15,
3848        );
3849        #[rustfmt::skip]
3850        let b = _mm256_setr_epi16(
3851            0, 1, 2, 3, 4, 5, 6, 7,
3852            8, 9, 10, 11, 12, 13, 14, 15,
3853        );
3854        let r = _mm256_add_epi16(a, b);
3855        #[rustfmt::skip]
3856        let e = _mm256_setr_epi16(
3857            0, 2, 4, 6, 8, 10, 12, 14,
3858            16, 18, 20, 22, 24, 26, 28, 30,
3859        );
3860        assert_eq_m256i(r, e);
3861    }
3862
3863    #[simd_test(enable = "avx2")]
3864    unsafe fn test_mm256_add_epi8() {
3865        #[rustfmt::skip]
3866        let a = _mm256_setr_epi8(
3867            0, 1, 2, 3, 4, 5, 6, 7,
3868            8, 9, 10, 11, 12, 13, 14, 15,
3869            16, 17, 18, 19, 20, 21, 22, 23,
3870            24, 25, 26, 27, 28, 29, 30, 31,
3871        );
3872        #[rustfmt::skip]
3873        let b = _mm256_setr_epi8(
3874            0, 1, 2, 3, 4, 5, 6, 7,
3875            8, 9, 10, 11, 12, 13, 14, 15,
3876            16, 17, 18, 19, 20, 21, 22, 23,
3877            24, 25, 26, 27, 28, 29, 30, 31,
3878        );
3879        let r = _mm256_add_epi8(a, b);
3880        #[rustfmt::skip]
3881        let e = _mm256_setr_epi8(
3882            0, 2, 4, 6, 8, 10, 12, 14,
3883            16, 18, 20, 22, 24, 26, 28, 30,
3884            32, 34, 36, 38, 40, 42, 44, 46,
3885            48, 50, 52, 54, 56, 58, 60, 62,
3886        );
3887        assert_eq_m256i(r, e);
3888    }
3889
3890    #[simd_test(enable = "avx2")]
3891    unsafe fn test_mm256_adds_epi8() {
3892        #[rustfmt::skip]
3893        let a = _mm256_setr_epi8(
3894            0, 1, 2, 3, 4, 5, 6, 7,
3895            8, 9, 10, 11, 12, 13, 14, 15,
3896            16, 17, 18, 19, 20, 21, 22, 23,
3897            24, 25, 26, 27, 28, 29, 30, 31,
3898        );
3899        #[rustfmt::skip]
3900        let b = _mm256_setr_epi8(
3901            32, 33, 34, 35, 36, 37, 38, 39,
3902            40, 41, 42, 43, 44, 45, 46, 47,
3903            48, 49, 50, 51, 52, 53, 54, 55,
3904            56, 57, 58, 59, 60, 61, 62, 63,
3905        );
3906        let r = _mm256_adds_epi8(a, b);
3907        #[rustfmt::skip]
3908        let e = _mm256_setr_epi8(
3909            32, 34, 36, 38, 40, 42, 44, 46,
3910            48, 50, 52, 54, 56, 58, 60, 62,
3911            64, 66, 68, 70, 72, 74, 76, 78,
3912            80, 82, 84, 86, 88, 90, 92, 94,
3913        );
3914        assert_eq_m256i(r, e);
3915    }
3916
3917    #[simd_test(enable = "avx2")]
3918    unsafe fn test_mm256_adds_epi8_saturate_positive() {
3919        let a = _mm256_set1_epi8(0x7F);
3920        let b = _mm256_set1_epi8(1);
3921        let r = _mm256_adds_epi8(a, b);
3922        assert_eq_m256i(r, a);
3923    }
3924
3925    #[simd_test(enable = "avx2")]
3926    unsafe fn test_mm256_adds_epi8_saturate_negative() {
3927        let a = _mm256_set1_epi8(-0x80);
3928        let b = _mm256_set1_epi8(-1);
3929        let r = _mm256_adds_epi8(a, b);
3930        assert_eq_m256i(r, a);
3931    }
3932
3933    #[simd_test(enable = "avx2")]
3934    unsafe fn test_mm256_adds_epi16() {
3935        #[rustfmt::skip]
3936        let a = _mm256_setr_epi16(
3937            0, 1, 2, 3, 4, 5, 6, 7,
3938            8, 9, 10, 11, 12, 13, 14, 15,
3939        );
3940        #[rustfmt::skip]
3941        let b = _mm256_setr_epi16(
3942            32, 33, 34, 35, 36, 37, 38, 39,
3943            40, 41, 42, 43, 44, 45, 46, 47,
3944        );
3945        let r = _mm256_adds_epi16(a, b);
3946        #[rustfmt::skip]
3947        let e = _mm256_setr_epi16(
3948            32, 34, 36, 38, 40, 42, 44, 46,
3949            48, 50, 52, 54, 56, 58, 60, 62,
3950        );
3951
3952        assert_eq_m256i(r, e);
3953    }
3954
3955    #[simd_test(enable = "avx2")]
3956    unsafe fn test_mm256_adds_epi16_saturate_positive() {
3957        let a = _mm256_set1_epi16(0x7FFF);
3958        let b = _mm256_set1_epi16(1);
3959        let r = _mm256_adds_epi16(a, b);
3960        assert_eq_m256i(r, a);
3961    }
3962
3963    #[simd_test(enable = "avx2")]
3964    unsafe fn test_mm256_adds_epi16_saturate_negative() {
3965        let a = _mm256_set1_epi16(-0x8000);
3966        let b = _mm256_set1_epi16(-1);
3967        let r = _mm256_adds_epi16(a, b);
3968        assert_eq_m256i(r, a);
3969    }
3970
3971    #[simd_test(enable = "avx2")]
3972    unsafe fn test_mm256_adds_epu8() {
3973        #[rustfmt::skip]
3974        let a = _mm256_setr_epi8(
3975            0, 1, 2, 3, 4, 5, 6, 7,
3976            8, 9, 10, 11, 12, 13, 14, 15,
3977            16, 17, 18, 19, 20, 21, 22, 23,
3978            24, 25, 26, 27, 28, 29, 30, 31,
3979        );
3980        #[rustfmt::skip]
3981        let b = _mm256_setr_epi8(
3982            32, 33, 34, 35, 36, 37, 38, 39,
3983            40, 41, 42, 43, 44, 45, 46, 47,
3984            48, 49, 50, 51, 52, 53, 54, 55,
3985            56, 57, 58, 59, 60, 61, 62, 63,
3986        );
3987        let r = _mm256_adds_epu8(a, b);
3988        #[rustfmt::skip]
3989        let e = _mm256_setr_epi8(
3990            32, 34, 36, 38, 40, 42, 44, 46,
3991            48, 50, 52, 54, 56, 58, 60, 62,
3992            64, 66, 68, 70, 72, 74, 76, 78,
3993            80, 82, 84, 86, 88, 90, 92, 94,
3994        );
3995        assert_eq_m256i(r, e);
3996    }
3997
3998    #[simd_test(enable = "avx2")]
3999    unsafe fn test_mm256_adds_epu8_saturate() {
4000        let a = _mm256_set1_epi8(!0);
4001        let b = _mm256_set1_epi8(1);
4002        let r = _mm256_adds_epu8(a, b);
4003        assert_eq_m256i(r, a);
4004    }
4005
4006    #[simd_test(enable = "avx2")]
4007    unsafe fn test_mm256_adds_epu16() {
4008        #[rustfmt::skip]
4009        let a = _mm256_setr_epi16(
4010            0, 1, 2, 3, 4, 5, 6, 7,
4011            8, 9, 10, 11, 12, 13, 14, 15,
4012        );
4013        #[rustfmt::skip]
4014        let b = _mm256_setr_epi16(
4015            32, 33, 34, 35, 36, 37, 38, 39,
4016            40, 41, 42, 43, 44, 45, 46, 47,
4017        );
4018        let r = _mm256_adds_epu16(a, b);
4019        #[rustfmt::skip]
4020        let e = _mm256_setr_epi16(
4021            32, 34, 36, 38, 40, 42, 44, 46,
4022            48, 50, 52, 54, 56, 58, 60, 62,
4023        );
4024
4025        assert_eq_m256i(r, e);
4026    }
4027
4028    #[simd_test(enable = "avx2")]
4029    unsafe fn test_mm256_adds_epu16_saturate() {
4030        let a = _mm256_set1_epi16(!0);
4031        let b = _mm256_set1_epi16(1);
4032        let r = _mm256_adds_epu16(a, b);
4033        assert_eq_m256i(r, a);
4034    }
4035
4036    #[simd_test(enable = "avx2")]
4037    unsafe fn test_mm256_and_si256() {
4038        let a = _mm256_set1_epi8(5);
4039        let b = _mm256_set1_epi8(3);
4040        let got = _mm256_and_si256(a, b);
4041        assert_eq_m256i(got, _mm256_set1_epi8(1));
4042    }
4043
4044    #[simd_test(enable = "avx2")]
4045    unsafe fn test_mm256_andnot_si256() {
4046        let a = _mm256_set1_epi8(5);
4047        let b = _mm256_set1_epi8(3);
4048        let got = _mm256_andnot_si256(a, b);
4049        assert_eq_m256i(got, _mm256_set1_epi8(2));
4050    }
4051
4052    #[simd_test(enable = "avx2")]
4053    unsafe fn test_mm256_avg_epu8() {
4054        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4055        let r = _mm256_avg_epu8(a, b);
4056        assert_eq_m256i(r, _mm256_set1_epi8(6));
4057    }
4058
4059    #[simd_test(enable = "avx2")]
4060    unsafe fn test_mm256_avg_epu16() {
4061        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4062        let r = _mm256_avg_epu16(a, b);
4063        assert_eq_m256i(r, _mm256_set1_epi16(6));
4064    }
4065
4066    #[simd_test(enable = "avx2")]
4067    unsafe fn test_mm_blend_epi32() {
4068        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4069        let e = _mm_setr_epi32(9, 3, 3, 3);
4070        let r = _mm_blend_epi32::<0x01>(a, b);
4071        assert_eq_m128i(r, e);
4072
4073        let r = _mm_blend_epi32::<0x0E>(b, a);
4074        assert_eq_m128i(r, e);
4075    }
4076
4077    #[simd_test(enable = "avx2")]
4078    unsafe fn test_mm256_blend_epi32() {
4079        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4080        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4081        let r = _mm256_blend_epi32::<0x01>(a, b);
4082        assert_eq_m256i(r, e);
4083
4084        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4085        let r = _mm256_blend_epi32::<0x82>(a, b);
4086        assert_eq_m256i(r, e);
4087
4088        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4089        let r = _mm256_blend_epi32::<0x7C>(a, b);
4090        assert_eq_m256i(r, e);
4091    }
4092
4093    #[simd_test(enable = "avx2")]
4094    unsafe fn test_mm256_blend_epi16() {
4095        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4096        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4097        let r = _mm256_blend_epi16::<0x01>(a, b);
4098        assert_eq_m256i(r, e);
4099
4100        let r = _mm256_blend_epi16::<0xFE>(b, a);
4101        assert_eq_m256i(r, e);
4102    }
4103
4104    #[simd_test(enable = "avx2")]
4105    unsafe fn test_mm256_blendv_epi8() {
4106        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4107        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
4108        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4109        let r = _mm256_blendv_epi8(a, b, mask);
4110        assert_eq_m256i(r, e);
4111    }
4112
4113    #[simd_test(enable = "avx2")]
4114    unsafe fn test_mm_broadcastb_epi8() {
4115        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4116        let res = _mm_broadcastb_epi8(a);
4117        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4118    }
4119
4120    #[simd_test(enable = "avx2")]
4121    unsafe fn test_mm256_broadcastb_epi8() {
4122        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4123        let res = _mm256_broadcastb_epi8(a);
4124        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4125    }
4126
4127    #[simd_test(enable = "avx2")]
4128    unsafe fn test_mm_broadcastd_epi32() {
4129        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4130        let res = _mm_broadcastd_epi32(a);
4131        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4132    }
4133
4134    #[simd_test(enable = "avx2")]
4135    unsafe fn test_mm256_broadcastd_epi32() {
4136        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4137        let res = _mm256_broadcastd_epi32(a);
4138        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4139    }
4140
4141    #[simd_test(enable = "avx2")]
4142    unsafe fn test_mm_broadcastq_epi64() {
4143        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4144        let res = _mm_broadcastq_epi64(a);
4145        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4146    }
4147
4148    #[simd_test(enable = "avx2")]
4149    unsafe fn test_mm256_broadcastq_epi64() {
4150        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4151        let res = _mm256_broadcastq_epi64(a);
4152        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4153    }
4154
4155    #[simd_test(enable = "avx2")]
4156    unsafe fn test_mm_broadcastsd_pd() {
4157        let a = _mm_setr_pd(6.88, 3.44);
4158        let res = _mm_broadcastsd_pd(a);
4159        assert_eq_m128d(res, _mm_set1_pd(6.88));
4160    }
4161
4162    #[simd_test(enable = "avx2")]
4163    unsafe fn test_mm256_broadcastsd_pd() {
4164        let a = _mm_setr_pd(6.88, 3.44);
4165        let res = _mm256_broadcastsd_pd(a);
4166        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
4167    }
4168
4169    #[simd_test(enable = "avx2")]
4170    unsafe fn test_mm_broadcastsi128_si256() {
4171        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4172        let res = _mm_broadcastsi128_si256(a);
4173        let retval = _mm256_setr_epi64x(
4174            0x0987654321012334,
4175            0x5678909876543210,
4176            0x0987654321012334,
4177            0x5678909876543210,
4178        );
4179        assert_eq_m256i(res, retval);
4180    }
4181
4182    #[simd_test(enable = "avx2")]
4183    unsafe fn test_mm256_broadcastsi128_si256() {
4184        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4185        let res = _mm256_broadcastsi128_si256(a);
4186        let retval = _mm256_setr_epi64x(
4187            0x0987654321012334,
4188            0x5678909876543210,
4189            0x0987654321012334,
4190            0x5678909876543210,
4191        );
4192        assert_eq_m256i(res, retval);
4193    }
4194
4195    #[simd_test(enable = "avx2")]
4196    unsafe fn test_mm_broadcastss_ps() {
4197        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4198        let res = _mm_broadcastss_ps(a);
4199        assert_eq_m128(res, _mm_set1_ps(6.88));
4200    }
4201
4202    #[simd_test(enable = "avx2")]
4203    unsafe fn test_mm256_broadcastss_ps() {
4204        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4205        let res = _mm256_broadcastss_ps(a);
4206        assert_eq_m256(res, _mm256_set1_ps(6.88));
4207    }
4208
4209    #[simd_test(enable = "avx2")]
4210    unsafe fn test_mm_broadcastw_epi16() {
4211        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4212        let res = _mm_broadcastw_epi16(a);
4213        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4214    }
4215
4216    #[simd_test(enable = "avx2")]
4217    unsafe fn test_mm256_broadcastw_epi16() {
4218        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4219        let res = _mm256_broadcastw_epi16(a);
4220        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4221    }
4222
4223    #[simd_test(enable = "avx2")]
4224    unsafe fn test_mm256_cmpeq_epi8() {
4225        #[rustfmt::skip]
4226        let a = _mm256_setr_epi8(
4227            0, 1, 2, 3, 4, 5, 6, 7,
4228            8, 9, 10, 11, 12, 13, 14, 15,
4229            16, 17, 18, 19, 20, 21, 22, 23,
4230            24, 25, 26, 27, 28, 29, 30, 31,
4231        );
4232        #[rustfmt::skip]
4233        let b = _mm256_setr_epi8(
4234            31, 30, 2, 28, 27, 26, 25, 24,
4235            23, 22, 21, 20, 19, 18, 17, 16,
4236            15, 14, 13, 12, 11, 10, 9, 8,
4237            7, 6, 5, 4, 3, 2, 1, 0,
4238        );
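        // Equal lanes become all ones (!0); only index 2 matches here.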
4239        let r = _mm256_cmpeq_epi8(a, b);
4240        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4241    }
4242
4243    #[simd_test(enable = "avx2")]
4244    unsafe fn test_mm256_cmpeq_epi16() {
4245        #[rustfmt::skip]
4246        let a = _mm256_setr_epi16(
4247            0, 1, 2, 3, 4, 5, 6, 7,
4248            8, 9, 10, 11, 12, 13, 14, 15,
4249        );
4250        #[rustfmt::skip]
4251        let b = _mm256_setr_epi16(
4252            15, 14, 2, 12, 11, 10, 9, 8,
4253            7, 6, 5, 4, 3, 2, 1, 0,
4254        );
4255        let r = _mm256_cmpeq_epi16(a, b);
4256        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4257    }
4258
4259    #[simd_test(enable = "avx2")]
4260    unsafe fn test_mm256_cmpeq_epi32() {
4261        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4262        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4263        let r = _mm256_cmpeq_epi32(a, b);
4264        let e = _mm256_set1_epi32(0);
4265        let e = _mm256_insert_epi32::<2>(e, !0);
4266        assert_eq_m256i(r, e);
4267    }
4268
4269    #[simd_test(enable = "avx2")]
4270    unsafe fn test_mm256_cmpeq_epi64() {
4271        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4272        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4273        let r = _mm256_cmpeq_epi64(a, b);
4274        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4275    }
4276
4277    #[simd_test(enable = "avx2")]
4278    unsafe fn test_mm256_cmpgt_epi8() {
4279        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4280        let b = _mm256_set1_epi8(0);
4281        let r = _mm256_cmpgt_epi8(a, b);
4282        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4283    }
4284
4285    #[simd_test(enable = "avx2")]
4286    unsafe fn test_mm256_cmpgt_epi16() {
4287        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4288        let b = _mm256_set1_epi16(0);
4289        let r = _mm256_cmpgt_epi16(a, b);
4290        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4291    }
4292
4293    #[simd_test(enable = "avx2")]
4294    unsafe fn test_mm256_cmpgt_epi32() {
4295        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4296        let b = _mm256_set1_epi32(0);
4297        let r = _mm256_cmpgt_epi32(a, b);
4298        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4299    }
4300
4301    #[simd_test(enable = "avx2")]
4302    unsafe fn test_mm256_cmpgt_epi64() {
4303        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4304        let b = _mm256_set1_epi64x(0);
4305        let r = _mm256_cmpgt_epi64(a, b);
4306        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4307    }
4308
4309    #[simd_test(enable = "avx2")]
4310    unsafe fn test_mm256_cvtepi8_epi16() {
4311        #[rustfmt::skip]
4312        let a = _mm_setr_epi8(
4313            0, 0, -1, 1, -2, 2, -3, 3,
4314            -4, 4, -5, 5, -6, 6, -7, 7,
4315        );
4316        #[rustfmt::skip]
4317        let r = _mm256_setr_epi16(
4318            0, 0, -1, 1, -2, 2, -3, 3,
4319            -4, 4, -5, 5, -6, 6, -7, 7,
4320        );
4321        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4322    }
4323
4324    #[simd_test(enable = "avx2")]
4325    unsafe fn test_mm256_cvtepi8_epi32() {
4326        #[rustfmt::skip]
4327        let a = _mm_setr_epi8(
4328            0, 0, -1, 1, -2, 2, -3, 3,
4329            -4, 4, -5, 5, -6, 6, -7, 7,
4330        );
4331        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4332        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4333    }
4334
4335    #[simd_test(enable = "avx2")]
4336    unsafe fn test_mm256_cvtepi8_epi64() {
4337        #[rustfmt::skip]
4338        let a = _mm_setr_epi8(
4339            0, 0, -1, 1, -2, 2, -3, 3,
4340            -4, 4, -5, 5, -6, 6, -7, 7,
4341        );
4342        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4343        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4344    }
4345
4346    #[simd_test(enable = "avx2")]
4347    unsafe fn test_mm256_cvtepi16_epi32() {
4348        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4349        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4350        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4351    }
4352
4353    #[simd_test(enable = "avx2")]
4354    unsafe fn test_mm256_cvtepi16_epi64() {
4355        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4356        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4357        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4358    }
4359
4360    #[simd_test(enable = "avx2")]
4361    unsafe fn test_mm256_cvtepi32_epi64() {
4362        let a = _mm_setr_epi32(0, 0, -1, 1);
4363        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4364        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4365    }
4366
4367    #[simd_test(enable = "avx2")]
4368    unsafe fn test_mm256_cvtepu16_epi32() {
4369        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4370        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4371        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4372    }
4373
4374    #[simd_test(enable = "avx2")]
4375    unsafe fn test_mm256_cvtepu16_epi64() {
4376        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4377        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4378        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4379    }
4380
4381    #[simd_test(enable = "avx2")]
4382    unsafe fn test_mm256_cvtepu32_epi64() {
4383        let a = _mm_setr_epi32(0, 1, 2, 3);
4384        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4385        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4386    }
4387
4388    #[simd_test(enable = "avx2")]
4389    unsafe fn test_mm256_cvtepu8_epi16() {
4390        #[rustfmt::skip]
4391        let a = _mm_setr_epi8(
4392            0, 1, 2, 3, 4, 5, 6, 7,
4393            8, 9, 10, 11, 12, 13, 14, 15,
4394        );
4395        #[rustfmt::skip]
4396        let r = _mm256_setr_epi16(
4397            0, 1, 2, 3, 4, 5, 6, 7,
4398            8, 9, 10, 11, 12, 13, 14, 15,
4399        );
4400        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4401    }
4402
4403    #[simd_test(enable = "avx2")]
4404    unsafe fn test_mm256_cvtepu8_epi32() {
4405        #[rustfmt::skip]
4406        let a = _mm_setr_epi8(
4407            0, 1, 2, 3, 4, 5, 6, 7,
4408            8, 9, 10, 11, 12, 13, 14, 15,
4409        );
4410        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4411        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4412    }
4413
4414    #[simd_test(enable = "avx2")]
4415    unsafe fn test_mm256_cvtepu8_epi64() {
4416        #[rustfmt::skip]
4417        let a = _mm_setr_epi8(
4418            0, 1, 2, 3, 4, 5, 6, 7,
4419            8, 9, 10, 11, 12, 13, 14, 15,
4420        );
4421        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4422        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4423    }
4424
4425    #[simd_test(enable = "avx2")]
4426    unsafe fn test_mm256_extracti128_si256() {
4427        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4428        let r = _mm256_extracti128_si256::<1>(a);
4429        let e = _mm_setr_epi64x(3, 4);
4430        assert_eq_m128i(r, e);
4431    }
4432
4433    #[simd_test(enable = "avx2")]
4434    unsafe fn test_mm256_hadd_epi16() {
4435        let a = _mm256_set1_epi16(2);
4436        let b = _mm256_set1_epi16(4);
4437        let r = _mm256_hadd_epi16(a, b);
4438        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4439        assert_eq_m256i(r, e);
4440    }
4441
4442    #[simd_test(enable = "avx2")]
4443    unsafe fn test_mm256_hadd_epi32() {
4444        let a = _mm256_set1_epi32(2);
4445        let b = _mm256_set1_epi32(4);
4446        let r = _mm256_hadd_epi32(a, b);
4447        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4448        assert_eq_m256i(r, e);
4449    }
4450
4451    #[simd_test(enable = "avx2")]
4452    unsafe fn test_mm256_hadds_epi16() {
4453        let a = _mm256_set1_epi16(2);
4454        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4455        let a = _mm256_insert_epi16::<1>(a, 1);
4456        let b = _mm256_set1_epi16(4);
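        // The first horizontal pair is 0x7FFF + 1, which saturates to 0x7FFF rather than wrapping.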
4457        let r = _mm256_hadds_epi16(a, b);
4458        #[rustfmt::skip]
4459        let e = _mm256_setr_epi16(
4460            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4461            4, 4, 4, 4, 8, 8, 8, 8,
4462        );
4463        assert_eq_m256i(r, e);
4464    }
4465
4466    #[simd_test(enable = "avx2")]
4467    unsafe fn test_mm256_hsub_epi16() {
4468        let a = _mm256_set1_epi16(2);
4469        let b = _mm256_set1_epi16(4);
4470        let r = _mm256_hsub_epi16(a, b);
4471        let e = _mm256_set1_epi16(0);
4472        assert_eq_m256i(r, e);
4473    }
4474
4475    #[simd_test(enable = "avx2")]
4476    unsafe fn test_mm256_hsub_epi32() {
4477        let a = _mm256_set1_epi32(2);
4478        let b = _mm256_set1_epi32(4);
4479        let r = _mm256_hsub_epi32(a, b);
4480        let e = _mm256_set1_epi32(0);
4481        assert_eq_m256i(r, e);
4482    }
4483
4484    #[simd_test(enable = "avx2")]
4485    unsafe fn test_mm256_hsubs_epi16() {
4486        let a = _mm256_set1_epi16(2);
4487        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4488        let a = _mm256_insert_epi16::<1>(a, -1);
4489        let b = _mm256_set1_epi16(4);
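        // 0x7FFF - (-1) overflows i16 and saturates to 0x7FFF; every other pair subtracts to 0.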
4490        let r = _mm256_hsubs_epi16(a, b);
4491        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4492        assert_eq_m256i(r, e);
4493    }
4494
4495    #[simd_test(enable = "avx2")]
4496    unsafe fn test_mm256_madd_epi16() {
4497        let a = _mm256_set1_epi16(2);
4498        let b = _mm256_set1_epi16(4);
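        // `vpmaddwd` multiplies adjacent 16-bit pairs and sums them into 32-bit results: 2 * 4 + 2 * 4 = 16.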
4499        let r = _mm256_madd_epi16(a, b);
4500        let e = _mm256_set1_epi32(16);
4501        assert_eq_m256i(r, e);
4502    }
4503
4504    #[simd_test(enable = "avx2")]
4505    unsafe fn test_mm256_inserti128_si256() {
4506        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4507        let b = _mm_setr_epi64x(7, 8);
4508        let r = _mm256_inserti128_si256::<1>(a, b);
4509        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4510        assert_eq_m256i(r, e);
4511    }
4512
4513    #[simd_test(enable = "avx2")]
4514    unsafe fn test_mm256_maddubs_epi16() {
4515        let a = _mm256_set1_epi8(2);
4516        let b = _mm256_set1_epi8(4);
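        // `vpmaddubsw` treats the bytes of `a` as unsigned and the bytes of `b` as signed, adding adjacent products: 2 * 4 + 2 * 4 = 16.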
4517        let r = _mm256_maddubs_epi16(a, b);
4518        let e = _mm256_set1_epi16(16);
4519        assert_eq_m256i(r, e);
4520    }
4521
4522    #[simd_test(enable = "avx2")]
4523    unsafe fn test_mm_maskload_epi32() {
4524        let nums = [1, 2, 3, 4];
4525        let a = &nums as *const i32;
4526        let mask = _mm_setr_epi32(-1, 0, 0, -1);
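        // Elements whose mask has the sign bit set are loaded; the rest are zeroed.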
4527        let r = _mm_maskload_epi32(a, mask);
4528        let e = _mm_setr_epi32(1, 0, 0, 4);
4529        assert_eq_m128i(r, e);
4530    }
4531
4532    #[simd_test(enable = "avx2")]
4533    unsafe fn test_mm256_maskload_epi32() {
4534        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4535        let a = &nums as *const i32;
4536        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4537        let r = _mm256_maskload_epi32(a, mask);
4538        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4539        assert_eq_m256i(r, e);
4540    }
4541
4542    #[simd_test(enable = "avx2")]
4543    unsafe fn test_mm_maskload_epi64() {
4544        let nums = [1_i64, 2_i64];
4545        let a = &nums as *const i64;
4546        let mask = _mm_setr_epi64x(0, -1);
4547        let r = _mm_maskload_epi64(a, mask);
4548        let e = _mm_setr_epi64x(0, 2);
4549        assert_eq_m128i(r, e);
4550    }
4551
4552    #[simd_test(enable = "avx2")]
4553    unsafe fn test_mm256_maskload_epi64() {
4554        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4555        let a = &nums as *const i64;
4556        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4557        let r = _mm256_maskload_epi64(a, mask);
4558        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4559        assert_eq_m256i(r, e);
4560    }
4561
4562    #[simd_test(enable = "avx2")]
4563    unsafe fn test_mm_maskstore_epi32() {
4564        let a = _mm_setr_epi32(1, 2, 3, 4);
4565        let mut arr = [-1, -1, -1, -1];
4566        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4567        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4568        let e = [1, -1, -1, 4];
4569        assert_eq!(arr, e);
4570    }
4571
4572    #[simd_test(enable = "avx2")]
4573    unsafe fn test_mm256_maskstore_epi32() {
4574        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4575        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4576        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4577        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4578        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4579        assert_eq!(arr, e);
4580    }
4581
4582    #[simd_test(enable = "avx2")]
4583    unsafe fn test_mm_maskstore_epi64() {
4584        let a = _mm_setr_epi64x(1_i64, 2_i64);
4585        let mut arr = [-1_i64, -1_i64];
4586        let mask = _mm_setr_epi64x(0, -1);
4587        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4588        let e = [-1, 2];
4589        assert_eq!(arr, e);
4590    }
4591
4592    #[simd_test(enable = "avx2")]
4593    unsafe fn test_mm256_maskstore_epi64() {
4594        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4595        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4596        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4597        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4598        let e = [-1, 2, 3, -1];
4599        assert_eq!(arr, e);
4600    }
4601
4602    #[simd_test(enable = "avx2")]
4603    unsafe fn test_mm256_max_epi16() {
4604        let a = _mm256_set1_epi16(2);
4605        let b = _mm256_set1_epi16(4);
4606        let r = _mm256_max_epi16(a, b);
4607        assert_eq_m256i(r, b);
4608    }
4609
4610    #[simd_test(enable = "avx2")]
4611    unsafe fn test_mm256_max_epi32() {
4612        let a = _mm256_set1_epi32(2);
4613        let b = _mm256_set1_epi32(4);
4614        let r = _mm256_max_epi32(a, b);
4615        assert_eq_m256i(r, b);
4616    }
4617
4618    #[simd_test(enable = "avx2")]
4619    unsafe fn test_mm256_max_epi8() {
4620        let a = _mm256_set1_epi8(2);
4621        let b = _mm256_set1_epi8(4);
4622        let r = _mm256_max_epi8(a, b);
4623        assert_eq_m256i(r, b);
4624    }
4625
4626    #[simd_test(enable = "avx2")]
4627    unsafe fn test_mm256_max_epu16() {
4628        let a = _mm256_set1_epi16(2);
4629        let b = _mm256_set1_epi16(4);
4630        let r = _mm256_max_epu16(a, b);
4631        assert_eq_m256i(r, b);
4632    }
4633
4634    #[simd_test(enable = "avx2")]
4635    unsafe fn test_mm256_max_epu32() {
4636        let a = _mm256_set1_epi32(2);
4637        let b = _mm256_set1_epi32(4);
4638        let r = _mm256_max_epu32(a, b);
4639        assert_eq_m256i(r, b);
4640    }
4641
4642    #[simd_test(enable = "avx2")]
4643    unsafe fn test_mm256_max_epu8() {
4644        let a = _mm256_set1_epi8(2);
4645        let b = _mm256_set1_epi8(4);
4646        let r = _mm256_max_epu8(a, b);
4647        assert_eq_m256i(r, b);
4648    }
4649
4650    #[simd_test(enable = "avx2")]
4651    unsafe fn test_mm256_min_epi16() {
4652        let a = _mm256_set1_epi16(2);
4653        let b = _mm256_set1_epi16(4);
4654        let r = _mm256_min_epi16(a, b);
4655        assert_eq_m256i(r, a);
4656    }
4657
4658    #[simd_test(enable = "avx2")]
4659    unsafe fn test_mm256_min_epi32() {
4660        let a = _mm256_set1_epi32(2);
4661        let b = _mm256_set1_epi32(4);
4662        let r = _mm256_min_epi32(a, b);
4663        assert_eq_m256i(r, a);
4664    }
4665
4666    #[simd_test(enable = "avx2")]
4667    unsafe fn test_mm256_min_epi8() {
4668        let a = _mm256_set1_epi8(2);
4669        let b = _mm256_set1_epi8(4);
4670        let r = _mm256_min_epi8(a, b);
4671        assert_eq_m256i(r, a);
4672    }
4673
4674    #[simd_test(enable = "avx2")]
4675    unsafe fn test_mm256_min_epu16() {
4676        let a = _mm256_set1_epi16(2);
4677        let b = _mm256_set1_epi16(4);
4678        let r = _mm256_min_epu16(a, b);
4679        assert_eq_m256i(r, a);
4680    }
4681
4682    #[simd_test(enable = "avx2")]
4683    unsafe fn test_mm256_min_epu32() {
4684        let a = _mm256_set1_epi32(2);
4685        let b = _mm256_set1_epi32(4);
4686        let r = _mm256_min_epu32(a, b);
4687        assert_eq_m256i(r, a);
4688    }
4689
4690    #[simd_test(enable = "avx2")]
4691    unsafe fn test_mm256_min_epu8() {
4692        let a = _mm256_set1_epi8(2);
4693        let b = _mm256_set1_epi8(4);
4694        let r = _mm256_min_epu8(a, b);
4695        assert_eq_m256i(r, a);
4696    }
4697
4698    #[simd_test(enable = "avx2")]
4699    unsafe fn test_mm256_movemask_epi8() {
4700        let a = _mm256_set1_epi8(-1);
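        // Every byte has its sign bit set, so all 32 mask bits are set and the result is -1 as an i32.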
4701        let r = _mm256_movemask_epi8(a);
4702        let e = -1;
4703        assert_eq!(r, e);
4704    }
4705
4706    #[simd_test(enable = "avx2")]
4707    unsafe fn test_mm256_mpsadbw_epu8() {
4708        let a = _mm256_set1_epi8(2);
4709        let b = _mm256_set1_epi8(4);
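        // With a zero immediate, each result is the sum of absolute differences of a 4-byte group: |2 - 4| * 4 = 8.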
4710        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4711        let e = _mm256_set1_epi16(8);
4712        assert_eq_m256i(r, e);
4713    }
4714
4715    #[simd_test(enable = "avx2")]
4716    unsafe fn test_mm256_mul_epi32() {
4717        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4718        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
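        // Only the even-indexed 32-bit elements (0, 2, 4, 6) are multiplied, producing four 64-bit products.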
4719        let r = _mm256_mul_epi32(a, b);
4720        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4721        assert_eq_m256i(r, e);
4722    }
4723
4724    #[simd_test(enable = "avx2")]
4725    unsafe fn test_mm256_mul_epu32() {
4726        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4727        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4728        let r = _mm256_mul_epu32(a, b);
4729        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4730        assert_eq_m256i(r, e);
4731    }
4732
4733    #[simd_test(enable = "avx2")]
4734    unsafe fn test_mm256_mulhi_epi16() {
4735        let a = _mm256_set1_epi16(6535);
4736        let b = _mm256_set1_epi16(6535);
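        // 6535 * 6535 = 42_706_225 = 0x028B_A531, so the high 16 bits of each product are 0x028B = 651.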
4737        let r = _mm256_mulhi_epi16(a, b);
4738        let e = _mm256_set1_epi16(651);
4739        assert_eq_m256i(r, e);
4740    }
4741
4742    #[simd_test(enable = "avx2")]
4743    unsafe fn test_mm256_mulhi_epu16() {
4744        let a = _mm256_set1_epi16(6535);
4745        let b = _mm256_set1_epi16(6535);
4746        let r = _mm256_mulhi_epu16(a, b);
4747        let e = _mm256_set1_epi16(651);
4748        assert_eq_m256i(r, e);
4749    }
4750
4751    #[simd_test(enable = "avx2")]
4752    unsafe fn test_mm256_mullo_epi16() {
4753        let a = _mm256_set1_epi16(2);
4754        let b = _mm256_set1_epi16(4);
4755        let r = _mm256_mullo_epi16(a, b);
4756        let e = _mm256_set1_epi16(8);
4757        assert_eq_m256i(r, e);
4758    }
4759
4760    #[simd_test(enable = "avx2")]
4761    unsafe fn test_mm256_mullo_epi32() {
4762        let a = _mm256_set1_epi32(2);
4763        let b = _mm256_set1_epi32(4);
4764        let r = _mm256_mullo_epi32(a, b);
4765        let e = _mm256_set1_epi32(8);
4766        assert_eq_m256i(r, e);
4767    }
4768
4769    #[simd_test(enable = "avx2")]
4770    unsafe fn test_mm256_mulhrs_epi16() {
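        // `vpmulhrsw` computes ((a * b) >> 14 + 1) >> 1, so 0x7FFF * 0x7FFF rounds to 0x7FFE.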
4771        let a = _mm256_set1_epi16(0x7FFF);
4772        let b = _mm256_set1_epi16(0x7FFF);
4773        let r = _mm256_mulhrs_epi16(a, b);
4774        let e = _mm256_set1_epi16(0x7FFE);
4775        assert_eq_m256i(r, e);
4776    }
4777
4778    #[simd_test(enable = "avx2")]
4779    unsafe fn test_mm256_or_si256() {
4780        let a = _mm256_set1_epi8(-1);
4781        let b = _mm256_set1_epi8(0);
4782        let r = _mm256_or_si256(a, b);
4783        assert_eq_m256i(r, a);
4784    }
4785
4786    #[simd_test(enable = "avx2")]
4787    unsafe fn test_mm256_packs_epi16() {
4788        let a = _mm256_set1_epi16(2);
4789        let b = _mm256_set1_epi16(4);
4790        let r = _mm256_packs_epi16(a, b);
4791        #[rustfmt::skip]
4792        let e = _mm256_setr_epi8(
4793            2, 2, 2, 2, 2, 2, 2, 2,
4794            4, 4, 4, 4, 4, 4, 4, 4,
4795            2, 2, 2, 2, 2, 2, 2, 2,
4796            4, 4, 4, 4, 4, 4, 4, 4,
4797        );
4798
4799        assert_eq_m256i(r, e);
4800    }
4801
4802    #[simd_test(enable = "avx2")]
4803    unsafe fn test_mm256_packs_epi32() {
4804        let a = _mm256_set1_epi32(2);
4805        let b = _mm256_set1_epi32(4);
4806        let r = _mm256_packs_epi32(a, b);
4807        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4808
4809        assert_eq_m256i(r, e);
4810    }
4811
4812    #[simd_test(enable = "avx2")]
4813    unsafe fn test_mm256_packus_epi16() {
4814        let a = _mm256_set1_epi16(2);
4815        let b = _mm256_set1_epi16(4);
4816        let r = _mm256_packus_epi16(a, b);
4817        #[rustfmt::skip]
4818        let e = _mm256_setr_epi8(
4819            2, 2, 2, 2, 2, 2, 2, 2,
4820            4, 4, 4, 4, 4, 4, 4, 4,
4821            2, 2, 2, 2, 2, 2, 2, 2,
4822            4, 4, 4, 4, 4, 4, 4, 4,
4823        );
4824
4825        assert_eq_m256i(r, e);
4826    }
4827
4828    #[simd_test(enable = "avx2")]
4829    unsafe fn test_mm256_packus_epi32() {
4830        let a = _mm256_set1_epi32(2);
4831        let b = _mm256_set1_epi32(4);
4832        let r = _mm256_packus_epi32(a, b);
4833        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4834
4835        assert_eq_m256i(r, e);
4836    }
4837
4838    #[simd_test(enable = "avx2")]
4839    unsafe fn test_mm256_sad_epu8() {
4840        let a = _mm256_set1_epi8(2);
4841        let b = _mm256_set1_epi8(4);
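        // Each 64-bit result sums |2 - 4| over 8 bytes: 2 * 8 = 16.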
4842        let r = _mm256_sad_epu8(a, b);
4843        let e = _mm256_set1_epi64x(16);
4844        assert_eq_m256i(r, e);
4845    }
4846
4847    #[simd_test(enable = "avx2")]
4848    unsafe fn test_mm256_shufflehi_epi16() {
4849        #[rustfmt::skip]
4850        let a = _mm256_setr_epi16(
4851            0, 1, 2, 3, 11, 22, 33, 44,
4852            4, 5, 6, 7, 55, 66, 77, 88,
4853        );
4854        #[rustfmt::skip]
4855        let e = _mm256_setr_epi16(
4856            0, 1, 2, 3, 44, 22, 22, 11,
4857            4, 5, 6, 7, 88, 66, 66, 55,
4858        );
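        // Only the upper four 16-bit elements of each 128-bit lane are shuffled; the lower four pass through unchanged.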
4859        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
4860        assert_eq_m256i(r, e);
4861    }
4862
4863    #[simd_test(enable = "avx2")]
4864    unsafe fn test_mm256_shufflelo_epi16() {
4865        #[rustfmt::skip]
4866        let a = _mm256_setr_epi16(
4867            11, 22, 33, 44, 0, 1, 2, 3,
4868            55, 66, 77, 88, 4, 5, 6, 7,
4869        );
4870        #[rustfmt::skip]
4871        let e = _mm256_setr_epi16(
4872            44, 22, 22, 11, 0, 1, 2, 3,
4873            88, 66, 66, 55, 4, 5, 6, 7,
4874        );
4875        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
4876        assert_eq_m256i(r, e);
4877    }
4878
4879    #[simd_test(enable = "avx2")]
4880    unsafe fn test_mm256_sign_epi16() {
4881        let a = _mm256_set1_epi16(2);
4882        let b = _mm256_set1_epi16(-1);
4883        let r = _mm256_sign_epi16(a, b);
4884        let e = _mm256_set1_epi16(-2);
4885        assert_eq_m256i(r, e);
4886    }
4887
4888    #[simd_test(enable = "avx2")]
4889    unsafe fn test_mm256_sign_epi32() {
4890        let a = _mm256_set1_epi32(2);
4891        let b = _mm256_set1_epi32(-1);
4892        let r = _mm256_sign_epi32(a, b);
4893        let e = _mm256_set1_epi32(-2);
4894        assert_eq_m256i(r, e);
4895    }
4896
4897    #[simd_test(enable = "avx2")]
4898    unsafe fn test_mm256_sign_epi8() {
4899        let a = _mm256_set1_epi8(2);
4900        let b = _mm256_set1_epi8(-1);
4901        let r = _mm256_sign_epi8(a, b);
4902        let e = _mm256_set1_epi8(-2);
4903        assert_eq_m256i(r, e);
4904    }
4905
4906    #[simd_test(enable = "avx2")]
4907    unsafe fn test_mm256_sll_epi16() {
4908        let a = _mm256_set1_epi16(0xFF);
4909        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
4910        let r = _mm256_sll_epi16(a, b);
4911        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
4912    }
4913
4914    #[simd_test(enable = "avx2")]
4915    unsafe fn test_mm256_sll_epi32() {
4916        let a = _mm256_set1_epi32(0xFFFF);
4917        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
4918        let r = _mm256_sll_epi32(a, b);
4919        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
4920    }
4921
4922    #[simd_test(enable = "avx2")]
4923    unsafe fn test_mm256_sll_epi64() {
4924        let a = _mm256_set1_epi64x(0xFFFFFFFF);
4925        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
4926        let r = _mm256_sll_epi64(a, b);
4927        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
4928    }
4929
4930    #[simd_test(enable = "avx2")]
4931    unsafe fn test_mm256_slli_epi16() {
4932        assert_eq_m256i(
4933            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
4934            _mm256_set1_epi16(0xFF0),
4935        );
4936    }
4937
4938    #[simd_test(enable = "avx2")]
4939    unsafe fn test_mm256_slli_epi32() {
4940        assert_eq_m256i(
4941            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
4942            _mm256_set1_epi32(0xFFFF0),
4943        );
4944    }
4945
4946    #[simd_test(enable = "avx2")]
4947    unsafe fn test_mm256_slli_epi64() {
4948        assert_eq_m256i(
4949            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
4950            _mm256_set1_epi64x(0xFFFFFFFF0),
4951        );
4952    }
4953
4954    #[simd_test(enable = "avx2")]
4955    unsafe fn test_mm256_slli_si256() {
4956        let a = _mm256_set1_epi64x(0xFFFFFFFF);
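        // The byte shift operates on each 128-bit lane independently.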
4957        let r = _mm256_slli_si256::<3>(a);
4958        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
4959    }
4960
4961    #[simd_test(enable = "avx2")]
4962    unsafe fn test_mm_sllv_epi32() {
4963        let a = _mm_set1_epi32(2);
4964        let b = _mm_set1_epi32(1);
4965        let r = _mm_sllv_epi32(a, b);
4966        let e = _mm_set1_epi32(4);
4967        assert_eq_m128i(r, e);
4968    }
4969
4970    #[simd_test(enable = "avx2")]
4971    unsafe fn test_mm256_sllv_epi32() {
4972        let a = _mm256_set1_epi32(2);
4973        let b = _mm256_set1_epi32(1);
4974        let r = _mm256_sllv_epi32(a, b);
4975        let e = _mm256_set1_epi32(4);
4976        assert_eq_m256i(r, e);
4977    }
4978
4979    #[simd_test(enable = "avx2")]
4980    unsafe fn test_mm_sllv_epi64() {
4981        let a = _mm_set1_epi64x(2);
4982        let b = _mm_set1_epi64x(1);
4983        let r = _mm_sllv_epi64(a, b);
4984        let e = _mm_set1_epi64x(4);
4985        assert_eq_m128i(r, e);
4986    }
4987
4988    #[simd_test(enable = "avx2")]
4989    unsafe fn test_mm256_sllv_epi64() {
4990        let a = _mm256_set1_epi64x(2);
4991        let b = _mm256_set1_epi64x(1);
4992        let r = _mm256_sllv_epi64(a, b);
4993        let e = _mm256_set1_epi64x(4);
4994        assert_eq_m256i(r, e);
4995    }
4996
4997    #[simd_test(enable = "avx2")]
4998    unsafe fn test_mm256_sra_epi16() {
4999        let a = _mm256_set1_epi16(-1);
5000        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5001        let r = _mm256_sra_epi16(a, b);
5002        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5003    }
5004
5005    #[simd_test(enable = "avx2")]
5006    unsafe fn test_mm256_sra_epi32() {
5007        let a = _mm256_set1_epi32(-1);
5008        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5009        let r = _mm256_sra_epi32(a, b);
5010        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5011    }
5012
5013    #[simd_test(enable = "avx2")]
5014    unsafe fn test_mm256_srai_epi16() {
5015        assert_eq_m256i(
5016            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5017            _mm256_set1_epi16(-1),
5018        );
5019    }
5020
5021    #[simd_test(enable = "avx2")]
5022    unsafe fn test_mm256_srai_epi32() {
5023        assert_eq_m256i(
5024            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5025            _mm256_set1_epi32(-1),
5026        );
5027    }
5028
5029    #[simd_test(enable = "avx2")]
5030    unsafe fn test_mm_srav_epi32() {
5031        let a = _mm_set1_epi32(4);
5032        let count = _mm_set1_epi32(1);
5033        let r = _mm_srav_epi32(a, count);
5034        let e = _mm_set1_epi32(2);
5035        assert_eq_m128i(r, e);
5036    }
5037
5038    #[simd_test(enable = "avx2")]
5039    unsafe fn test_mm256_srav_epi32() {
5040        let a = _mm256_set1_epi32(4);
5041        let count = _mm256_set1_epi32(1);
5042        let r = _mm256_srav_epi32(a, count);
5043        let e = _mm256_set1_epi32(2);
5044        assert_eq_m256i(r, e);
5045    }
5046
5047    #[simd_test(enable = "avx2")]
5048    unsafe fn test_mm256_srli_si256() {
5049        #[rustfmt::skip]
5050        let a = _mm256_setr_epi8(
5051            1, 2, 3, 4, 5, 6, 7, 8,
5052            9, 10, 11, 12, 13, 14, 15, 16,
5053            17, 18, 19, 20, 21, 22, 23, 24,
5054            25, 26, 27, 28, 29, 30, 31, 32,
5055        );
5056        let r = _mm256_srli_si256::<3>(a);
5057        #[rustfmt::skip]
5058        let e = _mm256_setr_epi8(
5059            4, 5, 6, 7, 8, 9, 10, 11,
5060            12, 13, 14, 15, 16, 0, 0, 0,
5061            20, 21, 22, 23, 24, 25, 26, 27,
5062            28, 29, 30, 31, 32, 0, 0, 0,
5063        );
5064        assert_eq_m256i(r, e);
5065    }
5066
5067    #[simd_test(enable = "avx2")]
5068    unsafe fn test_mm256_srl_epi16() {
5069        let a = _mm256_set1_epi16(0xFF);
5070        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5071        let r = _mm256_srl_epi16(a, b);
5072        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5073    }
5074
5075    #[simd_test(enable = "avx2")]
5076    unsafe fn test_mm256_srl_epi32() {
5077        let a = _mm256_set1_epi32(0xFFFF);
5078        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5079        let r = _mm256_srl_epi32(a, b);
5080        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5081    }
5082
5083    #[simd_test(enable = "avx2")]
5084    unsafe fn test_mm256_srl_epi64() {
5085        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5086        let b = _mm_setr_epi64x(4, 0);
5087        let r = _mm256_srl_epi64(a, b);
5088        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5089    }
5090
5091    #[simd_test(enable = "avx2")]
5092    unsafe fn test_mm256_srli_epi16() {
5093        assert_eq_m256i(
5094            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5095            _mm256_set1_epi16(0xF),
5096        );
5097    }
5098
5099    #[simd_test(enable = "avx2")]
5100    unsafe fn test_mm256_srli_epi32() {
5101        assert_eq_m256i(
5102            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5103            _mm256_set1_epi32(0xFFF),
5104        );
5105    }
5106
5107    #[simd_test(enable = "avx2")]
5108    unsafe fn test_mm256_srli_epi64() {
5109        assert_eq_m256i(
5110            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5111            _mm256_set1_epi64x(0xFFFFFFF),
5112        );
5113    }
5114
5115    #[simd_test(enable = "avx2")]
5116    unsafe fn test_mm_srlv_epi32() {
5117        let a = _mm_set1_epi32(2);
5118        let count = _mm_set1_epi32(1);
5119        let r = _mm_srlv_epi32(a, count);
5120        let e = _mm_set1_epi32(1);
5121        assert_eq_m128i(r, e);
5122    }
5123
5124    #[simd_test(enable = "avx2")]
5125    unsafe fn test_mm256_srlv_epi32() {
5126        let a = _mm256_set1_epi32(2);
5127        let count = _mm256_set1_epi32(1);
5128        let r = _mm256_srlv_epi32(a, count);
5129        let e = _mm256_set1_epi32(1);
5130        assert_eq_m256i(r, e);
5131    }
5132
5133    #[simd_test(enable = "avx2")]
5134    unsafe fn test_mm_srlv_epi64() {
5135        let a = _mm_set1_epi64x(2);
5136        let count = _mm_set1_epi64x(1);
5137        let r = _mm_srlv_epi64(a, count);
5138        let e = _mm_set1_epi64x(1);
5139        assert_eq_m128i(r, e);
5140    }
5141
5142    #[simd_test(enable = "avx2")]
5143    unsafe fn test_mm256_srlv_epi64() {
5144        let a = _mm256_set1_epi64x(2);
5145        let count = _mm256_set1_epi64x(1);
5146        let r = _mm256_srlv_epi64(a, count);
5147        let e = _mm256_set1_epi64x(1);
5148        assert_eq_m256i(r, e);
5149    }
5150
5151    #[simd_test(enable = "avx2")]
5152    unsafe fn test_mm256_stream_load_si256() {
5153        let a = _mm256_set_epi64x(5, 6, 7, 8);
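        // `addr_of!(a)` is 32-byte aligned because `__m256i` requires 32-byte alignment, as `vmovntdqa` demands.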
5154        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5155        assert_eq_m256i(a, r);
5156    }
5157
5158    #[simd_test(enable = "avx2")]
5159    unsafe fn test_mm256_sub_epi16() {
5160        let a = _mm256_set1_epi16(4);
5161        let b = _mm256_set1_epi16(2);
5162        let r = _mm256_sub_epi16(a, b);
5163        assert_eq_m256i(r, b);
5164    }
5165
5166    #[simd_test(enable = "avx2")]
5167    unsafe fn test_mm256_sub_epi32() {
5168        let a = _mm256_set1_epi32(4);
5169        let b = _mm256_set1_epi32(2);
5170        let r = _mm256_sub_epi32(a, b);
5171        assert_eq_m256i(r, b);
5172    }
5173
5174    #[simd_test(enable = "avx2")]
5175    unsafe fn test_mm256_sub_epi64() {
5176        let a = _mm256_set1_epi64x(4);
5177        let b = _mm256_set1_epi64x(2);
5178        let r = _mm256_sub_epi64(a, b);
5179        assert_eq_m256i(r, b);
5180    }
5181
5182    #[simd_test(enable = "avx2")]
5183    unsafe fn test_mm256_sub_epi8() {
5184        let a = _mm256_set1_epi8(4);
5185        let b = _mm256_set1_epi8(2);
5186        let r = _mm256_sub_epi8(a, b);
5187        assert_eq_m256i(r, b);
5188    }
5189
5190    #[simd_test(enable = "avx2")]
5191    unsafe fn test_mm256_subs_epi16() {
5192        let a = _mm256_set1_epi16(4);
5193        let b = _mm256_set1_epi16(2);
5194        let r = _mm256_subs_epi16(a, b);
5195        assert_eq_m256i(r, b);
5196    }
5197
5198    #[simd_test(enable = "avx2")]
5199    unsafe fn test_mm256_subs_epi8() {
5200        let a = _mm256_set1_epi8(4);
5201        let b = _mm256_set1_epi8(2);
5202        let r = _mm256_subs_epi8(a, b);
5203        assert_eq_m256i(r, b);
5204    }
5205
5206    #[simd_test(enable = "avx2")]
5207    unsafe fn test_mm256_subs_epu16() {
5208        let a = _mm256_set1_epi16(4);
5209        let b = _mm256_set1_epi16(2);
5210        let r = _mm256_subs_epu16(a, b);
5211        assert_eq_m256i(r, b);
5212    }
5213
5214    #[simd_test(enable = "avx2")]
5215    unsafe fn test_mm256_subs_epu8() {
5216        let a = _mm256_set1_epi8(4);
5217        let b = _mm256_set1_epi8(2);
5218        let r = _mm256_subs_epu8(a, b);
5219        assert_eq_m256i(r, b);
5220    }
5221
5222    #[simd_test(enable = "avx2")]
5223    unsafe fn test_mm256_xor_si256() {
5224        let a = _mm256_set1_epi8(5);
5225        let b = _mm256_set1_epi8(3);
5226        let r = _mm256_xor_si256(a, b);
5227        assert_eq_m256i(r, _mm256_set1_epi8(6));
5228    }
5229
5230    #[simd_test(enable = "avx2")]
5231    unsafe fn test_mm256_alignr_epi8() {
5232        #[rustfmt::skip]
5233        let a = _mm256_setr_epi8(
5234            1, 2, 3, 4, 5, 6, 7, 8,
5235            9, 10, 11, 12, 13, 14, 15, 16,
5236            17, 18, 19, 20, 21, 22, 23, 24,
5237            25, 26, 27, 28, 29, 30, 31, 32,
5238        );
5239        #[rustfmt::skip]
5240        let b = _mm256_setr_epi8(
5241            -1, -2, -3, -4, -5, -6, -7, -8,
5242            -9, -10, -11, -12, -13, -14, -15, -16,
5243            -17, -18, -19, -20, -21, -22, -23, -24,
5244            -25, -26, -27, -28, -29, -30, -31, -32,
5245        );
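        // The concatenation and byte shift are done per 128-bit lane; shift counts of 32 or more bytes produce zero.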
5246        let r = _mm256_alignr_epi8::<33>(a, b);
5247        assert_eq_m256i(r, _mm256_set1_epi8(0));
5248
5249        let r = _mm256_alignr_epi8::<17>(a, b);
5250        #[rustfmt::skip]
5251        let expected = _mm256_setr_epi8(
5252            2, 3, 4, 5, 6, 7, 8, 9,
5253            10, 11, 12, 13, 14, 15, 16, 0,
5254            18, 19, 20, 21, 22, 23, 24, 25,
5255            26, 27, 28, 29, 30, 31, 32, 0,
5256        );
5257        assert_eq_m256i(r, expected);
5258
5259        let r = _mm256_alignr_epi8::<4>(a, b);
5260        #[rustfmt::skip]
5261        let expected = _mm256_setr_epi8(
5262            -5, -6, -7, -8, -9, -10, -11, -12,
5263            -13, -14, -15, -16, 1, 2, 3, 4,
5264            -21, -22, -23, -24, -25, -26, -27, -28,
5265            -29, -30, -31, -32, 17, 18, 19, 20,
5266        );
5267        assert_eq_m256i(r, expected);
5268
5269        let r = _mm256_alignr_epi8::<15>(a, b);
5270        #[rustfmt::skip]
5271        let expected = _mm256_setr_epi8(
5272            -16, 1, 2, 3, 4, 5, 6, 7,
5273            8, 9, 10, 11, 12, 13, 14, 15,
5274            -32, 17, 18, 19, 20, 21, 22, 23,
5275            24, 25, 26, 27, 28, 29, 30, 31,
5276        );
5277        assert_eq_m256i(r, expected);
5278
5279        let r = _mm256_alignr_epi8::<0>(a, b);
5280        assert_eq_m256i(r, b);
5281
5282        let r = _mm256_alignr_epi8::<16>(a, b);
5283        assert_eq_m256i(r, a);
5284    }
5285
5286    #[simd_test(enable = "avx2")]
5287    unsafe fn test_mm256_shuffle_epi8() {
5288        #[rustfmt::skip]
5289        let a = _mm256_setr_epi8(
5290            1, 2, 3, 4, 5, 6, 7, 8,
5291            9, 10, 11, 12, 13, 14, 15, 16,
5292            17, 18, 19, 20, 21, 22, 23, 24,
5293            25, 26, 27, 28, 29, 30, 31, 32,
5294        );
5295        #[rustfmt::skip]
5296        let b = _mm256_setr_epi8(
5297            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5298            12, 5, 5, 10, 4, 1, 8, 0,
5299            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5300            12, 5, 5, 10, 4, 1, 8, 0,
5301        );
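        // Index bytes with the high bit set (128) zero the output byte; the low 4 bits index within the same 128-bit lane.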
5302        #[rustfmt::skip]
5303        let expected = _mm256_setr_epi8(
5304            5, 0, 5, 4, 9, 13, 7, 4,
5305            13, 6, 6, 11, 5, 2, 9, 1,
5306            21, 0, 21, 20, 25, 29, 23, 20,
5307            29, 22, 22, 27, 21, 18, 25, 17,
5308        );
5309        let r = _mm256_shuffle_epi8(a, b);
5310        assert_eq_m256i(r, expected);
5311    }
5312
5313    #[simd_test(enable = "avx2")]
5314    unsafe fn test_mm256_permutevar8x32_epi32() {
5315        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5316        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5317        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5318        let r = _mm256_permutevar8x32_epi32(a, b);
5319        assert_eq_m256i(r, expected);
5320    }
5321
5322    #[simd_test(enable = "avx2")]
5323    unsafe fn test_mm256_permute4x64_epi64() {
5324        let a = _mm256_setr_epi64x(100, 200, 300, 400);
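        // The immediate 0b00_01_00_11 selects source elements 3, 0, 1, 0.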
5325        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5326        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5327        assert_eq_m256i(r, expected);
5328    }
5329
5330    #[simd_test(enable = "avx2")]
5331    unsafe fn test_mm256_permute2x128_si256() {
5332        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5333        let b = _mm256_setr_epi64x(300, 400, 700, 800);
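        // The low field (3) selects the high 128 bits of `b`; the high field (1) selects the high 128 bits of `a`.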
5334        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5335        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5336        assert_eq_m256i(r, e);
5337    }
5338
5339    #[simd_test(enable = "avx2")]
5340    unsafe fn test_mm256_permute4x64_pd() {
5341        let a = _mm256_setr_pd(1., 2., 3., 4.);
5342        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5343        let e = _mm256_setr_pd(4., 1., 2., 1.);
5344        assert_eq_m256d(r, e);
5345    }
5346
5347    #[simd_test(enable = "avx2")]
5348    unsafe fn test_mm256_permutevar8x32_ps() {
5349        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5350        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5351        let r = _mm256_permutevar8x32_ps(a, b);
5352        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5353        assert_eq_m256(r, e);
5354    }
5355
5356    #[simd_test(enable = "avx2")]
5357    unsafe fn test_mm_i32gather_epi32() {
5358        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5359        // A multiplier of 4 is word-addressing
5360        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5361        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5362    }
5363
5364    #[simd_test(enable = "avx2")]
5365    unsafe fn test_mm_mask_i32gather_epi32() {
5366        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5367        // A multiplier of 4 is word-addressing
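        // Lanes whose mask element has the sign bit clear keep the corresponding value from `src` (256 here).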
5368        let r = _mm_mask_i32gather_epi32::<4>(
5369            _mm_set1_epi32(256),
5370            arr.as_ptr(),
5371            _mm_setr_epi32(0, 16, 64, 96),
5372            _mm_setr_epi32(-1, -1, -1, 0),
5373        );
5374        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5375    }
5376
5377    #[simd_test(enable = "avx2")]
5378    unsafe fn test_mm256_i32gather_epi32() {
5379        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5380        // A multiplier of 4 is word-addressing
5381        let r =
5382            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5383        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5384    }
5385
5386    #[simd_test(enable = "avx2")]
5387    unsafe fn test_mm256_mask_i32gather_epi32() {
5388        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5389        // A multiplier of 4 is word-addressing
5390        let r = _mm256_mask_i32gather_epi32::<4>(
5391            _mm256_set1_epi32(256),
5392            arr.as_ptr(),
5393            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5394            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5395        );
5396        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5397    }
5398
5399    #[simd_test(enable = "avx2")]
5400    unsafe fn test_mm_i32gather_ps() {
5401        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5402        // A multiplier of 4 is word-addressing for f32s
5403        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5404        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5405    }
5406
5407    #[simd_test(enable = "avx2")]
5408    unsafe fn test_mm_mask_i32gather_ps() {
5409        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5410        // A multiplier of 4 is word-addressing for f32s
5411        let r = _mm_mask_i32gather_ps::<4>(
5412            _mm_set1_ps(256.0),
5413            arr.as_ptr(),
5414            _mm_setr_epi32(0, 16, 64, 96),
5415            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5416        );
5417        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5418    }
5419
5420    #[simd_test(enable = "avx2")]
5421    unsafe fn test_mm256_i32gather_ps() {
5422        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5423        // A multiplier of 4 is word-addressing for f32s
5424        let r =
5425            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5426        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5427    }
5428
5429    #[simd_test(enable = "avx2")]
5430    unsafe fn test_mm256_mask_i32gather_ps() {
5431        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5432        // A multiplier of 4 is word-addressing for f32s
5433        let r = _mm256_mask_i32gather_ps::<4>(
5434            _mm256_set1_ps(256.0),
5435            arr.as_ptr(),
5436            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5437            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5438        );
5439        assert_eq_m256(
5440            r,
5441            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5442        );
5443    }
5444
5445    #[simd_test(enable = "avx2")]
5446    unsafe fn test_mm_i32gather_epi64() {
5447        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5448        // A multiplier of 8 is word-addressing for i64s
5449        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5450        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5451    }
5452
5453    #[simd_test(enable = "avx2")]
5454    unsafe fn test_mm_mask_i32gather_epi64() {
5455        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5456        // A multiplier of 8 is word-addressing for i64s
5457        let r = _mm_mask_i32gather_epi64::<8>(
5458            _mm_set1_epi64x(256),
5459            arr.as_ptr(),
5460            _mm_setr_epi32(16, 16, 16, 16),
5461            _mm_setr_epi64x(-1, 0),
5462        );
5463        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5464    }
5465
5466    #[simd_test(enable = "avx2")]
5467    unsafe fn test_mm256_i32gather_epi64() {
5468        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5469        // A multiplier of 8 is word-addressing for i64s
5470        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5471        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5472    }
5473
5474    #[simd_test(enable = "avx2")]
5475    unsafe fn test_mm256_mask_i32gather_epi64() {
5476        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5477        // A multiplier of 8 is word-addressing for i64s
5478        let r = _mm256_mask_i32gather_epi64::<8>(
5479            _mm256_set1_epi64x(256),
5480            arr.as_ptr(),
5481            _mm_setr_epi32(0, 16, 64, 96),
5482            _mm256_setr_epi64x(-1, -1, -1, 0),
5483        );
5484        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5485    }
5486
5487    #[simd_test(enable = "avx2")]
5488    unsafe fn test_mm_i32gather_pd() {
5489        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5490        // A multiplier of 8 is word-addressing for f64s
5491        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5492        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5493    }
5494
5495    #[simd_test(enable = "avx2")]
5496    unsafe fn test_mm_mask_i32gather_pd() {
5497        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5498        // A multiplier of 8 is word-addressing for f64s
5499        let r = _mm_mask_i32gather_pd::<8>(
5500            _mm_set1_pd(256.0),
5501            arr.as_ptr(),
5502            _mm_setr_epi32(16, 16, 16, 16),
5503            _mm_setr_pd(-1.0, 0.0),
5504        );
5505        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5506    }
5507
5508    #[simd_test(enable = "avx2")]
5509    unsafe fn test_mm256_i32gather_pd() {
5510        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5511        // A multiplier of 8 is word-addressing for f64s
5512        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5513        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5514    }
5515
5516    #[simd_test(enable = "avx2")]
5517    unsafe fn test_mm256_mask_i32gather_pd() {
5518        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5519        // A multiplier of 8 is word-addressing for f64s
5520        let r = _mm256_mask_i32gather_pd::<8>(
5521            _mm256_set1_pd(256.0),
5522            arr.as_ptr(),
5523            _mm_setr_epi32(0, 16, 64, 96),
5524            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5525        );
5526        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5527    }
5528
5529    #[simd_test(enable = "avx2")]
5530    unsafe fn test_mm_i64gather_epi32() {
5531        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5532        // A multiplier of 4 is word-addressing
5533        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5534        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5535    }
5536
5537    #[simd_test(enable = "avx2")]
5538    unsafe fn test_mm_mask_i64gather_epi32() {
5539        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5540        // A multiplier of 4 is word-addressing
5541        let r = _mm_mask_i64gather_epi32::<4>(
5542            _mm_set1_epi32(256),
5543            arr.as_ptr(),
5544            _mm_setr_epi64x(0, 16),
5545            _mm_setr_epi32(-1, 0, -1, 0),
5546        );
5547        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5548    }
5549
5550    #[simd_test(enable = "avx2")]
5551    unsafe fn test_mm256_i64gather_epi32() {
5552        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5553        // A multiplier of 4 is word-addressing
5554        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5555        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5556    }
5557
5558    #[simd_test(enable = "avx2")]
5559    unsafe fn test_mm256_mask_i64gather_epi32() {
5560        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5561        // A multiplier of 4 is word-addressing
5562        let r = _mm256_mask_i64gather_epi32::<4>(
5563            _mm_set1_epi32(256),
5564            arr.as_ptr(),
5565            _mm256_setr_epi64x(0, 16, 64, 96),
5566            _mm_setr_epi32(-1, -1, -1, 0),
5567        );
5568        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5569    }
5570
5571    #[simd_test(enable = "avx2")]
5572    unsafe fn test_mm_i64gather_ps() {
5573        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5574        // A multiplier of 4 is word-addressing for f32s
5575        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5576        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5577    }
5578
5579    #[simd_test(enable = "avx2")]
5580    unsafe fn test_mm_mask_i64gather_ps() {
5581        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5582        // A multiplier of 4 is word-addressing for f32s
5583        let r = _mm_mask_i64gather_ps::<4>(
5584            _mm_set1_ps(256.0),
5585            arr.as_ptr(),
5586            _mm_setr_epi64x(0, 16),
5587            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5588        );
5589        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5590    }
5591
5592    #[simd_test(enable = "avx2")]
5593    unsafe fn test_mm256_i64gather_ps() {
5594        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5595        // A multiplier of 4 is word-addressing for f32s
5596        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5597        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5598    }
5599
5600    #[simd_test(enable = "avx2")]
5601    unsafe fn test_mm256_mask_i64gather_ps() {
5602        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5603        // A multiplier of 4 is word-addressing for f32s
5604        let r = _mm256_mask_i64gather_ps::<4>(
5605            _mm_set1_ps(256.0),
5606            arr.as_ptr(),
5607            _mm256_setr_epi64x(0, 16, 64, 96),
5608            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5609        );
5610        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5611    }
5612
5613    #[simd_test(enable = "avx2")]
5614    unsafe fn test_mm_i64gather_epi64() {
5615        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5616        // A multiplier of 8 is word-addressing for i64s
5617        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5618        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5619    }
5620
5621    #[simd_test(enable = "avx2")]
5622    unsafe fn test_mm_mask_i64gather_epi64() {
5623        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5624        // A multiplier of 8 is word-addressing for i64s
5625        let r = _mm_mask_i64gather_epi64::<8>(
5626            _mm_set1_epi64x(256),
5627            arr.as_ptr(),
5628            _mm_setr_epi64x(16, 16),
5629            _mm_setr_epi64x(-1, 0),
5630        );
5631        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5632    }
5633
5634    #[simd_test(enable = "avx2")]
5635    unsafe fn test_mm256_i64gather_epi64() {
5636        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5637        // A multiplier of 8 is word-addressing for i64s
5638        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5639        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5640    }
5641
5642    #[simd_test(enable = "avx2")]
5643    unsafe fn test_mm256_mask_i64gather_epi64() {
5644        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5645        // A multiplier of 8 is word-addressing for i64s
5646        let r = _mm256_mask_i64gather_epi64::<8>(
5647            _mm256_set1_epi64x(256),
5648            arr.as_ptr(),
5649            _mm256_setr_epi64x(0, 16, 64, 96),
5650            _mm256_setr_epi64x(-1, -1, -1, 0),
5651        );
5652        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5653    }
5654
5655    #[simd_test(enable = "avx2")]
5656    unsafe fn test_mm_i64gather_pd() {
5657        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5658        // A multiplier of 8 is word-addressing for f64s
5659        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5660        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5661    }
5662
5663    #[simd_test(enable = "avx2")]
5664    unsafe fn test_mm_mask_i64gather_pd() {
5665        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5666        // A multiplier of 8 is word-addressing for f64s
5667        let r = _mm_mask_i64gather_pd::<8>(
5668            _mm_set1_pd(256.0),
5669            arr.as_ptr(),
5670            _mm_setr_epi64x(16, 16),
5671            _mm_setr_pd(-1.0, 0.0),
5672        );
5673        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5674    }
5675
5676    #[simd_test(enable = "avx2")]
5677    unsafe fn test_mm256_i64gather_pd() {
5678        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5679        // A multiplier of 8 is word-addressing for f64s
5680        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5681        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5682    }
5683
5684    #[simd_test(enable = "avx2")]
5685    unsafe fn test_mm256_mask_i64gather_pd() {
5686        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5687        // A multiplier of 8 is word-addressing for f64s
5688        let r = _mm256_mask_i64gather_pd::<8>(
5689            _mm256_set1_pd(256.0),
5690            arr.as_ptr(),
5691            _mm256_setr_epi64x(0, 16, 64, 96),
5692            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5693        );
5694        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5695    }
5696
5697    #[simd_test(enable = "avx2")]
5698    unsafe fn test_mm256_extract_epi8() {
5699        #[rustfmt::skip]
5700        let a = _mm256_setr_epi8(
5701            -1, 1, 2, 3, 4, 5, 6, 7,
5702            8, 9, 10, 11, 12, 13, 14, 15,
5703            16, 17, 18, 19, 20, 21, 22, 23,
5704            24, 25, 26, 27, 28, 29, 30, 31
5705        );
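        // The extracted byte is zero-extended to `i32`, so the -1 at index 0 reads back as 0xFF.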
5706        let r1 = _mm256_extract_epi8::<0>(a);
5707        let r2 = _mm256_extract_epi8::<3>(a);
5708        assert_eq!(r1, 0xFF);
5709        assert_eq!(r2, 3);
5710    }
5711
5712    #[simd_test(enable = "avx2")]
5713    unsafe fn test_mm256_extract_epi16() {
5714        #[rustfmt::skip]
5715        let a = _mm256_setr_epi16(
5716            -1, 1, 2, 3, 4, 5, 6, 7,
5717            8, 9, 10, 11, 12, 13, 14, 15,
5718        );
5719        let r1 = _mm256_extract_epi16::<0>(a);
5720        let r2 = _mm256_extract_epi16::<3>(a);
5721        assert_eq!(r1, 0xFFFF);
5722        assert_eq!(r2, 3);
5723    }
5724}