BitMagic-C++
bmavx2.h
1#ifndef BMAVX2__H__INCLUDED__
2#define BMAVX2__H__INCLUDED__
3/*
4Copyright(c) 2002-2022 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21// some of the algorithms here are based on a modified version of the libpopcnt library by Kim Walisch
22// https://github.com/kimwalisch/libpopcnt/
23//
24/*
25 * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
26 * population count) in an array as quickly as possible using
27 * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
28 *
29 * Copyright (c) 2016 - 2017, Kim Walisch
30 * Copyright (c) 2016 - 2017, Wojciech Muła
31 *
32 * All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions are met:
36 *
37 * 1. Redistributions of source code must retain the above copyright notice, this
38 * list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright notice,
40 * this list of conditions and the following disclaimer in the documentation
41 * and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
47 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
48 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
49 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
52 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 */
54
55
56/** @defgroup AVX2 AVX2 functions
57 Processor specific optimizations for AVX2 instructions (internals)
58 @ingroup bvector
59 @internal
60 */
61
62
63// Header implements processor specific intrinsics declarations for AVX2
64// instruction set
65//
66#include<emmintrin.h>
67#include<immintrin.h>
68
69#include "bmdef.h"
70#include "bmbmi2.h"
71#include "bmutil.h"
72
73namespace bm
74{
75
76// debugging utils
77#if 0
78inline
79void avx2_print256_u32(const char* prefix, const __m256i & value)
80{
81 const size_t n = sizeof(__m256i) / sizeof(unsigned);
82 unsigned buffer[n];
83 _mm256_storeu_si256((__m256i*)buffer, value);
84 std::cout << prefix << " [ ";
85 for (int i = n-1; 1; --i)
86 {
87 std::cout << std::hex << buffer[i] << " ";
88 if (i == 0)
89 break;
90 }
91 std::cout << "]" << std::endl;
92}
93
94inline
95void avx2_print256_u16(const char* prefix, const __m256i & value)
96{
97 const size_t n = sizeof(__m256i) / sizeof(unsigned short);
98 unsigned short buffer[n];
99 _mm256_storeu_si256((__m256i*)buffer, value);
100 std::cout << prefix << " [ ";
101 for (int i = n-1; 1; --i)
102 {
103 std::cout << buffer[i] << " ";
104 if (i == 0)
105 break;
106 }
107 std::cout << "]" << std::endl;
108}
109#endif
110
111#ifdef __GNUG__
112#pragma GCC diagnostic push
113#pragma GCC diagnostic ignored "-Wconversion"
114#endif
115
116
117#define BM_CSA256(h, l, a, b, c) \
118{ \
119 __m256i u = _mm256_xor_si256(a, b); \
120 h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
121 l = _mm256_xor_si256(u, c); \
122}
123
124#define BM_AVX2_BIT_COUNT(ret, v) \
125{ \
126 __m256i lo = _mm256_and_si256(v, low_mask); \
127 __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
128 __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
129 __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
130 ret = _mm256_sad_epu8(cnt1, cnt2); \
131}
132
133#define BM_AVX2_DECL_LOOKUP1 \
134 __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
135 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
136#define BM_AVX2_DECL_LOOKUP2 \
137__m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
138 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
139
140#define BM_AVX2_POPCNT_PROLOG \
141 BM_AVX2_DECL_LOOKUP1 \
142 BM_AVX2_DECL_LOOKUP2 \
143 __m256i low_mask = _mm256_set1_epi8(0x0f); \
144 __m256i bc;
145
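// Illustrative sketch, not part of bmavx2.h: BM_CSA256 is a bit-sliced
// carry-save (full) adder. For every bit position, h receives the carry
// (majority of a, b, c) and l receives the sum (a ^ b ^ c). A scalar 64-bit
// equivalent (the name csa64_sketch is hypothetical):
inline void csa64_sketch(bm::id64_t& h, bm::id64_t& l,
                         bm::id64_t a, bm::id64_t b, bm::id64_t c)
{
    bm::id64_t u = a ^ b;
    h = (a & b) | (u & c);   // carry bits
    l = u ^ c;               // sum bits
}
// BM_AVX2_BIT_COUNT relies on the two nibble tables above being biased:
// lookup1[n] = popcount(n) + 4 and lookup2[n] = 4 - popcount(n), so
// _mm256_sad_epu8(cnt1, cnt2) sums |cnt1 - cnt2| = popcount(lo) + popcount(hi)
// over each 8-byte lane, yielding four 64-bit partial counts per register.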
146/*!
147 @brief AVX2 Harley-Seal popcount
148 The algorithm is based on the paper "Faster Population Counts
149 using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
150 Wojciech Mula (23 Nov 2016).
151 @see https://arxiv.org/abs/1611.07612
152
153 @ingroup AVX2
154*/
155inline
156bm::id_t avx2_bit_count(const __m256i* BMRESTRICT block,
157 const __m256i* BMRESTRICT block_end)
158{
159 __m256i cnt = _mm256_setzero_si256();
160 __m256i ones = _mm256_setzero_si256();
161 __m256i twos = _mm256_setzero_si256();
162 __m256i fours = _mm256_setzero_si256();
163 __m256i eights = _mm256_setzero_si256();
164 __m256i sixteens = _mm256_setzero_si256();
165 __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
166 __m256i b, c;
167
168 BM_AVX2_POPCNT_PROLOG
169 bm::id64_t* cnt64;
170
171 do
172 {
173 b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
174 BM_CSA256(twosA, ones, ones, b, c);
175
176 b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
177 BM_CSA256(twosB, ones, ones, b, c);
178 BM_CSA256(foursA, twos, twos, twosA, twosB);
179
180 b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
181 BM_CSA256(twosA, ones, ones, b, c);
182
183 b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
184 BM_CSA256(twosB, ones, ones, b, c);
185 BM_CSA256(foursB, twos, twos, twosA, twosB);
186 BM_CSA256(eightsA, fours, fours, foursA, foursB);
187
188 b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
189 BM_CSA256(twosA, ones, ones, b, c);
190
191 b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
192 BM_CSA256(twosB, ones, ones, b, c);
193 BM_CSA256(foursA, twos, twos, twosA, twosB);
194
195 b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
196 BM_CSA256(twosA, ones, ones, b, c);
197
198 b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
199 BM_CSA256(twosB, ones, ones, b, c);
200 BM_CSA256(foursB, twos, twos, twosA, twosB);
201 BM_CSA256(eightsB, fours, fours, foursA, foursB);
202 BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
203
204 BM_AVX2_BIT_COUNT(bc, sixteens);
205 cnt = _mm256_add_epi64(cnt, bc);
206
207 block += 16;
208 } while (block < block_end);
209
210 cnt = _mm256_slli_epi64(cnt, 4);
211 BM_AVX2_BIT_COUNT(bc, eights)
212 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
213 BM_AVX2_BIT_COUNT(bc, fours);
214 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
215 BM_AVX2_BIT_COUNT(bc, twos);
216 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
217 BM_AVX2_BIT_COUNT(bc, ones);
218 cnt = _mm256_add_epi64(cnt, bc);
219
220 cnt64 = (bm::id64_t*) &cnt;
221
222 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
223}
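// Illustrative usage sketch, not part of bmavx2.h: counting all bits of one
// BitMagic bit-block (bm::set_block_size 32-bit words, 32-byte aligned).
// The function name example_block_popcount is hypothetical.
inline unsigned example_block_popcount(const bm::word_t* BMRESTRICT blk)
{
    const __m256i* w     = (const __m256i*) blk;
    const __m256i* w_end = (const __m256i*) (blk + bm::set_block_size);
    return bm::avx2_bit_count(w, w_end); // Harley-Seal popcount over the block
}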
224
225/*!
226 @brief Calculate population count based on digest
227
228 @return popcnt
229 @ingroup AVX2
230*/
231inline
232bm::id_t avx2_bit_block_count(const bm::word_t* const block,
233 bm::id64_t digest)
234{
235 bm::id_t count = 0;
236 bm::id64_t* cnt64;
237 BM_AVX2_POPCNT_PROLOG
238 __m256i cnt = _mm256_setzero_si256();
239 while (digest)
240 {
241 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
242
243 unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
244 unsigned off = wave * bm::set_block_digest_wave_size;
245
246 const __m256i* BMRESTRICT wave_src = (__m256i*)&block[off];
247
248 __m256i m1A, m1B, m1C, m1D;
249 m1A = _mm256_load_si256(wave_src);
250 m1B = _mm256_load_si256(wave_src+1);
251 if (!_mm256_testz_si256(m1A, m1A))
252 {
253 BM_AVX2_BIT_COUNT(bc, m1A)
254 cnt = _mm256_add_epi64(cnt, bc);
255 }
256 if (!_mm256_testz_si256(m1B, m1B))
257 {
258 BM_AVX2_BIT_COUNT(bc, m1B)
259 cnt = _mm256_add_epi64(cnt, bc);
260 }
261
262 m1C = _mm256_load_si256(wave_src+2);
263 m1D = _mm256_load_si256(wave_src+3);
264 if (!_mm256_testz_si256(m1C, m1C))
265 {
266 BM_AVX2_BIT_COUNT(bc, m1C)
267 cnt = _mm256_add_epi64(cnt, bc);
268 }
269 if (!_mm256_testz_si256(m1D, m1D))
270 {
271 BM_AVX2_BIT_COUNT(bc, m1D)
272 cnt = _mm256_add_epi64(cnt, bc);
273 }
274
275 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
276 } // while
277 cnt64 = (bm::id64_t*)&cnt;
278 count = (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
279 return count;
280
281}
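// Illustrative sketch, not part of bmavx2.h: a digest is a 64-bit summary of a
// bit-block where bit i is set when the i-th wave of
// bm::set_block_digest_wave_size words may contain 1-bits. The wave/offset
// arithmetic used above, in scalar form (names hypothetical):
inline unsigned example_wave_offset(bm::id64_t digest)
{
    bm::id64_t t  = digest & (0ull - digest);          // isolate lowest set bit (blsi)
    unsigned wave = (unsigned) _mm_popcnt_u64(t - 1);  // index of that bit
    return wave * bm::set_block_digest_wave_size;      // first word of that wave
}
// bm::bmi_bslr_u64(digest) then clears the processed bit (digest & (digest-1)).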
282
283
284
285/*!
286 @brief AND bit count for two aligned bit-blocks
287 @ingroup AVX2
288*/
289inline
290bm::id_t avx2_bit_count_and(const __m256i* BMRESTRICT block,
291 const __m256i* BMRESTRICT block_end,
292 const __m256i* BMRESTRICT mask_block)
293{
294 bm::id64_t* cnt64;
295 BM_AVX2_POPCNT_PROLOG
296 __m256i cnt = _mm256_setzero_si256();
297 __m256i ymm0, ymm1;
298
299
300 do
301 {
302 ymm0 = _mm256_load_si256(block);
303 ymm1 = _mm256_load_si256(mask_block);
304 ymm0 = _mm256_and_si256(ymm0, ymm1);
305 ++block; ++mask_block;
306 BM_AVX2_BIT_COUNT(bc, ymm0)
307 cnt = _mm256_add_epi64(cnt, bc);
308
309 ymm0 = _mm256_load_si256(block);
310 ymm1 = _mm256_load_si256(mask_block);
311 ymm0 = _mm256_and_si256(ymm0, ymm1);
312 ++block; ++mask_block;
313 BM_AVX2_BIT_COUNT(bc, ymm0)
314 cnt = _mm256_add_epi64(cnt, bc);
315
316 ymm0 = _mm256_load_si256(block);
317 ymm1 = _mm256_load_si256(mask_block);
318 ymm0 = _mm256_and_si256(ymm0, ymm1);
319 ++block; ++mask_block;
320 BM_AVX2_BIT_COUNT(bc, ymm0)
321 cnt = _mm256_add_epi64(cnt, bc);
322
323 ymm0 = _mm256_load_si256(block);
324 ymm1 = _mm256_load_si256(mask_block);
325 ymm0 = _mm256_and_si256(ymm0, ymm1);
326 ++block; ++mask_block;
327 BM_AVX2_BIT_COUNT(bc, ymm0)
328 cnt = _mm256_add_epi64(cnt, bc);
329
330 } while (block < block_end);
331
332 cnt64 = (bm::id64_t*)&cnt;
333 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
334}
335
336inline
337bm::id_t avx2_bit_count_or(const __m256i* BMRESTRICT block,
338 const __m256i* BMRESTRICT block_end,
339 const __m256i* BMRESTRICT mask_block)
340{
341 bm::id64_t* cnt64;
342 BM_AVX2_POPCNT_PROLOG
343 __m256i cnt = _mm256_setzero_si256();
344 do
345 {
346 __m256i tmp0 = _mm256_load_si256(block);
347 __m256i tmp1 = _mm256_load_si256(mask_block);
348
349 tmp0 = _mm256_or_si256(tmp0, tmp1);
350
351 BM_AVX2_BIT_COUNT(bc, tmp0)
352 cnt = _mm256_add_epi64(cnt, bc);
353
354 ++block; ++mask_block;
355
356 } while (block < block_end);
357
358 cnt64 = (bm::id64_t*)&cnt;
359 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
360}
361
362
363/*!
364 @brief XOR bit count for two aligned bit-blocks
365 @ingroup AVX2
366*/
367inline
368bm::id_t avx2_bit_count_xor(const __m256i* BMRESTRICT block,
369 const __m256i* BMRESTRICT block_end,
370 const __m256i* BMRESTRICT mask_block)
371{
372 bm::id64_t* cnt64;
373 BM_AVX2_POPCNT_PROLOG
374 __m256i cnt = _mm256_setzero_si256();
375 __m256i mA, mB, mC, mD;
376 do
377 {
378 mA = _mm256_xor_si256(_mm256_load_si256(block+0),
379 _mm256_load_si256(mask_block+0));
380 BM_AVX2_BIT_COUNT(bc, mA)
381 cnt = _mm256_add_epi64(cnt, bc);
382
383 mB = _mm256_xor_si256(_mm256_load_si256(block+1),
384 _mm256_load_si256(mask_block+1));
385 BM_AVX2_BIT_COUNT(bc, mB);
386 cnt = _mm256_add_epi64(cnt, bc);
387
388 mC = _mm256_xor_si256(_mm256_load_si256(block+2),
389 _mm256_load_si256(mask_block+2));
390 BM_AVX2_BIT_COUNT(bc, mC);
391 cnt = _mm256_add_epi64(cnt, bc);
392
393 mD = _mm256_xor_si256(_mm256_load_si256(block+3),
394 _mm256_load_si256(mask_block+3));
395 BM_AVX2_BIT_COUNT(bc, mD);
396 cnt = _mm256_add_epi64(cnt, bc);
397
398 block += 4; mask_block += 4;
399
400 } while (block < block_end);
401
402 cnt64 = (bm::id64_t*)&cnt;
403 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
404}
405
406
407
408/*!
409 @brief AND NOT bit count for two aligned bit-blocks
410 @ingroup AVX2
411*/
412inline
413bm::id_t avx2_bit_count_sub(const __m256i* BMRESTRICT block,
414 const __m256i* BMRESTRICT block_end,
415 const __m256i* BMRESTRICT mask_block)
416{
417 bm::id64_t* cnt64;
418 BM_AVX2_POPCNT_PROLOG
419 __m256i cnt = _mm256_setzero_si256();
420 do
421 {
422 __m256i tmp0 = _mm256_load_si256(block);
423 __m256i tmp1 = _mm256_load_si256(mask_block);
424
425 tmp0 = _mm256_andnot_si256(tmp1, tmp0);
426
427 BM_AVX2_BIT_COUNT(bc, tmp0)
428 cnt = _mm256_add_epi64(cnt, bc);
429
430 ++block; ++mask_block;
431
432 } while (block < block_end);
433
434 cnt64 = (bm::id64_t*)&cnt;
435 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
436}
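// Illustrative note, not part of bmavx2.h: _mm256_andnot_si256(a, b) computes
// (~a) & b, so the SUB count above passes the mask first to get block & ~mask.
// Scalar equivalent for a single word (the name example_sub_word is hypothetical):
inline bm::word_t example_sub_word(bm::word_t blk_w, bm::word_t mask_w)
{
    return blk_w & ~mask_w;  // same per-lane result as _mm256_andnot_si256(mask, block)
}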
437
438
439
440/*!
441 @brief XOR array elements to specified mask
442 *dst = *src ^ mask
443
444 @ingroup AVX2
445*/
446inline
447void avx2_xor_arr_2_mask(__m256i* BMRESTRICT dst,
448 const __m256i* BMRESTRICT src,
449 const __m256i* BMRESTRICT src_end,
450 bm::word_t mask)
451{
452 __m256i yM = _mm256_set1_epi32(int(mask));
453 do
454 {
455 _mm256_store_si256(dst+0, _mm256_xor_si256(_mm256_load_si256(src+0), yM)); // dst[i] = src[i] ^ mask
456 _mm256_store_si256(dst+1, _mm256_xor_si256(_mm256_load_si256(src+1), yM));
457 _mm256_store_si256(dst+2, _mm256_xor_si256(_mm256_load_si256(src+2), yM));
458 _mm256_store_si256(dst+3, _mm256_xor_si256(_mm256_load_si256(src+3), yM));
459
460 dst += 4; src += 4;
461 } while (src < src_end);
462}
463
464
465/*!
466 @brief Inverts array elements and NOT them to specified mask
467 *dst = ~*src & mask
468
469 @ingroup AVX2
470*/
471inline
472void avx2_andnot_arr_2_mask(__m256i* BMRESTRICT dst,
473 const __m256i* BMRESTRICT src,
474 const __m256i* BMRESTRICT src_end,
475 bm::word_t mask)
476{
477 __m256i yM = _mm256_set1_epi32(int(mask));
478 do
479 {
480 _mm256_store_si256(dst+0, _mm256_andnot_si256(_mm256_load_si256(src+0), yM)); // ymm1 = (~ymm1) & ymm2
481 _mm256_store_si256(dst+1, _mm256_andnot_si256(_mm256_load_si256(src+1), yM));
482 _mm256_store_si256(dst+2, _mm256_andnot_si256(_mm256_load_si256(src+2), yM));
483 _mm256_store_si256(dst+3, _mm256_andnot_si256(_mm256_load_si256(src+3), yM));
484
485 dst += 4; src += 4;
486 } while (src < src_end);
487}
488
489/*!
490 @brief AND array elements against another array
491 *dst &= *src
492 @return 0 if destination does not have any bits
493 @ingroup AVX2
494*/
495inline
496unsigned avx2_and_block(__m256i* BMRESTRICT dst,
497 const __m256i* BMRESTRICT src)
498{
499 __m256i m1A, m1B, m1C, m1D;
500 __m256i accA, accB, accC, accD;
501
502 const __m256i* BMRESTRICT src_end =
503 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
504
505 accA = accB = accC = accD = _mm256_setzero_si256();
506
507 do
508 {
509 m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
510 m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
511 m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
512 m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
513
514 _mm256_store_si256(dst+0, m1A);
515 _mm256_store_si256(dst+1, m1B);
516 _mm256_store_si256(dst+2, m1C);
517 _mm256_store_si256(dst+3, m1D);
518
519 accA = _mm256_or_si256(accA, m1A);
520 accB = _mm256_or_si256(accB, m1B);
521 accC = _mm256_or_si256(accC, m1C);
522 accD = _mm256_or_si256(accD, m1D);
523
524 src += 4; dst += 4;
525
526 } while (src < src_end);
527
528 accA = _mm256_or_si256(accA, accB); // A = A | B
529 accC = _mm256_or_si256(accC, accD); // C = C | D
530 accA = _mm256_or_si256(accA, accC); // A = A | C
531
532 return !_mm256_testz_si256(accA, accA);
533}
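// Illustrative usage sketch, not part of bmavx2.h: the non-zero return of
// avx2_and_block() tells the caller whether any 1-bits survived the AND, so an
// emptied destination block can be dropped (names hypothetical):
inline bool example_and_blocks(bm::word_t* BMRESTRICT dst,
                               const bm::word_t* BMRESTRICT src)
{
    unsigned any = bm::avx2_and_block((__m256i*) dst, (const __m256i*) src);
    return any != 0;  // false => dst is now all zero
}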
534
535/*!
536 @brief AND block digest stride
537 *dst &= *src
538
539 @return true if stride is all zero
540 @ingroup AVX2
541*/
542inline
543bool avx2_and_digest(__m256i* BMRESTRICT dst,
544 const __m256i* BMRESTRICT src)
545{
546 __m256i m1A, m1B, m1C, m1D;
547
548 m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
549 m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
550 m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
551 m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
552
553 _mm256_store_si256(dst+0, m1A);
554 _mm256_store_si256(dst+1, m1B);
555 _mm256_store_si256(dst+2, m1C);
556 _mm256_store_si256(dst+3, m1D);
557
558 m1A = _mm256_or_si256(m1A, m1B);
559 m1C = _mm256_or_si256(m1C, m1D);
560 m1A = _mm256_or_si256(m1A, m1C);
561
562 return _mm256_testz_si256(m1A, m1A);
563}
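// Illustrative note, not part of bmavx2.h: one digest stride is exactly the
// four 256-bit load/store pairs above (bm::set_block_digest_wave_size words).
// A caller would typically clear the matching digest bit when this returns
// true (sketch, names hypothetical):
inline bm::id64_t example_clear_digest_bit(bm::id64_t digest, unsigned wave_idx,
                                           bool stride_is_zero)
{
    if (stride_is_zero)
        digest &= ~(1ull << wave_idx);   // wave went empty after the AND
    return digest;
}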
564
565/*!
566 @brief AND block digest stride 2 way
567 *dst = *src1 & *src2
568
569 @return true if stride is all zero
570 @ingroup AVX2
571*/
572inline
573bool avx2_and_digest_2way(__m256i* BMRESTRICT dst,
574 const __m256i* BMRESTRICT src1,
575 const __m256i* BMRESTRICT src2)
576{
577 __m256i m1A, m1B, m1C, m1D;
578
579 m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
580 m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
581 m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
582 m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
583
584 _mm256_store_si256(dst+0, m1A);
585 _mm256_store_si256(dst+1, m1B);
586 _mm256_store_si256(dst+2, m1C);
587 _mm256_store_si256(dst+3, m1D);
588
589 m1A = _mm256_or_si256(m1A, m1B);
590 m1C = _mm256_or_si256(m1C, m1D);
591 m1A = _mm256_or_si256(m1A, m1C);
592
593 return _mm256_testz_si256(m1A, m1A);
594}
595
596/*!
597 @brief AND-OR block digest stride 2 way
598 *dst |= *src1 & *src2
599
600 @return true if stride is all zero
601 @ingroup AVX2
602*/
603inline
604bool avx2_and_or_digest_2way(__m256i* BMRESTRICT dst,
605 const __m256i* BMRESTRICT src1,
606 const __m256i* BMRESTRICT src2)
607{
608 const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
609
610 __m256i m1A, m1B, m1C, m1D;
611 __m256i mACC1;
612 __m256i mSA, mSB, mSC, mSD;
613
614
615 mSA = _mm256_load_si256(dst+0);
616 mSB = _mm256_load_si256(dst+1);
617 mACC1 = _mm256_and_si256(mSA, mSB);
618
619 mSC = _mm256_load_si256(dst+2);
620 mSD = _mm256_load_si256(dst+3);
621
622 mACC1 = _mm256_and_si256(mACC1, _mm256_and_si256(mSC, mSD));
623
624 mACC1 = _mm256_xor_si256(mACC1, maskF);
625 if (_mm256_testz_si256(mACC1, mACC1)) // whole wave is saturated 1111s already
626 return false;
627
628
629 m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
630 m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
631 m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
632 m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
633
634 mACC1 =
635 _mm256_or_si256(_mm256_or_si256(m1A, m1B), _mm256_or_si256(m1C, m1D));
636 bool all_z = _mm256_testz_si256(mACC1, mACC1);
637 if (all_z)
638 return all_z;
639
640 m1A = _mm256_or_si256(mSA, m1A);
641 m1B = _mm256_or_si256(mSB, m1B);
642 m1C = _mm256_or_si256(mSC, m1C);
643 m1D = _mm256_or_si256(mSD, m1D);
644
645 _mm256_store_si256(dst+0, m1A);
646 _mm256_store_si256(dst+1, m1B);
647 _mm256_store_si256(dst+2, m1C);
648 _mm256_store_si256(dst+3, m1D);
649
650 return all_z;
651}
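// Illustrative sketch, not part of bmavx2.h: scalar outline of the two early
// exits above. If the destination stride is already all 1s, OR-ing anything in
// cannot change it (return false); if (src1 & src2) is all zero across the
// stride there is nothing to OR in (return true). Per-word form (hypothetical name):
inline bool example_and_or_word(bm::word_t& dst_w, bm::word_t s1, bm::word_t s2)
{
    bm::word_t a = s1 & s2;
    dst_w |= a;
    return a == 0;   // true when this word contributes nothing
}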
652
653
654/*!
655 @brief AND block digest stride, 5-way: *dst &= *src1 & *src2 & *src3 & *src4
656 @ingroup AVX2
657*/
658inline
659bool avx2_and_digest_5way(__m256i* BMRESTRICT dst,
660 const __m256i* BMRESTRICT src1,
661 const __m256i* BMRESTRICT src2,
662 const __m256i* BMRESTRICT src3,
663 const __m256i* BMRESTRICT src4)
664{
665 __m256i m1A, m1B, m1C, m1D;
666 __m256i m1E, m1F, m1G, m1H;
667
668 {
669 __m256i s1_0, s2_0, s1_1, s2_1;
670
671 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
672 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
673 m1A = _mm256_and_si256(s1_0, s2_0);
674 m1B = _mm256_and_si256(s1_1, s2_1);
675 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
676 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
677 m1C = _mm256_and_si256(s1_0, s2_0);
678 m1D = _mm256_and_si256(s1_1, s2_1);
679 }
680 {
681 __m256i s3_0, s4_0, s3_1, s4_1;
682
683 s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
684 s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
685 m1E = _mm256_and_si256(s3_0, s4_0);
686 m1F = _mm256_and_si256(s3_1, s4_1);
687
688 m1A = _mm256_and_si256(m1A, m1E);
689 m1B = _mm256_and_si256(m1B, m1F);
690
691 s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
692 s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
693 m1G = _mm256_and_si256(s3_0, s4_0);
694 m1H = _mm256_and_si256(s3_1, s4_1);
695 }
696 {
697 __m256i dst0, dst1;
698 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
699
700 m1C = _mm256_and_si256(m1C, m1G);
701 m1D = _mm256_and_si256(m1D, m1H);
702 m1A = _mm256_and_si256(m1A, dst0);
703 m1B = _mm256_and_si256(m1B, dst1);
704
705 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
706
707 m1C = _mm256_and_si256(m1C, dst0);
708 m1D = _mm256_and_si256(m1D, dst1);
709 }
710 _mm256_store_si256(dst + 0, m1A);
711 _mm256_store_si256(dst + 1, m1B);
712 _mm256_store_si256(dst + 2, m1C);
713 _mm256_store_si256(dst + 3, m1D);
714
715 m1A = _mm256_or_si256(m1A, m1B);
716 m1C = _mm256_or_si256(m1C, m1D);
717 m1A = _mm256_or_si256(m1A, m1C);
718
719 return _mm256_testz_si256(m1A, m1A);
720}
721
722/*!
723 @brief AND block digest stride, 3-way: *dst &= *src1 & *src2
724 @ingroup AVX2
725*/
726inline
727bool avx2_and_digest_3way(__m256i* BMRESTRICT dst,
728 const __m256i* BMRESTRICT src1,
729 const __m256i* BMRESTRICT src2)
730{
731 __m256i m1A, m1B, m1C, m1D;
732
733 {
734 __m256i s1_0, s2_0, s1_1, s2_1;
735
736 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
737 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
738 m1A = _mm256_and_si256(s1_0, s2_0);
739 m1B = _mm256_and_si256(s1_1, s2_1);
740 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
741 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
742 m1C = _mm256_and_si256(s1_0, s2_0);
743 m1D = _mm256_and_si256(s1_1, s2_1);
744 }
745 {
746 __m256i dst0, dst1;
747 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
748
749 m1A = _mm256_and_si256(m1A, dst0);
750 m1B = _mm256_and_si256(m1B, dst1);
751
752 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
753
754 m1C = _mm256_and_si256(m1C, dst0);
755 m1D = _mm256_and_si256(m1D, dst1);
756 }
757 _mm256_store_si256(dst + 0, m1A);
758 _mm256_store_si256(dst + 1, m1B);
759 _mm256_store_si256(dst + 2, m1C);
760 _mm256_store_si256(dst + 3, m1D);
761
762 m1A = _mm256_or_si256(m1A, m1B);
763 m1C = _mm256_or_si256(m1C, m1D);
764 m1A = _mm256_or_si256(m1A, m1C);
765
766 return _mm256_testz_si256(m1A, m1A);
767}
768
769
770/*!
771 @brief AND array elements against another array (unaligned)
772 *dst &= *src
773 @return 0 if destination does not have any bits
774 @ingroup AVX2
775*/
776inline
777unsigned avx2_and_arr_unal(__m256i* BMRESTRICT dst,
778 const __m256i* BMRESTRICT src,
779 const __m256i* BMRESTRICT src_end)
780{
781 __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
782 __m256i accA, accB, accC, accD;
783
784 accA = _mm256_setzero_si256();
785 accB = _mm256_setzero_si256();
786 accC = _mm256_setzero_si256();
787 accD = _mm256_setzero_si256();
788
789 do
790 {
791 m1A = _mm256_loadu_si256(src+0);
792 m2A = _mm256_load_si256(dst+0);
793 m1A = _mm256_and_si256(m1A, m2A);
794 _mm256_store_si256(dst+0, m1A);
795 accA = _mm256_or_si256(accA, m1A);
796
797 m1B = _mm256_loadu_si256(src+1);
798 m2B = _mm256_load_si256(dst+1);
799 m1B = _mm256_and_si256(m1B, m2B);
800 _mm256_store_si256(dst+1, m1B);
801 accB = _mm256_or_si256(accB, m1B);
802
803 m1C = _mm256_loadu_si256(src+2);
804 m2C = _mm256_load_si256(dst+2);
805 m1C = _mm256_and_si256(m1C, m2C);
806 _mm256_store_si256(dst+2, m1C);
807 accC = _mm256_or_si256(accC, m1C);
808
809 m1D = _mm256_loadu_si256(src+3);
810 m2D = _mm256_load_si256(dst+3);
811 m1D = _mm256_and_si256(m1D, m2D);
812 _mm256_store_si256(dst+3, m1D);
813 accD = _mm256_or_si256(accD, m1D);
814
815 src += 4; dst += 4;
816
817 } while (src < src_end);
818
819 accA = _mm256_or_si256(accA, accB); // A = A | B
820 accC = _mm256_or_si256(accC, accD); // C = C | D
821 accA = _mm256_or_si256(accA, accC); // A = A | C
822
823 return !_mm256_testz_si256(accA, accA);
824}
825
826
827/*!
828 @brief OR array elements against another array
829 *dst |= *src
830 @return true if all bits are 1
831
832 @ingroup AVX2
833*/
834inline
835bool avx2_or_block(__m256i* BMRESTRICT dst,
836 const __m256i* BMRESTRICT src)
837{
838 __m256i m1A, m1B, m1C, m1D;
839
840 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
841 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
842
843 __m256i* BMRESTRICT dst2 =
844 (__m256i*)((bm::word_t*)(dst) + bm::set_block_size/2);
845 const __m256i* BMRESTRICT src2 =
846 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size/2);
847 const __m256i* BMRESTRICT src_end =
848 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
849 do
850 {
851 m1A = _mm256_or_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
852 m1B = _mm256_or_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
853 mAccF0 = _mm256_and_si256(mAccF0, m1A);
854 mAccF0 = _mm256_and_si256(mAccF0, m1B);
855
856 _mm256_stream_si256(dst, m1A);
857 _mm256_stream_si256(dst+1, m1B);
858
859 src += 2; dst += 2;
860
861 m1C = _mm256_or_si256(_mm256_load_si256(src2), _mm256_load_si256(dst2));
862 m1D = _mm256_or_si256(_mm256_load_si256(src2+1), _mm256_load_si256(dst2+1));
863 mAccF1 = _mm256_and_si256(mAccF1, m1C);
864 mAccF1 = _mm256_and_si256(mAccF1, m1D);
865
866 _mm256_stream_si256(dst2, m1C);
867 _mm256_stream_si256(dst2+1, m1D);
868
869 src2 += 2; dst2 += 2;
870 } while (src2 < src_end);
871
872 __m256i maskF = _mm256_set1_epi32(~0u);
873 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
874 __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
875 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
876 return (maskA == ~0u);
877}
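// Illustrative usage sketch, not part of bmavx2.h: avx2_or_block() walks the
// two halves of the block in parallel with streaming stores and returns true
// only when every resulting word is all 1s, so the caller can switch to a
// "full block" representation (names hypothetical):
inline bool example_or_blocks(bm::word_t* BMRESTRICT dst,
                              const bm::word_t* BMRESTRICT src)
{
    bool all_one = bm::avx2_or_block((__m256i*) dst, (const __m256i*) src);
    return all_one; // true => every word of dst is now ~0u
}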
878
879
880/*!
881 @brief OR array elements against another unaligned array
882 *dst |= *src
883 @return true if all bits are 1
884
885 @ingroup AVX2
886*/
887inline
888bool avx2_or_arr_unal(__m256i* BMRESTRICT dst,
889 const __m256i* BMRESTRICT src,
890 const __m256i* BMRESTRICT src_end)
891{
892 __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
893 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
894 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
895 do
896 {
897 m1A = _mm256_loadu_si256(src+0);
898 m2A = _mm256_load_si256(dst+0);
899 m1A = _mm256_or_si256(m1A, m2A);
900 _mm256_store_si256(dst+0, m1A);
901
902 m1B = _mm256_loadu_si256(src+1);
903 m2B = _mm256_load_si256(dst+1);
904 m1B = _mm256_or_si256(m1B, m2B);
905 _mm256_store_si256(dst+1, m1B);
906
907 m1C = _mm256_loadu_si256(src+2);
908 m2C = _mm256_load_si256(dst+2);
909 m1C = _mm256_or_si256(m1C, m2C);
910 _mm256_store_si256(dst+2, m1C);
911
912 m1D = _mm256_loadu_si256(src+3);
913 m2D = _mm256_load_si256(dst+3);
914 m1D = _mm256_or_si256(m1D, m2D);
915 _mm256_store_si256(dst+3, m1D);
916
917 mAccF1 = _mm256_and_si256(mAccF1, m1C);
918 mAccF1 = _mm256_and_si256(mAccF1, m1D);
919 mAccF0 = _mm256_and_si256(mAccF0, m1A);
920 mAccF0 = _mm256_and_si256(mAccF0, m1B);
921
922 src += 4; dst += 4;
923
924 } while (src < src_end);
925
926 __m256i maskF = _mm256_set1_epi32(~0u);
927 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
928 __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
929 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
930 return (maskA == ~0u);
931}
932
933/*!
934 @brief OR 2 arrays and copy to the destination
935 *dst = *src1 | *src2
936 @return true if all bits are 1
937
938 @ingroup AVX2
939*/
940inline
941bool avx2_or_block_2way(__m256i* BMRESTRICT dst,
942 const __m256i* BMRESTRICT src1,
943 const __m256i* BMRESTRICT src2)
944{
945 __m256i m1A, m1B, m1C, m1D;
946 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
947 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
948 const __m256i* BMRESTRICT src_end1 =
949 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
950
951 do
952 {
953 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
954 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
955 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
956 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
957
958 _mm256_store_si256(dst+0, m1A);
959 _mm256_store_si256(dst+1, m1B);
960 _mm256_store_si256(dst+2, m1C);
961 _mm256_store_si256(dst+3, m1D);
962
963 mAccF1 = _mm256_and_si256(mAccF1, m1C);
964 mAccF1 = _mm256_and_si256(mAccF1, m1D);
965 mAccF0 = _mm256_and_si256(mAccF0, m1A);
966 mAccF0 = _mm256_and_si256(mAccF0, m1B);
967
968 src1 += 4; src2 += 4; dst += 4;
969
970 } while (src1 < src_end1);
971
972 __m256i maskF = _mm256_set1_epi32(~0u);
973 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
974 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
975 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
976 return (maskA == ~0u);
977}
978
979/*!
980 @brief OR array elements against another 2 arrays
981 *dst |= *src1 | *src2
982 @return true if all bits are 1
983
984 @ingroup AVX2
985*/
986inline
987bool avx2_or_block_3way(__m256i* BMRESTRICT dst,
988 const __m256i* BMRESTRICT src1,
989 const __m256i* BMRESTRICT src2)
990{
991 __m256i m1A, m1B, m1C, m1D;
992 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
993 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
994 const __m256i* BMRESTRICT src_end1 =
995 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
996
997 do
998 {
999 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1000 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1001 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1002 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1003
1004 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1005 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1006 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1007 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1008
1009 _mm256_store_si256(dst+0, m1A);
1010 _mm256_store_si256(dst+1, m1B);
1011 _mm256_store_si256(dst+2, m1C);
1012 _mm256_store_si256(dst+3, m1D);
1013
1014 mAccF1 = _mm256_and_si256(mAccF1, m1C);
1015 mAccF1 = _mm256_and_si256(mAccF1, m1D);
1016 mAccF0 = _mm256_and_si256(mAccF0, m1A);
1017 mAccF0 = _mm256_and_si256(mAccF0, m1B);
1018
1019 src1 += 4; src2 += 4; dst += 4;
1020
1021 } while (src1 < src_end1);
1022
1023 __m256i maskF = _mm256_set1_epi32(~0u);
1024 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
1025 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
1026 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1027 return (maskA == ~0u);
1028}
1029
1030
1031/*!
1032 @brief OR array elements against another 4 arrays
1033 *dst |= *src1 | *src2 | *src3 | *src4
1034 @return true if all bits are 1
1035
1036 @ingroup AVX2
1037*/
1038inline
1039bool avx2_or_block_5way(__m256i* BMRESTRICT dst,
1040 const __m256i* BMRESTRICT src1,
1041 const __m256i* BMRESTRICT src2,
1042 const __m256i* BMRESTRICT src3,
1043 const __m256i* BMRESTRICT src4)
1044{
1045 __m256i m1A, m1B, m1C, m1D;
1046 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
1047 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
1048
1049 const __m256i* BMRESTRICT src_end1 =
1050 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
1051
1052 do
1053 {
1054 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1055 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1056 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1057 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1058
1059 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1060 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1061 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1062 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1063
1064 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src3+0));
1065 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src3+1));
1066 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src3+2));
1067 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src3+3));
1068
1069 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src4+0));
1070 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src4+1));
1071 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src4+2));
1072 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src4+3));
1073
1074 _mm256_stream_si256(dst+0, m1A);
1075 _mm256_stream_si256(dst+1, m1B);
1076 _mm256_stream_si256(dst+2, m1C);
1077 _mm256_stream_si256(dst+3, m1D);
1078
1079 mAccF1 = _mm256_and_si256(mAccF1, m1C);
1080 mAccF1 = _mm256_and_si256(mAccF1, m1D);
1081 mAccF0 = _mm256_and_si256(mAccF0, m1A);
1082 mAccF0 = _mm256_and_si256(mAccF0, m1B);
1083
1084 src1 += 4; src2 += 4;
1085 src3 += 4; src4 += 4;
1086 _mm_prefetch ((const char*)src3, _MM_HINT_T0);
1087 _mm_prefetch ((const char*)src4, _MM_HINT_T0);
1088
1089 dst += 4;
1090
1091 } while (src1 < src_end1);
1092
1093 __m256i maskF = _mm256_set1_epi32(~0u);
1094 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
1095 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
1096 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1097 return (maskA == ~0u);
1098}
1099
1100
1101/*!
1102 @brief XOR block against another
1103 *dst ^= *src
1104 @return 0 if destination does not have any bits
1105 @ingroup AVX2
1106*/
1107inline
1108unsigned avx2_xor_block(__m256i* BMRESTRICT dst,
1109 const __m256i* BMRESTRICT src)
1110{
1111 __m256i m1A, m1B, m1C, m1D;
1112 __m256i accA, accB, accC, accD;
1113
1114 const __m256i* BMRESTRICT src_end =
1115 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1116
1117 accA = accB = accC = accD = _mm256_setzero_si256();
1118
1119 do
1120 {
1121 m1A = _mm256_xor_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1122 m1B = _mm256_xor_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1123 m1C = _mm256_xor_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1124 m1D = _mm256_xor_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1125
1126 _mm256_store_si256(dst+0, m1A);
1127 _mm256_store_si256(dst+1, m1B);
1128 _mm256_store_si256(dst+2, m1C);
1129 _mm256_store_si256(dst+3, m1D);
1130
1131 accA = _mm256_or_si256(accA, m1A);
1132 accB = _mm256_or_si256(accB, m1B);
1133 accC = _mm256_or_si256(accC, m1C);
1134 accD = _mm256_or_si256(accD, m1D);
1135
1136 src += 4; dst += 4;
1137
1138 } while (src < src_end);
1139
1140 accA = _mm256_or_si256(accA, accB); // A = A | B
1141 accC = _mm256_or_si256(accC, accD); // C = C | D
1142 accA = _mm256_or_si256(accA, accC); // A = A | C
1143
1144 return !_mm256_testz_si256(accA, accA);
1145}
1146
1147/*!
1148 @brief 3 operand XOR
1149 *dst = *src1 ^ *src2
1150 @return 0 if destination does not have any bits
1151 @ingroup AVX2
1152*/
1153inline
1154unsigned avx2_xor_block_2way(__m256i* BMRESTRICT dst,
1155 const __m256i* BMRESTRICT src1,
1156 const __m256i* BMRESTRICT src2)
1157{
1158 __m256i m1A, m1B, m1C, m1D;
1159 __m256i accA, accB, accC, accD;
1160
1161 const __m256i* BMRESTRICT src1_end =
1162 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
1163
1164 accA = accB = accC = accD = _mm256_setzero_si256();
1165
1166 do
1167 {
1168 m1A = _mm256_xor_si256(_mm256_load_si256(src1 + 0), _mm256_load_si256(src2 + 0));
1169 m1B = _mm256_xor_si256(_mm256_load_si256(src1 + 1), _mm256_load_si256(src2 + 1));
1170 m1C = _mm256_xor_si256(_mm256_load_si256(src1 + 2), _mm256_load_si256(src2 + 2));
1171 m1D = _mm256_xor_si256(_mm256_load_si256(src1 + 3), _mm256_load_si256(src2 + 3));
1172
1173 _mm256_store_si256(dst + 0, m1A);
1174 _mm256_store_si256(dst + 1, m1B);
1175 _mm256_store_si256(dst + 2, m1C);
1176 _mm256_store_si256(dst + 3, m1D);
1177
1178 accA = _mm256_or_si256(accA, m1A);
1179 accB = _mm256_or_si256(accB, m1B);
1180 accC = _mm256_or_si256(accC, m1C);
1181 accD = _mm256_or_si256(accD, m1D);
1182
1183 src1 += 4; src2 += 4; dst += 4;
1184
1185 } while (src1 < src1_end);
1186
1187 accA = _mm256_or_si256(accA, accB); // A = A | B
1188 accC = _mm256_or_si256(accC, accD); // C = C | D
1189 accA = _mm256_or_si256(accA, accC); // A = A | C
1190
1191 return !_mm256_testz_si256(accA, accA);
1192}
1193
1194
1195/*!
1196 @brief AND-NOT (SUB) array elements against another array
1197 *dst &= ~*src
1198
1199 @return 0 if destination does not have any bits
1200
1201 @ingroup AVX2
1202*/
1203inline
1204unsigned avx2_sub_block(__m256i* BMRESTRICT dst,
1205 const __m256i* BMRESTRICT src)
1206{
1207 __m256i m1A, m1B, m1C, m1D;
1208 __m256i accA, accB, accC, accD;
1209
1210 accA = accB = accC = accD = _mm256_setzero_si256();
1211
1212 const __m256i* BMRESTRICT src_end =
1213 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1214
1215 do
1216 {
1217 m1A = _mm256_andnot_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
1218 m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1219 m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1220 m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1221
1222 _mm256_store_si256(dst+2, m1C);
1223 _mm256_store_si256(dst+3, m1D);
1224 _mm256_store_si256(dst+0, m1A);
1225 _mm256_store_si256(dst+1, m1B);
1226
1227 accA = _mm256_or_si256(accA, m1A);
1228 accB = _mm256_or_si256(accB, m1B);
1229 accC = _mm256_or_si256(accC, m1C);
1230 accD = _mm256_or_si256(accD, m1D);
1231
1232 src += 4; dst += 4;
1233 } while (src < src_end);
1234
1235 accA = _mm256_or_si256(accA, accB); // A = A | B
1236 accC = _mm256_or_si256(accC, accD); // C = C | D
1237 accA = _mm256_or_si256(accA, accC); // A = A | C
1238
1239 return !_mm256_testz_si256(accA, accA);
1240}
1241
1242/*!
1243 @brief SUB (AND NOT) block digest stride
1244 *dst &= ~*src
1245
1246 @return true if stride is all zero
1247 @ingroup AVX2
1248*/
1249inline
1250bool avx2_sub_digest(__m256i* BMRESTRICT dst,
1251 const __m256i* BMRESTRICT src)
1252{
1253 __m256i m1A, m1B, m1C, m1D;
1254
1255 m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1256 m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1257 m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1258 m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1259
1260 _mm256_store_si256(dst+0, m1A);
1261 _mm256_store_si256(dst+1, m1B);
1262 _mm256_store_si256(dst+2, m1C);
1263 _mm256_store_si256(dst+3, m1D);
1264
1265 m1A = _mm256_or_si256(m1A, m1B);
1266 m1C = _mm256_or_si256(m1C, m1D);
1267 m1A = _mm256_or_si256(m1A, m1C);
1268
1269 return _mm256_testz_si256(m1A, m1A);
1270}
1271
1272/*!
1273 @brief 2-operand SUB (AND NOT) block digest stride
1274 *dst = *src1 & ~*src2
1275
1276 @return true if stride is all zero
1277 @ingroup AVX2
1278*/
1279inline
1280bool avx2_sub_digest_2way(__m256i* BMRESTRICT dst,
1281 const __m256i* BMRESTRICT src1,
1282 const __m256i* BMRESTRICT src2)
1283{
1284 __m256i m1A, m1B, m1C, m1D;
1285
1286 m1A = _mm256_andnot_si256(_mm256_load_si256(src2+0), _mm256_load_si256(src1+0));
1287 m1B = _mm256_andnot_si256(_mm256_load_si256(src2+1), _mm256_load_si256(src1+1));
1288 m1C = _mm256_andnot_si256(_mm256_load_si256(src2+2), _mm256_load_si256(src1+2));
1289 m1D = _mm256_andnot_si256(_mm256_load_si256(src2+3), _mm256_load_si256(src1+3));
1290
1291 _mm256_store_si256(dst+0, m1A);
1292 _mm256_store_si256(dst+1, m1B);
1293 _mm256_store_si256(dst+2, m1C);
1294 _mm256_store_si256(dst+3, m1D);
1295
1296 m1A = _mm256_or_si256(m1A, m1B);
1297 m1C = _mm256_or_si256(m1C, m1D);
1298 m1A = _mm256_or_si256(m1A, m1C);
1299
1300 return _mm256_testz_si256(m1A, m1A);
1301}
1302
1303
1304
1305/*!
1306 @brief SUB (AND NOT) block digest stride, 5-way: *dst &= ~*src1 & ~*src2 & ~*src3 & ~*src4
1307 @ingroup AVX2
1308*/
1309inline
1310bool avx2_sub_digest_5way(__m256i* BMRESTRICT dst,
1311 const __m256i* BMRESTRICT src1,
1312 const __m256i* BMRESTRICT src2,
1313 const __m256i* BMRESTRICT src3,
1314 const __m256i* BMRESTRICT src4)
1315{
1316 __m256i m1A, m1B, m1C, m1D;
1317 __m256i m1E, m1F, m1G, m1H;
1318 const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1319
1320 {
1321 __m256i s1_0, s2_0, s1_1, s2_1;
1322
1323 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1324 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1325 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1326 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1327
1328 m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1329
1330 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1331 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1332 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1333 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1334
1335 m1C = _mm256_and_si256(s1_0, s2_0);
1336 m1D = _mm256_and_si256(s1_1, s2_1);
1337 }
1338 {
1339 __m256i s3_0, s4_0, s3_1, s4_1;
1340
1341 s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
1342 s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
1343 s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1344 s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1345
1346 m1E = _mm256_and_si256(s3_0, s4_0);
1347 m1F = _mm256_and_si256(s3_1, s4_1);
1348
1349 m1A = _mm256_and_si256(m1A, m1E);
1350 m1B = _mm256_and_si256(m1B, m1F);
1351
1352 s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
1353 s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
1354 s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1355 s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1356
1357 m1G = _mm256_and_si256(s3_0, s4_0);
1358 m1H = _mm256_and_si256(s3_1, s4_1);
1359 }
1360 {
1361 __m256i dst0, dst1;
1362 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1363
1364 m1C = _mm256_and_si256(m1C, m1G);
1365 m1D = _mm256_and_si256(m1D, m1H);
1366 m1A = _mm256_and_si256(m1A, dst0);
1367 m1B = _mm256_and_si256(m1B, dst1);
1368
1369 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1370
1371 m1C = _mm256_and_si256(m1C, dst0);
1372 m1D = _mm256_and_si256(m1D, dst1);
1373 }
1374 _mm256_store_si256(dst + 0, m1A);
1375 _mm256_store_si256(dst + 1, m1B);
1376 _mm256_store_si256(dst + 2, m1C);
1377 _mm256_store_si256(dst + 3, m1D);
1378
1379 m1A = _mm256_or_si256(m1A, m1B);
1380 m1C = _mm256_or_si256(m1C, m1D);
1381 m1A = _mm256_or_si256(m1A, m1C);
1382
1383 return _mm256_testz_si256(m1A, m1A);
1384}
1385
1386
1387/*!
1388 @brief SUB (AND NOT) block digest stride, 3-way: *dst &= ~*src1 & ~*src2
1389 @ingroup AVX2
1390*/
1391inline
1392bool avx2_sub_digest_3way(__m256i* BMRESTRICT dst,
1393 const __m256i* BMRESTRICT src1,
1394 const __m256i* BMRESTRICT src2)
1395{
1396 __m256i m1A, m1B, m1C, m1D;
1397// __m256i m1E, m1F, m1G, m1H;
1398 const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1399
1400 {
1401 __m256i s1_0, s2_0, s1_1, s2_1;
1402
1403 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1404 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1405 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1406 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1407
1408 m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1409
1410 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1411 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1412 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1413 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1414
1415 m1C = _mm256_and_si256(s1_0, s2_0);
1416 m1D = _mm256_and_si256(s1_1, s2_1);
1417 }
1418 /*
1419 {
1420 __m256i s3_0, s4_0, s3_1, s4_1;
1421
1422 s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
1423 s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
1424 s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1425 s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1426
1427 m1E = _mm256_and_si256(s3_0, s4_0);
1428 m1F = _mm256_and_si256(s3_1, s4_1);
1429
1430 m1A = _mm256_and_si256(m1A, m1E);
1431 m1B = _mm256_and_si256(m1B, m1F);
1432
1433 s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
1434 s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
1435 s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1436 s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1437
1438 m1G = _mm256_and_si256(s3_0, s4_0);
1439 m1H = _mm256_and_si256(s3_1, s4_1);
1440 }
1441 */
1442 {
1443 __m256i dst0, dst1;
1444 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1445
1446// m1C = _mm256_and_si256(m1C, m1G);
1447// m1D = _mm256_and_si256(m1D, m1H);
1448 m1A = _mm256_and_si256(m1A, dst0);
1449 m1B = _mm256_and_si256(m1B, dst1);
1450
1451 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1452
1453 m1C = _mm256_and_si256(m1C, dst0);
1454 m1D = _mm256_and_si256(m1D, dst1);
1455 }
1456 _mm256_store_si256(dst + 0, m1A);
1457 _mm256_store_si256(dst + 1, m1B);
1458 _mm256_store_si256(dst + 2, m1C);
1459 _mm256_store_si256(dst + 3, m1D);
1460
1461 m1A = _mm256_or_si256(m1A, m1B);
1462 m1C = _mm256_or_si256(m1C, m1D);
1463 m1A = _mm256_or_si256(m1A, m1C);
1464
1465 return _mm256_testz_si256(m1A, m1A);
1466}
1467
1468
1469
1470/*!
1471 @brief AVX2 block memset
1472 *dst = value
1473
1474 @ingroup AVX2
1475*/
1476BMFORCEINLINE
1477void avx2_set_block(__m256i* BMRESTRICT dst, bm::word_t value)
1478{
1479 __m256i* BMRESTRICT dst_end =
1480 (__m256i*)((bm::word_t*)(dst) + bm::set_block_size);
1481
1482 __m256i ymm0 = _mm256_set1_epi32(int(value));
1483 do
1484 {
1485 _mm256_store_si256(dst, ymm0);
1486 _mm256_store_si256(dst+1, ymm0);
1487 _mm256_store_si256(dst+2, ymm0);
1488 _mm256_store_si256(dst+3, ymm0);
1489
1490 dst += 4;
1491 } while (dst < dst_end);
1492}
1493
1494
1495
1496/*!
1497 @brief AVX2 block copy
1498 *dst = *src
1499
1500 @ingroup AVX2
1501*/
1502inline
1503void avx2_copy_block(__m256i* BMRESTRICT dst,
1504 const __m256i* BMRESTRICT src)
1505{
1506 __m256i ymm0, ymm1, ymm2, ymm3;
1507
1508 const __m256i* BMRESTRICT src_end =
1509 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1510
1511 do
1512 {
1513 ymm0 = _mm256_load_si256(src+0);
1514 ymm1 = _mm256_load_si256(src+1);
1515 ymm2 = _mm256_load_si256(src+2);
1516 ymm3 = _mm256_load_si256(src+3);
1517
1518 _mm256_store_si256(dst+0, ymm0);
1519 _mm256_store_si256(dst+1, ymm1);
1520 _mm256_store_si256(dst+2, ymm2);
1521 _mm256_store_si256(dst+3, ymm3);
1522
1523 ymm0 = _mm256_load_si256(src+4);
1524 ymm1 = _mm256_load_si256(src+5);
1525 ymm2 = _mm256_load_si256(src+6);
1526 ymm3 = _mm256_load_si256(src+7);
1527
1528 _mm256_store_si256(dst+4, ymm0);
1529 _mm256_store_si256(dst+5, ymm1);
1530 _mm256_store_si256(dst+6, ymm2);
1531 _mm256_store_si256(dst+7, ymm3);
1532
1533 src += 8; dst += 8;
1534
1535 } while (src < src_end);
1536}
1537
1538/*!
1539 @brief AVX2 block copy (unaligned SRC)
1540 *dst = *src
1541
1542 @ingroup AVX2
1543*/
1544inline
1545void avx2_copy_block_unalign(__m256i* BMRESTRICT dst,
1546 const __m256i* BMRESTRICT src)
1547{
1548 __m256i ymm0, ymm1, ymm2, ymm3;
1549
1550 const __m256i* BMRESTRICT src_end =
1551 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1552
1553 do
1554 {
1555 ymm0 = _mm256_loadu_si256(src+0);
1556 ymm1 = _mm256_loadu_si256(src+1);
1557 ymm2 = _mm256_loadu_si256(src+2);
1558 ymm3 = _mm256_loadu_si256(src+3);
1559
1560 _mm256_store_si256(dst+0, ymm0);
1561 _mm256_store_si256(dst+1, ymm1);
1562 _mm256_store_si256(dst+2, ymm2);
1563 _mm256_store_si256(dst+3, ymm3);
1564
1565 ymm0 = _mm256_loadu_si256(src+4);
1566 ymm1 = _mm256_loadu_si256(src+5);
1567 ymm2 = _mm256_loadu_si256(src+6);
1568 ymm3 = _mm256_loadu_si256(src+7);
1569
1570 _mm256_store_si256(dst+4, ymm0);
1571 _mm256_store_si256(dst+5, ymm1);
1572 _mm256_store_si256(dst+6, ymm2);
1573 _mm256_store_si256(dst+7, ymm3);
1574
1575 src += 8; dst += 8;
1576
1577 } while (src < src_end);
1578}
1579
1580
1581
1582/*!
1583 @brief AVX2 block copy (non-temporal/streaming store)
1584 *dst = *src
1585
1586 @ingroup AVX2
1587*/
1588inline
1589void avx2_stream_block(__m256i* BMRESTRICT dst,
1590 const __m256i* BMRESTRICT src)
1591{
1592 __m256i ymm0, ymm1, ymm2, ymm3;
1593
1594 const __m256i* BMRESTRICT src_end =
1595 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1596
1597 do
1598 {
1599 ymm0 = _mm256_load_si256(src+0);
1600 ymm1 = _mm256_load_si256(src+1);
1601 ymm2 = _mm256_load_si256(src+2);
1602 ymm3 = _mm256_load_si256(src+3);
1603
1604 _mm256_stream_si256(dst+0, ymm0);
1605 _mm256_stream_si256(dst+1, ymm1);
1606 _mm256_stream_si256(dst+2, ymm2);
1607 _mm256_stream_si256(dst+3, ymm3);
1608
1609 ymm0 = _mm256_load_si256(src+4);
1610 ymm1 = _mm256_load_si256(src+5);
1611 ymm2 = _mm256_load_si256(src+6);
1612 ymm3 = _mm256_load_si256(src+7);
1613
1614 _mm256_stream_si256(dst+4, ymm0);
1615 _mm256_stream_si256(dst+5, ymm1);
1616 _mm256_stream_si256(dst+6, ymm2);
1617 _mm256_stream_si256(dst+7, ymm3);
1618
1619 src += 8; dst += 8;
1620
1621 } while (src < src_end);
1622}
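// Illustrative note, not part of bmavx2.h: _mm256_stream_si256 issues
// non-temporal stores that bypass the cache, which helps when the destination
// will not be re-read soon (bulk copies of many blocks). If the destination is
// reused immediately, the cached avx2_copy_block() above is usually the better
// choice. Selection sketch (function names as reconstructed above):
inline void example_copy(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src,
                         bool dst_reused_soon)
{
    if (dst_reused_soon)
        bm::avx2_copy_block(dst, src);    // regular cached stores
    else
        bm::avx2_stream_block(dst, src);  // non-temporal (streaming) stores
}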
1623
1624/*!
1625 @brief AVX2 block copy (unaligned SRC, non-temporal store)
1626 *dst = *src
1627
1628 @ingroup AVX2
1629*/
1630inline
1631void avx2_stream_block_unalign(__m256i* BMRESTRICT dst,
1632 const __m256i* BMRESTRICT src)
1633{
1634 __m256i ymm0, ymm1, ymm2, ymm3;
1635
1636 const __m256i* BMRESTRICT src_end =
1637 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1638
1639 do
1640 {
1641 ymm0 = _mm256_loadu_si256(src+0);
1642 ymm1 = _mm256_loadu_si256(src+1);
1643 ymm2 = _mm256_loadu_si256(src+2);
1644 ymm3 = _mm256_loadu_si256(src+3);
1645
1646 _mm256_stream_si256(dst+0, ymm0);
1647 _mm256_stream_si256(dst+1, ymm1);
1648 _mm256_stream_si256(dst+2, ymm2);
1649 _mm256_stream_si256(dst+3, ymm3);
1650
1651 ymm0 = _mm256_loadu_si256(src+4);
1652 ymm1 = _mm256_loadu_si256(src+5);
1653 ymm2 = _mm256_loadu_si256(src+6);
1654 ymm3 = _mm256_loadu_si256(src+7);
1655
1656 _mm256_stream_si256(dst+4, ymm0);
1657 _mm256_stream_si256(dst+5, ymm1);
1658 _mm256_stream_si256(dst+6, ymm2);
1659 _mm256_stream_si256(dst+7, ymm3);
1660
1661 src += 8; dst += 8;
1662
1663 } while (src < src_end);
1664}
1665
1666
1667
1668/*!
1669 @brief Invert bit-block
1670 *dst = ~*dst
1671 or
1672 *dst ^= ~0u
1673
1674 @ingroup AVX2
1675*/
1676inline
1677void avx2_invert_block(__m256i* BMRESTRICT dst)
1678{
1679 __m256i maskFF = _mm256_set1_epi32(-1); // broadcast 0xFF
1680 const __m256i* BMRESTRICT dst_end =
1681 (const __m256i*)((bm::word_t*)(dst) + bm::set_block_size);
1682
1683 __m256i ymm0, ymm1;
1684 do
1685 {
1686 ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+0), maskFF);
1687 ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+1), maskFF);
1688
1689 _mm256_store_si256(dst+0, ymm0);
1690 _mm256_store_si256(dst+1, ymm1);
1691
1692 ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+2), maskFF);
1693 ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+3), maskFF);
1694
1695 _mm256_store_si256(dst+2, ymm0);
1696 _mm256_store_si256(dst+3, ymm1);
1697
1698 dst += 4;
1699
1700 } while (dst < dst_end);
1701}
1702
1703/*!
1704 @brief check if block is all zero bits
1705 @ingroup AVX2
1706*/
1707inline
1708bool avx2_is_all_zero(const __m256i* BMRESTRICT block)
1709{
1710 const __m256i* BMRESTRICT block_end =
1711 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1712
1713 do
1714 {
1715 __m256i w0 = _mm256_load_si256(block+0);
1716 __m256i w1 = _mm256_load_si256(block+1);
1717
1718 __m256i wA = _mm256_or_si256(w0, w1);
1719
1720 __m256i w2 = _mm256_load_si256(block+2);
1721 __m256i w3 = _mm256_load_si256(block+3);
1722
1723 __m256i wB = _mm256_or_si256(w2, w3);
1724 wA = _mm256_or_si256(wA, wB);
1725
1726 if (!_mm256_testz_si256(wA, wA))
1727 return false;
1728 block += 4;
1729 } while (block < block_end);
1730 return true;
1731}
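// Illustrative sketch, not part of bmavx2.h: scalar equivalent of the test
// above; OR-accumulate the words and test the accumulator (name hypothetical):
inline bool example_is_all_zero(const bm::word_t* w, unsigned len)
{
    bm::word_t acc = 0;
    for (unsigned i = 0; i < len; ++i)
        acc |= w[i];
    return acc == 0;
}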
1732
1733/*!
1734 @brief check if digest stride is all zero bits
1735 @ingroup AVX2
1736*/
1737inline
1738bool avx2_is_digest_zero(const __m256i* BMRESTRICT block)
1739{
1740 __m256i wA = _mm256_or_si256(_mm256_load_si256(block+0), _mm256_load_si256(block+1));
1741 __m256i wB = _mm256_or_si256(_mm256_load_si256(block+2), _mm256_load_si256(block+3));
1742 wA = _mm256_or_si256(wA, wB);
1743
1744 return _mm256_testz_si256(wA, wA);
1745}
1746
1747/*!
1748 @brief set digest stride to 0xFF.. or 0x0 value
1749 @ingroup AVX2
1750*/
1751inline
1752void avx2_block_set_digest(__m256i* dst, unsigned value)
1753{
1754 __m256i mV = _mm256_set1_epi32(int(value));
1755 _mm256_store_si256(dst, mV);
1756 _mm256_store_si256(dst + 1, mV);
1757 _mm256_store_si256(dst + 2, mV);
1758 _mm256_store_si256(dst + 3, mV);
1759}
1760
1761/*!
1762 @brief check if block is all one bits
1763 @return true if all bits are 1
1764 @ingroup AVX2
1765*/
1766inline
1767bool avx2_is_all_one(const __m256i* BMRESTRICT block)
1768{
1769 const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1770 const __m256i* BMRESTRICT block_end =
1771 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1772 do
1773 {
1774 __m256i m1A = _mm256_load_si256(block+0);
1775 __m256i m1B = _mm256_load_si256(block+1);
1776 m1A = _mm256_xor_si256(m1A, maskF);
1777 m1B = _mm256_xor_si256(m1B, maskF);
1778 m1A = _mm256_or_si256(m1A, m1B);
1779 if (!_mm256_testz_si256(m1A, m1A))
1780 return false;
1781 block += 2;
1782 } while (block < block_end);
1783 return true;
1784}
1785
1786/*!
1787 @brief check if wave of pointers is all 0xFF..
1788 @ingroup AVX2
1789*/
1790BMFORCEINLINE
1791bool avx2_test_all_one_wave(const void* ptr)
1792{
1793 __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1794 __m256i wcmpA = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)ptr), maskF); // (w0 == maskF)
1795 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1796 return (maskA == ~0u);
1797}
1798
1799
1800/*!
1801 @brief check if wave of pointers is all NULL
1802 @ingroup AVX2
1803*/
1804BMFORCEINLINE
1805bool avx2_test_all_zero_wave(const void* ptr)
1806{
1807 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
1808 return _mm256_testz_si256(w0, w0);
1809}
1810
1811/*!
1812 @brief check if 2 wave of pointers are all NULL
1813 @ingroup AVX2
1814*/
1815BMFORCEINLINE
1816bool avx2_test_all_zero_wave2(const void* ptr0, const void* ptr1)
1817{
1818 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1819 __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1820 w0 = _mm256_or_si256(w0, w1);
1821 return _mm256_testz_si256(w0, w0);
1822}
1823
1824/*!
1825 @brief check if 2 wave of pointers are all the same (NULL or FULL)
1826 @ingroup AVX2
1827*/
1828BMFORCEINLINE
1829bool avx2_test_all_eq_wave2(const void* ptr0, const void* ptr1)
1830{
1831 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1832 __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1833 w0 = _mm256_xor_si256(w0, w1);
1834 return _mm256_testz_si256(w0, w0);
1835}
1836
1837/*!
1838 @brief block shift left by 1
1839 @ingroup AVX2
1840*/
1841inline
1842bool avx2_shift_l1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
1843{
1844 __m256i* block_end =
1845 (__m256i*)((bm::word_t*)(block) + bm::set_block_size);
1846
1847 __m256i m1COshft, m2COshft;
1848 __m256i mAcc = _mm256_set1_epi32(0);
1849 __m256i mMask1 = _mm256_set1_epi32(1);
1850 __m256i mCOidx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
1851 unsigned co2;
1852
1853 for (--block_end; block_end >= block; block_end -= 2)
1854 {
1855 __m256i m1A = _mm256_load_si256(block_end);
1856 __m256i m2A = _mm256_load_si256(block_end-1);
1857
1858 __m256i m1CO = _mm256_and_si256(m1A, mMask1);
1859 __m256i m2CO = _mm256_and_si256(m2A, mMask1);
1860
1861 co2 = _mm256_extract_epi32(m1CO, 0);
1862
1863 m1A = _mm256_srli_epi32(m1A, 1); // (block[i] >> 1u)
1864 m2A = _mm256_srli_epi32(m2A, 1);
1865
1866 // shift CO flags using -1 permute indexes, add CO to v[0]
1867 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1868 m1COshft = _mm256_insert_epi32(m1COshft, co1, 7); // v[7] = co_flag
1869
1870 co1 = co2;
1871
1872 co2 = _mm256_extract_epi32(m2CO, 0);
1873
1874 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1875 m2COshft = _mm256_insert_epi32(m2COshft, co1, 7);
1876
1877 m1COshft = _mm256_slli_epi32(m1COshft, 31);
1878 m2COshft = _mm256_slli_epi32(m2COshft, 31);
1879
1880 m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
1881 m2A = _mm256_or_si256(m2A, m2COshft);
1882
1883 _mm256_store_si256(block_end, m1A);
1884 _mm256_store_si256(block_end-1, m2A);
1885
1886 mAcc = _mm256_or_si256(mAcc, m1A);
1887 mAcc = _mm256_or_si256(mAcc, m2A);
1888
1889 co1 = co2;
1890
1891 } // for
1892
1893 *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1894 return co1;
1895}
1896
1897
1898/*!
1899 @brief block shift right by 1
1900 @ingroup AVX2
1901*/
1902inline
1903bool avx2_shift_r1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
1904{
1905 const __m256i* block_end =
1906 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1907
1908 __m256i m1COshft, m2COshft;
1909 __m256i mAcc = _mm256_set1_epi32(0);
1910 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1911 unsigned co2;
1912
1913 for (;block < block_end; block+=2)
1914 {
1915 __m256i m1A = _mm256_load_si256(block);
1916 __m256i m2A = _mm256_load_si256(block+1);
1917
1918 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1919 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1920
1921 co2 = _mm256_extract_epi32(m1CO, 7);
1922
1923 m1A = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
1924 m2A = _mm256_slli_epi32(m2A, 1);
1925
1926 // shift CO flags using +1 permute indexes, add CO to v[0]
1927 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1928 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
1929
1930 co1 = co2;
1931
1932 co2 = _mm256_extract_epi32(m2CO, 7);
1933 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1934 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
1935
1936 m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
1937 m2A = _mm256_or_si256(m2A, m2COshft);
1938
1939 _mm256_store_si256(block, m1A);
1940 _mm256_store_si256(block+1, m2A);
1941
1942 mAcc = _mm256_or_si256(mAcc, m1A);
1943 mAcc = _mm256_or_si256(mAcc, m2A);
1944
1945 co1 = co2;
1946 } // for
1947
1948 *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1949 return co1;
1950}
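
// Editorial scalar sketch (not part of bmavx2.h) of the carry-over recurrence
// implemented above with 16 words per AVX2 iteration: every 32-bit word is
// shifted left by one, the previous word's top bit is carried in, and an OR
// accumulator records whether the block is still non-zero.
inline
bool shift_r1_scalar_sketch(bm::word_t* block, bm::word_t* empty_acc, unsigned co)
{
    bm::word_t acc = 0;
    for (unsigned i = 0; i < bm::set_block_size; ++i)
    {
        bm::word_t w = block[i];
        unsigned co_next = w >> 31;                // carry-out of this word
        acc |= block[i] = (w << 1) | co;           // carry-in from the previous word
        co = co_next;
    }
    *empty_acc = (acc != 0);
    return co;
}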
1951
1952
1953/*!
1954 @brief fused block shift right by 1 plus AND
1955 @ingroup AVX2
1956*/
1957
1958inline
1959bool avx2_shift_r1_and(__m256i* BMRESTRICT block,
1960 bm::word_t co1,
1961 const __m256i* BMRESTRICT mask_block,
1962 bm::id64_t* BMRESTRICT digest)
1963{
1964 BM_ASSERT(*digest);
1965
1966 bm::word_t* wblock = (bm::word_t*) block;
1967 const bm::word_t* mblock = (const bm::word_t*) mask_block;
1968
1969 __m256i m1COshft, m2COshft;
1970 __m256i mAcc = _mm256_set1_epi32(0);
1971 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1972 unsigned co2;
1973
1974 bm::id64_t d, wd;
1975 wd = d = *digest;
1976 unsigned di = co1 ? 0 : unsigned(_tzcnt_u64(d)); // get first set bit
1977 for (; di < 64 ; ++di)
1978 {
1979 const unsigned d_base = di * bm::set_block_digest_wave_size;
1980 const bm::id64_t dmask = (1ull << di);
1981 if (d & dmask) // digest stride NOT empty
1982 {
1983 mAcc = _mm256_xor_si256(mAcc, mAcc); // mAcc = 0
1984
1985 mask_block = (__m256i*) &mblock[d_base];
1986 _mm_prefetch ((const char*)mask_block, _MM_HINT_NTA);
1987
1988 block = (__m256i*) &wblock[d_base];
1989
1990 for (unsigned i = 0; i < 2; ++i, block += 2, mask_block += 2)
1991 {
1992 __m256i m1A = _mm256_load_si256(block);
1993 __m256i m2A = _mm256_load_si256(block+1);
1994
1995 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1996 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1997
1998 co2 = _mm256_extract_epi32(m1CO, 7);
1999
2000 m1A = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2001 m2A = _mm256_slli_epi32(m2A, 1);
2002
2003 __m256i m1M = _mm256_load_si256(mask_block);
2004 __m256i m2M = _mm256_load_si256(mask_block+1);
2005
2006 // shift CO flags using +1 permute indexes, add CO to v[0]
2007 m1COshft = _mm256_insert_epi32(
2008 _mm256_permutevar8x32_epi32(m1CO, mCOidx),
2009 co1, 0); // v[0] = co_flag
2010
2011 co1 = co2;
2012 co2 = _mm256_extract_epi32(m2CO, 7);
2013 m2COshft = _mm256_insert_epi32(
2014 _mm256_permutevar8x32_epi32(m2CO, mCOidx),
2015 co1, 0);
2016
2017 m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
2018 m2A = _mm256_or_si256(m2A, m2COshft);
2019
2020 m1A = _mm256_and_si256(m1A, m1M); // block[i] &= mask_block[i]
2021 m2A = _mm256_and_si256(m2A, m2M);
2022
2023 _mm256_store_si256(block, m1A);
2024 _mm256_store_si256(block+1, m2A);
2025
2026 mAcc = _mm256_or_si256(mAcc, m1A);
2027 mAcc = _mm256_or_si256(mAcc, m2A);
2028
2029 co1 = co2;
2030
2031 } // for i
2032
2033 if (_mm256_testz_si256(mAcc, mAcc)) // test if OR accum is zero
2034 d &= ~dmask; // clear the digest bit
2035
2036 wd = _blsr_u64(wd); // wd &= wd - 1; // reset lowest set bit
2037 }
2038 else // stride is empty
2039 {
2040 if (co1)
2041 {
2042 BM_ASSERT(co1 == 1);
2043 BM_ASSERT(wblock[d_base] == 0);
2044
2045 bm::id64_t w0 = wblock[d_base] = (co1 & mblock[d_base]);
2046 d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
2047 co1 = 0;
2048 }
2049 if (!wd) // digest is empty, no CO -> exit
2050 break;
2051 }
2052 } // for di
2053
2054 *digest = d;
2055 return co1;
2056}
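
// Editorial note on the fused kernel above: for every 32-bit word inside a
// selected digest wave it computes
//     block[i] = ((block[i] << 1) | carry_in) & mask_block[i]
// and clears digest bit 'di' when the whole wave becomes zero, so later
// passes over the digest can skip that wave entirely.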
2057
2058
2059
2060/*
2061inline
2062void avx2_i32_shift()
2063{
2064 unsigned shift_in = 80;
2065
2066 __m256i mTest = _mm256_set_epi32(70, 60, 50, 40, 30, 20, 10, 100);
2067 __m256i mIdx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2068
2069 __m256i m1shft = _mm256_permutevar8x32_epi32(mTest, mIdx);
2070 m1shft = _mm256_insert_epi32(m1shft, shift_in, 0);
2071
2072 avx2_print256("m1shft=", m1shft);
2073}
2074*/
2075
2076
2077
2078/*!
2079 AVX2 calculate number of bit changes from 0 to 1
2080 @ingroup AVX2
2081*/
2082inline
2083unsigned avx2_bit_block_calc_change(const __m256i* BMRESTRICT block,
2084 unsigned size)
2085{
2086 BM_AVX2_POPCNT_PROLOG;
2087
2088 const __m256i* block_end =
2089 (const __m256i*)((bm::word_t*)(block) + size);
2090
2091 __m256i m1COshft, m2COshft;
2092 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2093 __m256i cntAcc = _mm256_setzero_si256();
2094
2095 unsigned w0 = *((bm::word_t*)(block));
2096 unsigned count = 1;
2097
2098 bm::id64_t BM_ALIGN32 cnt_v[4] BM_ALIGN32ATTR;
2099
2100 unsigned co2, co1 = 0;
2101 for (;block < block_end; block+=2)
2102 {
2103 __m256i m1A = _mm256_load_si256(block);
2104 __m256i m2A = _mm256_load_si256(block+1);
2105
2106 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2107 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2108
2109 co2 = _mm256_extract_epi32(m1CO, 7);
2110
2111 __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2112 __m256i m2As = _mm256_slli_epi32(m2A, 1);
2113
2114 // shift CO flags using +1 permute indexes, add CO to v[0]
2115 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2116 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
2117
2118 co1 = co2;
2119
2120 co2 = _mm256_extract_epi32(m2CO, 7);
2121 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2122 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2123
2124 m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2125 m2As = _mm256_or_si256(m2As, m2COshft);
2126
2127 co1 = co2;
2128
2129 // we now have two shifted AVX2 regs with carry-over
2130 m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2131 m2A = _mm256_xor_si256(m2A, m2As);
2132
2133 {
2134 BM_AVX2_BIT_COUNT(bc, m1A)
2135 cntAcc = _mm256_add_epi64(cntAcc, bc);
2136 BM_AVX2_BIT_COUNT(bc, m2A)
2137 cntAcc = _mm256_add_epi64(cntAcc, bc);
2138 }
2139 } // for
2140
2141 // horizontal count sum
2142 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2143 count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2144
2145 count -= (w0 & 1u); // correct initial carry-in error
2146 return count;
2147}
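
// Editorial scalar sketch (not part of bmavx2.h) of the change count computed
// above: the number of 0-1 / 1-0 transitions in the block is
//     1 + popcount(w ^ ((w << 1) | carry))
// summed over all words, where 'carry' propagates the top bit of the previous
// word so transitions across word boundaries are counted exactly once.
inline
unsigned bit_block_calc_change_scalar_sketch(const bm::word_t* block, unsigned size)
{
    unsigned count = 1;
    unsigned co = block[0] & 1u;   // seed so bit 0 of the first word adds no false change
    for (unsigned i = 0; i < size; ++i)
    {
        bm::word_t w = block[i];
        count += unsigned(_mm_popcnt_u32(w ^ ((w << 1) | co)));
        co = w >> 31;
    }
    return count;
}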
2148
2149/*!
2150 AVX2 calculate number of bit changes from 0 to 1 from an XOR product
2151 @ingroup AVX2
2152*/
2153inline
2154void avx2_bit_block_calc_xor_change(const __m256i* BMRESTRICT block,
2155                                    const __m256i* BMRESTRICT xor_block,
2156 unsigned size,
2157 unsigned* BMRESTRICT gcount,
2158 unsigned* BMRESTRICT bcount)
2159{
2160 BM_AVX2_POPCNT_PROLOG;
2161
2162 const __m256i* BMRESTRICT block_end =
2163 (const __m256i*)((bm::word_t*)(block) + size);
2164
2165 __m256i m1COshft, m2COshft;
2166 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2167
2168 __m256i cntAcc = _mm256_setzero_si256();
2169 __m256i cntAcc2 = _mm256_setzero_si256();
2170
2171 unsigned w0 = *((bm::word_t*)(block));
2172 unsigned bit_count = 0;
2173 unsigned gap_count = 1;
2174
2175 bm::id64_t BM_ALIGN32 cnt_v[4] BM_ALIGN32ATTR;
2176
2177 unsigned co2, co1 = 0;
2178 for (;block < block_end; block+=2, xor_block+=2)
2179 {
2180 __m256i m1A = _mm256_load_si256(block);
2181 __m256i m2A = _mm256_load_si256(block+1);
2182 __m256i m1B = _mm256_load_si256(xor_block);
2183 __m256i m2B = _mm256_load_si256(xor_block+1);
2184
2185 m1A = _mm256_xor_si256 (m1A, m1B);
2186 m2A = _mm256_xor_si256 (m2A, m2B);
2187
2188 {
2189 BM_AVX2_BIT_COUNT(bc, m1A)
2190 cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2191 BM_AVX2_BIT_COUNT(bc, m2A)
2192 cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2193 }
2194
2195 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2196 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2197
2198 co2 = _mm256_extract_epi32(m1CO, 7);
2199
2200 __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2201 __m256i m2As = _mm256_slli_epi32(m2A, 1);
2202
2203 // shift CO flags using +1 permute indexes, add CO to v[0]
2204 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2205 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
2206
2207 co1 = co2;
2208
2209 co2 = _mm256_extract_epi32(m2CO, 7);
2210 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2211 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2212
2213 m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2214 m2As = _mm256_or_si256(m2As, m2COshft);
2215
2216 co1 = co2;
2217
2218 // we now have two shifted AVX2 regs with carry-over
2219 m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2220 m2A = _mm256_xor_si256(m2A, m2As);
2221
2222 {
2223 BM_AVX2_BIT_COUNT(bc, m1A)
2224 cntAcc = _mm256_add_epi64(cntAcc, bc);
2225 BM_AVX2_BIT_COUNT(bc, m2A)
2226 cntAcc = _mm256_add_epi64(cntAcc, bc);
2227 }
2228 } // for
2229
2230 // horizontal count sum
2231 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2232 gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2233 gap_count -= (w0 & 1u); // correct initial carry-in error
2234 if (!gap_count)
2235 ++gap_count; // always >0
2236
2237 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc2);
2238 bit_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2239
2240 *gcount = gap_count;
2241 *bcount = bit_count;
2242}
2243
2244
2245
2246/*!
2247 AVX2 calculate number of bit changes from 0 to 1 and bitcount
2248 @ingroup AVX2
2249*/
2250inline
2251void avx2_bit_block_calc_change_bc(const __m256i* BMRESTRICT block,
2252                                   unsigned* gcount, unsigned* bcount)
2253{
2254 BM_AVX2_POPCNT_PROLOG;
2255
2256 const __m256i* block_end =
2257 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
2258
2259 __m256i m1COshft, m2COshft;
2260 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2261 __m256i cntAcc = _mm256_setzero_si256();
2262
2263 unsigned w0 = *((bm::word_t*)(block));
2264 unsigned bit_count = 0;
2265 unsigned gap_count = 1;
2266
2267 bm::id64_t BM_ALIGN32 cnt_v[4] BM_ALIGN32ATTR;
2268
2269 unsigned co2, co1 = 0;
2270 for (;block < block_end; block+=2)
2271 {
2272 __m256i m1A = _mm256_load_si256(block);
2273 __m256i m2A = _mm256_load_si256(block+1);
2274
2275 // popcount
2276 {
2277 bm::id64_t* b64 = (bm::id64_t*)block;
2278
2279 bit_count += (unsigned) (_mm_popcnt_u64(b64[0]) + _mm_popcnt_u64(b64[1]));
2280 bit_count += (unsigned)(_mm_popcnt_u64(b64[2]) + _mm_popcnt_u64(b64[3]));
2281
2282 bit_count += (unsigned)(_mm_popcnt_u64(b64[4]) + _mm_popcnt_u64(b64[5]));
2283 bit_count += (unsigned)(_mm_popcnt_u64(b64[6]) + _mm_popcnt_u64(b64[7]));
2284 }
2285
2286 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2287 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2288
2289 co2 = _mm256_extract_epi32(m1CO, 7);
2290
2291 __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2292 __m256i m2As = _mm256_slli_epi32(m2A, 1);
2293
2294 // shift CO flags using +1 permute indexes, add CO to v[0]
2295 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2296 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
2297
2298 co1 = co2;
2299
2300 co2 = _mm256_extract_epi32(m2CO, 7);
2301 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2302 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2303
2304 m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2305 m2As = _mm256_or_si256(m2As, m2COshft);
2306
2307 co1 = co2;
2308
2309 // we now have two shifted AVX2 regs with carry-over
2310 m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2311 m2A = _mm256_xor_si256(m2A, m2As);
2312
2313 {
2314 BM_AVX2_BIT_COUNT(bc, m1A)
2315 cntAcc = _mm256_add_epi64(cntAcc, bc);
2316 BM_AVX2_BIT_COUNT(bc, m2A)
2317 cntAcc = _mm256_add_epi64(cntAcc, bc);
2318 }
2319 } // for
2320
2321 // horizontal count sum
2322 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2323 gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2324 gap_count -= (w0 & 1u); // correct initial carry-in error
2325
2326 *gcount = gap_count;
2327 *bcount = bit_count;
2328}
2329
2330
2331/*!
2332 \brief Find first bit which is different between two bit-blocks
2333 @ingroup AVX2
2334*/
2335inline
2336bool avx2_bit_find_first_diff(const __m256i* BMRESTRICT block1,
2337 const __m256i* BMRESTRICT block2,
2338 unsigned* pos)
2339{
2340 unsigned BM_ALIGN32 simd_buf[8] BM_ALIGN32ATTR;
2341
2342 const __m256i* block1_end =
2343 (const __m256i*)((bm::word_t*)(block1) + bm::set_block_size);
2344 __m256i maskZ = _mm256_setzero_si256();
2345 __m256i mA, mB;
2346 unsigned simd_lane = 0;
2347 do
2348 {
2349 mA = _mm256_xor_si256(_mm256_load_si256(block1),
2350 _mm256_load_si256(block2));
2351 mB = _mm256_xor_si256(_mm256_load_si256(block1+1),
2352 _mm256_load_si256(block2+1));
2353 __m256i mOR = _mm256_or_si256(mA, mB);
2354 if (!_mm256_testz_si256(mOR, mOR)) // test 2x256 lanes
2355 {
2356 if (!_mm256_testz_si256(mA, mA))
2357 {
2358 // invert to find (w != 0)
2359 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2360 BM_ASSERT(mask);
2361 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use _tzcnt_u32())
2362 _mm256_store_si256 ((__m256i*)simd_buf, mA);
2363 unsigned widx = bsf >> 2; // (bsf / 4);
2364 unsigned w = simd_buf[widx];// _mm256_extract_epi32 (mA, widx);
2365 bsf = bm::bsf_asm32(w); // find first bit != 0
2366 *pos = (simd_lane * 256) + (widx * 32) + bsf;
2367 return true;
2368 }
2369 // invert to find (w != 0)
2370 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2371 BM_ASSERT(mask);
2372 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use _tzcnt_u32())
2373 _mm256_store_si256 ((__m256i*)simd_buf, mB);
2374 unsigned widx = bsf >> 2; // (bsf / 4);
2375 unsigned w = simd_buf[widx];// _mm256_extract_epi32 (mB, widx);
2376 bsf = bm::bsf_asm32(w); // find first bit != 0
2377 *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
2378 return true;
2379 }
2380
2381 simd_lane+=2;
2382 block1+=2; block2+=2;
2383
2384 } while (block1 < block1_end);
2385 return false;
2386}
2387
2388
2389/*!
2390 \brief Find first bit set
2391 @ingroup AVX2
2392*/
2393inline
2394bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned off, unsigned* pos)
2395{
2396 unsigned BM_ALIGN32 simd_buf[8] BM_ALIGN32ATTR;
2397
2398 block = (const __m256i*)((bm::word_t*)(block) + off);
2399 const __m256i* block_end =
2400 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
2401 __m256i maskZ = _mm256_setzero_si256();
2402 __m256i mA, mB;
2403 unsigned simd_lane = 0;
2404 do
2405 {
2406 mA = _mm256_load_si256(block); mB = _mm256_load_si256(block+1);
2407 __m256i mOR = _mm256_or_si256(mA, mB);
2408 if (!_mm256_testz_si256(mOR, mOR)) // test 2x256 lanes
2409 {
2410 if (!_mm256_testz_si256(mA, mA))
2411 {
2412 // invert to find (w != 0)
2413 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2414 BM_ASSERT(mask);
2415 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use _tzcnt_u32())
2416 _mm256_store_si256 ((__m256i*)simd_buf, mA);
2417 unsigned widx = bsf >> 2; // (bsf / 4);
2418 unsigned w = simd_buf[widx];
2419 bsf = bm::bsf_asm32(w); // find first bit != 0
2420 *pos = (off * 32) + (simd_lane * 256) + (widx * 32) + bsf;
2421 return true;
2422 }
2423 // invert to find (w != 0)
2424 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2425 BM_ASSERT(mask);
2426 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use _tzcnt_u32())
2427 _mm256_store_si256 ((__m256i*)simd_buf, mB);
2428 unsigned widx = bsf >> 2; // (bsf / 4);
2429 unsigned w = simd_buf[widx];
2430 bsf = bm::bsf_asm32(w); // find first bit != 0
2431 *pos = (off * 32) + ((++simd_lane) * 256) + (widx * 32) + bsf;
2432 return true;
2433 }
2434
2435 simd_lane+=2;
2436 block+=2;
2437
2438 } while (block < block_end);
2439 return false;
2440}
2441
2442
2443
2444/* @brief Gap block population count (array sum) utility
2445 @param pbuf - unrolled, aligned to 1-start GAP buffer
2446 @param avx_vect_waves - number of AVX vector lines to process
2447 @param sum - result accumulator
2448 @return tail pointer
2449
2450 @internal
2451*/
2452inline
2453const bm::gap_word_t* avx2_gap_sum_arr(const bm::gap_word_t* pbuf,
2454                                       unsigned avx_vect_waves,
2455 unsigned* sum)
2456{
2457 __m256i xcnt = _mm256_setzero_si256();
2458
2459 // accumulate odd and even elements of the vector; the result is
2460 // correct modulo 2^16 (max element value in GAP blocks is 65535),
2461 // so overflow is not an issue here
2462 for (unsigned i = 0; i < avx_vect_waves; ++i)
2463 {
2464 __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
2465 __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
2466 __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
2467 xcnt = _mm256_add_epi16(xcnt, ymm_s2);
2468 pbuf += 32;
2469 }
2470 // odd minus even vector elements clears the result for 1111 blocks
2471 // bsrli - byte shifts the vector element by 2 bytes (1 short int)
2472 xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);
2473
2474 // horizontal sum of vector elements
2475 // cnt16[0] + cnt16[2] + cnt16[4] + cnt16[6] + cnt16[8] + cnt16[10] + cnt16[12] + cnt16[14];
2476 //
2477 xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
2478 xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
2479 __m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));
2480
2481 // extract 32-bit word and mask to take first 16 bits
2482 *sum += _mm_cvtsi128_si32(xcnt2) & 0xffff;
2483 return pbuf;
2484}
2485
2486
2487/*!
2488 AVX2 index lookup to check what belongs to the same block (8 elements)
2489 \internal
2490*/
2491inline
2492unsigned avx2_idx_arr_block_lookup(const unsigned* idx, unsigned size,
2493 unsigned nb, unsigned start)
2494{
2495 const unsigned unroll_factor = 16;
2496 const unsigned len = (size - start);
2497 const unsigned len_unr = len - (len % unroll_factor);
2498 unsigned k;
2499
2500 idx += start;
2501
2502 __m256i nbM = _mm256_set1_epi32(int(nb));
2503
2504 for (k = 0; k < len_unr; k+=unroll_factor)
2505 {
2506 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2507 __m256i nbA = _mm256_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
2508
2509 __m256i wcmpA= _mm256_cmpeq_epi8(nbM, nbA);
2510 if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
2511 break;
2512 __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
2513 __m256i nbB = _mm256_srli_epi32(idxB, bm::set_block_shift);
2514
2515 __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
2516 if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
2517 break;
2518 } // for k
2519 for (; k < len; ++k)
2520 {
2521 if (nb != unsigned(idx[k] >> bm::set_block_shift))
2522 break;
2523 } // for k
2524 return start + k;
2525}
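
// Editorial scalar equivalent (not part of bmavx2.h) of the lookup above; this
// is exactly what the tail loop already does and shows the contract: return the
// index of the first element that falls outside block 'nb'.
inline
unsigned idx_arr_block_lookup_scalar_sketch(const unsigned* idx, unsigned size,
                                            unsigned nb, unsigned start)
{
    unsigned k = start;
    for (; k < size && (idx[k] >> bm::set_block_shift) == nb; ++k)
    {}
    return k;
}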
2526
2527
2528/*!
2529 AVX2 bulk bit set
2530 \internal
2531*/
2532inline
2533void avx2_set_block_bits(bm::word_t* BMRESTRICT block,
2534                         const unsigned* BMRESTRICT idx,
2535 unsigned start, unsigned stop )
2536{
2537 const unsigned unroll_factor = 8;
2538 const unsigned len = (stop - start);
2539 const unsigned len_unr = len - (len % unroll_factor);
2540
2541 idx += start;
2542
2543 __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
2544 __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
2545 __m256i mask1 = _mm256_set1_epi32(1);
2546 __m256i mask_tmp;
2547
2548 unsigned BM_ALIGN32 mask_v[8] BM_ALIGN32ATTR;
2549 unsigned BM_ALIGN32 mword_v[8] BM_ALIGN32ATTR;
2550
2551 unsigned k = 0, mask, w_idx;
2552 for (; k < len_unr; k+=unroll_factor)
2553 {
2554 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2555 __m256i nbitA = _mm256_and_si256 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
2556 __m256i nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
2557
2558 nbitA = _mm256_and_si256 (nbitA, sw_mask); // nbit &= bm::set_word_mask;
2559
2560 __m256i maskA = _mm256_sllv_epi32(mask1, nbitA); // (1 << nbit)
2561
2562 _mm256_store_si256 ((__m256i*)mword_v, nwordA); // store block word idxs
2563
2564 // shuffle + permute to prepare comparison vector
2565 mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
2566 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2567 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
2568 if (mask == ~0u) // all idxs belong to the same word
2569 {
2570 w_idx = mword_v[0];
2571 mask_tmp = _mm256_xor_si256 (mask_tmp, mask_tmp); // zero bits
2572 mask_tmp = _mm256_or_si256 (mask_tmp, maskA); // set bits
2573
2574 // horizontal OR via permutation of two 128-bit lanes
2575 // then byte-shifts + OR within the lower 128
2576 __m256i mtmp0 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
2577 __m256i mtmp1 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 1);
2578 mask_tmp = _mm256_or_si256 (mtmp0, mtmp1);
2579 mtmp0 = _mm256_bsrli_epi128(mask_tmp, 4); // shift R by 1 int
2580 mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2581 mtmp0 = _mm256_bsrli_epi128(mask_tmp, 8); // shift R by 2 ints
2582 mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2583
2584 int u0 = _mm256_extract_epi32(mask_tmp, 0); // final OR
2585 block[w_idx] |= u0;
2586 }
2587 else // whole 256-bit lane does NOT hit the same word...
2588 {
2589 _mm256_store_si256 ((__m256i*)mask_v, maskA);
2590
2591 // compute horizontal OR of set bit mask over lo-hi 128-bit lanes
2592 // it is used later if lo or hi lanes hit the same word
2593 // (probabilistic speculation)
2594 //
2595 int u0, u4;
2596 {
2597 mask_tmp = _mm256_bsrli_epi128(maskA, 4); // shift R by 1 int
2598 mask_tmp = _mm256_or_si256 (mask_tmp, maskA);
2599 __m256i m0 = _mm256_bsrli_epi128(mask_tmp, 8); // shift R by 2 ints
2600 mask_tmp = _mm256_or_si256 (m0, mask_tmp);
2601
2602 u0 = _mm256_extract_epi32(mask_tmp, 0); // final OR (128-lo)
2603 u4 = _mm256_extract_epi32(mask_tmp, 4); // final OR (128-hi)
2604 }
2605
2606 // check the lo 128-lane
2607 {
2608 mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 0); // lo
2609 __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0); // copy simd[0]
2610 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2611 if (mask == ~0u) // all idxs belong to the same word
2612 {
2613 w_idx = mword_v[0];
2614 block[w_idx] |= u0;
2615 }
2616 else // different block words: use "shotgun" OR
2617 {
2618 block[mword_v[0]] |= mask_v[0];
2619 block[mword_v[1]] |= mask_v[1];
2620 block[mword_v[2]] |= mask_v[2];
2621 block[mword_v[3]] |= mask_v[3];
2622
2623 }
2624 }
2625
2626 // check the hi 128-lane
2627 {
2628 mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 1); // hi
2629 __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
2630 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2631 if (mask == ~0u) // all idxs belong to the same word
2632 {
2633 w_idx = mword_v[4];
2634 block[w_idx] |= u4;
2635 }
2636 else
2637 {
2638 block[mword_v[4]] |= mask_v[4];
2639 block[mword_v[5]] |= mask_v[5];
2640 block[mword_v[6]] |= mask_v[6];
2641 block[mword_v[7]] |= mask_v[7];
2642 }
2643 }
2644 }
2645 } // for k
2646
2647 for (; k < len; ++k)
2648 {
2649 unsigned n = idx[k];
2650 unsigned nbit = unsigned(n & bm::set_block_mask);
2651 unsigned nword = nbit >> bm::set_word_shift;
2652 nbit &= bm::set_word_mask;
2653 block[nword] |= (1u << nbit);
2654 } // for k
2655}
2656
2657
2658/** Set bits in an AVX target, by 8 int32 indexes from the source
2659 @internal
2660*/
2661BMFORCEINLINE
2662__m256i avx2_setbit_256(__m256i target, __m256i source)
2663{
2664 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2665 __m256i mask1 = _mm256_set1_epi32(1);
2666
2667 __m256i v0, v1, acc1, acc2;
2668 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(0));
2669 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(1));
2670 v0 = _mm256_sub_epi32(v0, stride_idx);
2671 v1 = _mm256_sub_epi32(v1, stride_idx);
2672 v0 = _mm256_sllv_epi32(mask1, v0);
2673 v1 = _mm256_sllv_epi32(mask1, v1);
2674 acc1 = _mm256_or_si256(v1, v0);
2675 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(2));
2676 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(3));
2677 v0 = _mm256_sub_epi32(v0, stride_idx);
2678 v1 = _mm256_sub_epi32(v1, stride_idx);
2679 v0 = _mm256_sllv_epi32(mask1, v0);
2680 v1 = _mm256_sllv_epi32(mask1, v1);
2681 acc2 = _mm256_or_si256(v1, v0);
2682 target = _mm256_or_si256(target, acc1);
2683 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(4));
2684 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(5));
2685 v0 = _mm256_sub_epi32(v0, stride_idx);
2686 v1 = _mm256_sub_epi32(v1, stride_idx);
2687 v0 = _mm256_sllv_epi32(mask1, v0);
2688 v1 = _mm256_sllv_epi32(mask1, v1);
2689 acc1 = _mm256_or_si256(v1, v0);
2690 target = _mm256_or_si256(target, acc2);
2691 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(6));
2692 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(7));
2693 v0 = _mm256_sub_epi32(v0, stride_idx);
2694 v1 = _mm256_sub_epi32(v1, stride_idx);
2695 v0 = _mm256_sllv_epi32(mask1, v0);
2696 v1 = _mm256_sllv_epi32(mask1, v1);
2697 acc2 = _mm256_or_si256(v1, v0);
2698
2699 target = _mm256_or_si256(target, acc1);
2700 target = _mm256_or_si256(target, acc2);
2701 return target;
2702}
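
// Editorial scalar sketch (not part of bmavx2.h) of avx2_setbit_256() semantics:
// each of the eight 32-bit indexes in 'source' (expected range 0..255) selects
// one bit of the 256-bit target; in the SIMD version non-owning lanes produce 0
// because _mm256_sllv_epi32 yields zero for shift counts of 32 and above.
inline
void setbit_256_scalar_sketch(unsigned target[8], const unsigned src[8])
{
    for (unsigned j = 0; j < 8; ++j)
        target[src[j] >> 5] |= (1u << (src[j] & 31u));
}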
2703
2704
2705/** Experimental code to set bits via AVX strides
2706 @internal
2707*/
2708inline
2709void avx2_set_block_bits2(bm::word_t* BMRESTRICT block,
2710                          const unsigned* BMRESTRICT idx,
2711 unsigned start, unsigned stop )
2712{
2713 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2714 __m256i mask1 = _mm256_set1_epi32(1);
2715 __m256i* block_avx = (__m256i*)block;
2716
2717 unsigned stride = 0;
2718 __m256i* avx_stride_p = block_avx + stride;
2719 __m256i blkA = _mm256_load_si256(avx_stride_p);
2720
2721 for (unsigned i = start; i < stop; ++i)
2722 {
2723 unsigned n = idx[i];
2724 unsigned nbit = unsigned(n & bm::set_block_mask);
2725 unsigned new_stride = nbit >> 8; // (nbit / 256)
2726 unsigned stride_bit = nbit & 0xFF; // (nbit % 256)
2727 if (new_stride != stride)
2728 {
2729 _mm256_store_si256(avx_stride_p, blkA); // flush the avx2 accum
2730 stride = new_stride;
2731 avx_stride_p = block_avx + stride;
2732 blkA = _mm256_load_si256(avx_stride_p); // re-load the accum
2733 }
2734 // set avx2 stride bit
2735 __m256i v0 = _mm256_set1_epi32(stride_bit);
2736 __m256i s0 = _mm256_sub_epi32(v0, stride_idx);
2737 __m256i k0 = _mm256_sllv_epi32(mask1, s0);
2738 blkA = _mm256_or_si256(blkA, k0);
2739 } // for i
2740
2741 _mm256_store_si256(avx_stride_p, blkA);
2742}
2743
2744/** Experimental code to set bits via AVX strides
2745 @internal
2746*/
2747inline
2748void avx2_set_block_bits3(bm::word_t* BMRESTRICT block,
2749                          const unsigned* BMRESTRICT idx,
2750 unsigned start, unsigned stop )
2751{
2752 const unsigned unroll_factor = 8;
2753 const unsigned len = (stop - start);
2754 const unsigned len_unr = len - (len % unroll_factor);
2755
2756 idx += start;
2757
2758 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2759 __m256i mask1 = _mm256_set1_epi32(1);
2760
2761 __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
2762 __m256i stride_bit_mask = _mm256_set1_epi32(0xFF);
2763
2764 unsigned BM_ALIGN32 mstride_v[8] BM_ALIGN32ATTR;
2765 int BM_ALIGN32 mstride_bit_v[8] BM_ALIGN32ATTR;
2766
2767 // define the very first block stride based on index 0
2768 unsigned stride = unsigned(idx[0] & bm::set_block_mask) >> 8;
2769
2770 __m256i* block_avx = (__m256i*)block;
2771 __m256i* avx_stride_p = block_avx + stride;
2772
2773 __m256i blkA = _mm256_load_si256(avx_stride_p); // load the first accum
2774
2775 unsigned k = 0, mask;
2776 for (; k < len_unr; k+=unroll_factor)
2777 {
2778 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2779 __m256i nbitA = _mm256_and_si256 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
2780 __m256i strideA = _mm256_srli_epi32 (nbitA, 8); // new_stride = nbit >> 8
2781 __m256i strideBitA = _mm256_and_si256 (nbitA, stride_bit_mask); // stride_bit = nbit & 0xFF;
2782
2783 // construct a cmp vector from broadcasted v[0]
2784 __m256i mask_tmp = _mm256_shuffle_epi32 (strideA, 0x0);
2785 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2786 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, strideA));
2787 if (mask == ~0u) // all idxs belong to the same avx2 stride
2788 {
2789 unsigned new_stride = (unsigned)_mm256_extract_epi32(strideA, 0);
2790 if (new_stride != stride)
2791 {
2792 _mm256_store_si256(avx_stride_p, blkA); // flush avx2 accum
2793 stride = new_stride;
2794 avx_stride_p = block_avx + stride;
2795 blkA = _mm256_load_si256(avx_stride_p); // re-load accum
2796 }
2797 // set 8 bits all at once
2798 blkA = bm::avx2_setbit_256(blkA, strideBitA);
2799 }
2800 else // stride mix here, process one by one
2801 {
2802 _mm256_store_si256 ((__m256i*)mstride_bit_v, strideBitA); // store block stride-bit idxs
2803 _mm256_store_si256 ((__m256i*)mstride_v, strideA);
2804 for (unsigned j = 0; j < 8; ++j)
2805 {
2806 unsigned new_stride = mstride_v[j];
2807 if (new_stride != stride)
2808 {
2809 _mm256_store_si256(avx_stride_p, blkA); // flush avx2 accum
2810 stride = new_stride;
2811 avx_stride_p = block_avx + stride;
2812 blkA = _mm256_load_si256(avx_stride_p); // re-load accum
2813 }
2814 // set avx2 bits one by one
2815 mask_tmp = _mm256_set1_epi32(mstride_bit_v[j]);
2816 mask_tmp = _mm256_sub_epi32(mask_tmp, stride_idx);
2817 mask_tmp = _mm256_sllv_epi32(mask1, mask_tmp);
2818 blkA = _mm256_or_si256(blkA, mask_tmp);
2819 } // for j
2820 }
2821 } // for k
2822 _mm256_store_si256(avx_stride_p, blkA);
2823
2824 // set the tail bits conventionally
2825 for (; k < len; ++k)
2826 {
2827 unsigned n = idx[k];
2828 unsigned nbit = unsigned(n & bm::set_block_mask);
2829 unsigned nword = nbit >> bm::set_word_shift;
2830 nbit &= bm::set_word_mask;
2831 block[nword] |= (1u << nbit);
2832 } // for k
2833}
2834
2835
2836/**
2837 Experimental. Set number of bits in AVX register from 0 to i
2838 [ 000000 00000 0000000 00011 11111 ] - i = 7
2839*/
2840inline
2841__m256i avx2_setbit_to256(unsigned i)
2842{
2843 __m256i stride_idx1 = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2844 __m256i stride_idx2 = _mm256_add_epi32(stride_idx1, _mm256_set1_epi32(32));
2845 __m256i maskFF = _mm256_set1_epi32(-1);
2846 __m256i maskZ = _mm256_setzero_si256();
2847
2848 __m256i v0 = _mm256_set1_epi32(i);
2849 __m256i s0 = _mm256_sub_epi32(v0, stride_idx1);
2850 __m256i k1 = _mm256_sllv_epi32(maskFF, s0);
2851
2852 {
2853 __m256i cmp_eq = _mm256_cmpeq_epi32(k1, maskZ);
2854 cmp_eq = _mm256_xor_si256(maskFF, cmp_eq); // invert: != 0 mask
2855 k1 = _mm256_xor_si256(k1, cmp_eq); // [ 0 0 0 0 0 0 3 0 ]
2856 }
2857
2858 __m256i cmp_gt = _mm256_cmpgt_epi32 (stride_idx2, v0);
2859 cmp_gt = _mm256_xor_si256(maskFF, cmp_gt); // invert as GT == LT|EQ (LE)
2860 __m256i r = _mm256_xor_si256(k1, cmp_gt); // invert all full words (right)
2861
2862 return r;
2863}
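
// Editorial scalar sketch (not part of bmavx2.h) of avx2_setbit_to256() as the
// code above reads: build a 256-bit mask with the lowest i bits set (bits [0, i)),
// full 32-bit words below the owning lane and a partial mask inside it.
inline
void setbit_to256_scalar_sketch(unsigned r[8], unsigned i)
{
    for (unsigned w = 0; w < 8; ++w)
    {
        unsigned lo = w * 32u;
        r[w] = (i >= lo + 32u) ? ~0u
                               : (i > lo) ? ((1u << (i - lo)) - 1u) : 0u;
    }
}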
2864
2865
2866
2867/**
2868 Experimental (test) function to do SIMD vector search (lower bound)
2869 in sorted, growing array
2870 @ingroup AVX2
2871
2872 \internal
2873*/
2874inline
2875int avx2_cmpge_u32(__m256i vect8, unsigned value)
2876{
2877 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
2878 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
2879 //
2880 __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
2881 __m256i mm_val = _mm256_set1_epi32(value);
2882
2883 __m256i norm_vect8 = _mm256_sub_epi32(vect8, mask0x8); // (signed) vect4 - 0x80000000
2884 __m256i norm_val = _mm256_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
2885
2886 __m256i cmp_mask_gt = _mm256_cmpgt_epi32(norm_vect8, norm_val);
2887 __m256i cmp_mask_eq = _mm256_cmpeq_epi32(mm_val, vect8);
2888
2889 __m256i cmp_mask_ge = _mm256_or_si256(cmp_mask_gt, cmp_mask_eq);
2890 int mask = _mm256_movemask_epi8(cmp_mask_ge);
2891 if (mask)
2892 {
2893 int bsf = bm::bsf_asm32(mask); // could use _tzcnt_u32()
2894 return bsf / 4;
2895 }
2896 return -1;
2897}
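
// Editorial scalar sketch (not part of bmavx2.h) of avx2_cmpge_u32(): return the
// position (0..7) of the first vector element that is greater or equal to the
// searched value, or -1 when there is none; this is the in-register building
// block of the lower-bound scans below.
inline
int cmpge_u32_scalar_sketch(const unsigned v[8], unsigned value)
{
    for (int i = 0; i < 8; ++i)
        if (v[i] >= value)
            return i;
    return -1;
}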
2898
2899/**
2900 Experimental (test) function to do SIMD vector search
2901 in sorted, growing array
2902 @ingroup AVX2
2903
2904 \internal
2905*/
2906inline
2907int avx2_cmpge_u16(__m256i vect16, unsigned short value)
2908{
2909 __m256i mZ = _mm256_setzero_si256();
2910 __m256i mVal = _mm256_set1_epi16(value);
2911
2912 // subs_epu16 - unsigned subtraction with saturation, gives 0u if (a - b) < 0
2913 __m256i mSub = _mm256_subs_epu16(mVal, vect16);
2914 __m256i mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
2915 unsigned mask = _mm256_movemask_epi8(mge_mask);
2916 if (mask)
2917 {
2918 int lz = _tzcnt_u32(mask);
2919 return lz / 2;
2920 }
2921 return -1;
2922}
2923
2924
2925/**
2926 Hybrid binary search, starts as binary, then switches to scan
2927
2928 NOTE: AVX code uses _mm256_subs_epu16 - saturated subtraction,
2929 which yields 0 when A <= B (instead of a negative value).
2930
2931 \param buf - GAP buffer pointer.
2932 \param pos - index of the element.
2933 \param is_set - output. GAP value (0 or 1).
2934 \return GAP index OR bit-test
2935
2936 @ingroup AVX2
2937*/
2938template<bool RET_TEST=false>
2939unsigned avx2_gap_bfind(const unsigned short* BMRESTRICT buf,
2940 unsigned pos, unsigned* BMRESTRICT is_set)
2941{
2942 BM_ASSERT(is_set || RET_TEST);
2943
2944 const unsigned linear_cutoff = 64;//48;
2945 const unsigned unroll_factor = 16;
2946
2948
2949 unsigned res;
2950 unsigned start = 1;
2951 unsigned end = ((*buf) >> 3);
2952
2953 const unsigned arr_end = end + 1;
2954 if (end <= unroll_factor) // too small for a full AVX stride
2955 {
2956 for (; true; ++start)
2957 if (buf[start] >= pos)
2958 goto ret;
2959 BM_ASSERT(0);
2960 }
2961
2962 do
2963 {
2964 unsigned dsize = end - start;
2965 for (; dsize >= 64; dsize = end - start)
2966 {
2967 unsigned mid = (start + end) >> 1;
2968 if (buf[mid] < pos)
2969 start = mid+1;
2970 else
2971 end = mid;
2972 if (buf[mid = (start + end) >> 1] < pos)
2973 start = mid+1;
2974 else
2975 end = mid;
2976 if (buf[mid = (start + end) >> 1] < pos)
2977 start = mid+1;
2978 else
2979 end = mid;
2980 if (buf[mid = (start + end) >> 1] < pos)
2981 start = mid+1;
2982 else
2983 end = mid;
2984 BM_ASSERT(buf[end] >= pos);
2985 } // for
2986
2987 dsize = end - start + 1;
2988 if (dsize < linear_cutoff)
2989 {
2990 // set wider scan window to possibly over-read the range,
2991 // but stay within allocated block memory
2992 //
2993 dsize = arr_end - start;
2994
2995 __m256i mZ = _mm256_setzero_si256();
2996 __m256i mPos = _mm256_set1_epi16((unsigned short)pos);
2997 __m256i vect16, mSub, mge_mask;
2998
2999 for (unsigned len_unr = start + (dsize - (dsize % unroll_factor));
3000 start < len_unr; start += unroll_factor)
3001 {
3002 vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); //16x u16s
3003 mSub = _mm256_subs_epu16(mPos, vect16);
3004 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
3005 if (int mask = _mm256_movemask_epi8(mge_mask); mask)
3006 {
3007 int lz = _tzcnt_u32(mask);
3008 start += (lz >> 1);
3009 goto ret;
3010 }
3011 } // for
3012// if (unsigned tail = unroll_factor-(end-start); start > tail+1)
3013 {
3014 start = end - 15;
3015 BM_ASSERT(buf[start + 15] >= pos);
3016 vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); //16x u16s
3017 mSub = _mm256_subs_epu16(mPos, vect16);
3018 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
3019 int mask = _mm256_movemask_epi8(mge_mask);
3020 BM_ASSERT(mask); // the result MUST be here at this point
3021 int lz = _tzcnt_u32(mask);
3022 start += (lz >> 1);
3023 goto ret;
3024 }
3025 for (; true; ++start)
3026 if (buf[start] >= pos)
3027 goto ret;
3028 BM_ASSERT(0);
3029 }
3030
3031 if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
3032 start = mid + 1;
3033 else
3034 end = mid;
3035 if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
3036 start = mid + 1;
3037 else
3038 end = mid;
3039 } while (1);
3040ret:
3041 res = ((*buf) & 1) ^ ((start-1) & 1);
3042 if constexpr(RET_TEST)
3043 return res;
3044 else
3045 {
3046 *is_set = res;
3047 return start;
3048 }
3049}
3050
3051
3052/**
3053 Hybrid binary search, starts as binary, then switches to scan
3054 @ingroup AVX2
3055*/
3056inline
3057unsigned avx2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
3058{
3059 return bm::avx2_gap_bfind<true>(buf, pos, 0);
3060}
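
// Editorial usage sketch (not part of bmavx2.h): for a GAP block 'buf' (word 0
// carries the start-value flag and the length), avx2_gap_bfind() returns the
// lower-bound GAP element index and reports the bit value at 'pos', while
// avx2_gap_test() returns only the value.
inline
unsigned gap_query_sketch(const unsigned short* buf, unsigned pos)
{
    unsigned is_set;
    unsigned gap_idx = bm::avx2_gap_bfind(buf, pos, &is_set); // lower-bound GAP index
    (void) gap_idx;
    return is_set;      // equals bm::avx2_gap_test(buf, pos)
}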
3061
3062/**
3063 lower bound (greater or equal) linear scan in ascending order sorted array
3064 @ingroup AVX2
3065 \internal
3066*/
3067inline
3068unsigned avx2_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
3069 unsigned target,
3070 unsigned from,
3071 unsigned to)
3072{
3073 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
3074 // see more at:
3075 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
3076
3077 const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
3078
3079 unsigned unroll_factor = 8;
3080 unsigned len = to - from + 1;
3081 unsigned len_unr = len - (len % unroll_factor);
3082
3083 __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
3084 __m256i vect_target = _mm256_set1_epi32(target);
3085 __m256i norm_target = _mm256_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
3086
3087 int mask;
3088 __m256i vect80, norm_vect80, cmp_mask_ge;
3089
3090 unsigned k = 0;
3091 for (; k < len_unr; k += unroll_factor)
3092 {
3093 vect80 = _mm256_loadu_si256((__m256i*)(&arr_base[k])); // 8 u32s
3094 norm_vect80 = _mm256_sub_epi32(vect80, mask0x8); // (signed) vect4 - 0x80000000
3095
3096 cmp_mask_ge = _mm256_or_si256( // GT | EQ
3097 _mm256_cmpgt_epi32(norm_vect80, norm_target),
3098 _mm256_cmpeq_epi32(vect80, vect_target)
3099 );
3100 mask = _mm256_movemask_epi8(cmp_mask_ge);
3101 if (mask)
3102 {
3103 int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
3104 return from + k + (bsf / 4);
3105 }
3106 } // for
3107
3108 for (; k < len; ++k)
3109 {
3110 if (arr_base[k] >= target)
3111 return from + k;
3112 }
3113 return to + 1;
3114}
3115
3116
3117/*!
3118 AVX2 bit block gather-scatter
3119
3120 @param arr - destination array to set bits
3121 @param blk - source bit-block
3122 @param idx - gather index array
3123 @param size - gather array size
3124 @param start - gather start index
3125 @param bit_idx - bit to set in the target array
3126
3127 \internal
3128
3129 C algorithm:
3130
3131 for (unsigned k = start; k < size; ++k)
3132 {
3133 nbit = unsigned(idx[k] & bm::set_block_mask);
3134 nword = unsigned(nbit >> bm::set_word_shift);
3135 mask0 = 1u << (nbit & bm::set_word_mask);
3136 arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
3137 }
3138
3139*/
3140inline
3141void avx2_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
3142                                   const unsigned* BMRESTRICT blk,
3143 const unsigned* BMRESTRICT idx,
3144 unsigned size,
3145 unsigned start,
3146 unsigned bit_idx)
3147{
3148 const unsigned unroll_factor = 8;
3149 const unsigned len = (size - start);
3150 const unsigned len_unr = len - (len % unroll_factor);
3151
3152 __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
3153 __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
3154 __m256i maskFF = _mm256_set1_epi32(~0u);
3155
3156 __m256i mask_tmp, mask_0;
3157
3158 unsigned BM_ALIGN32 mword_v[8] BM_ALIGN32ATTR;
3159
3160 unsigned k = 0, mask, w_idx;
3161 for (; k < len_unr; k+=unroll_factor)
3162 {
3163 __m256i nbitA, nwordA;
3164 const unsigned base = start + k;
3165 __m256i* idx_ptr = (__m256i*)(idx+base); // idx[base]
3166
3167 nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
3168 nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
3169
3170 // shuffle + permute to prepare comparison vector
3171 mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
3172 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
3173 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
3174 _mm256_store_si256((__m256i*)mword_v, nwordA);
3175
3176 if (mask == ~0u) // all idxs belong to the same word: avoid (costly) gather
3177 {
3178 w_idx = mword_v[0];
3179 mask_tmp = _mm256_set1_epi32(blk[w_idx]); // use broadcast
3180 }
3181 else // gather for: blk[nword] (.. & mask0 )
3182 {
3183 mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
3184 blk[mword_v[5]], blk[mword_v[4]],
3185 blk[mword_v[3]], blk[mword_v[2]],
3186 blk[mword_v[1]], blk[mword_v[0]]);
3187 }
3188
3189 // mask0 = 1u << (nbit & bm::set_word_mask);
3190 //
3191 __m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
3192 __m256i mask1 = _mm256_srli_epi32 (maskFF, 31);
3193 mask_0 = _mm256_sllv_epi32(mask1, shiftA);
3194
3195 mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
3196 if (!_mm256_testz_si256(mask_tmp, mask_tmp)) // AND tests empty
3197 {
3198 __m256i* target_ptr = (__m256i*)(arr+base); // arr[base]
3199 // bool(blk[nword] ... )
3200 __m256i maskZ = _mm256_xor_si256(maskFF, maskFF); // all zero
3201 mask1 = _mm256_slli_epi32(mask1, bit_idx); // << bit_idx
3202 mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF if==0
3203 mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF); // invert
3204 mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
3205 _mm256_storeu_si256 (target_ptr, // arr[base] |= MASK_EXPR
3206 _mm256_or_si256 (mask_tmp,
3207 _mm256_loadu_si256(target_ptr)));
3208 }
3209
3210 } // for
3211
3212 for (; k < len; ++k)
3213 {
3214 const unsigned base = start + k;
3215 unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
3216 arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
3217 }
3218
3219}
3220
3221/**
3222 Convert bit block to GAP block
3223 @ingroup AVX2
3224 \internal
3225*/
3226inline
3227unsigned avx2_bit_to_gap(gap_word_t* BMRESTRICT dest,
3228                         const unsigned* BMRESTRICT block,
3229 unsigned dest_len)
3230{
3231 const unsigned* BMRESTRICT block_end = block + bm::set_block_size;
3232 gap_word_t* BMRESTRICT pcurr = dest;
3233 gap_word_t* BMRESTRICT end = dest + dest_len; (void)end;
3234
3235 unsigned bitval = (*block) & 1u;
3236 *pcurr++ = bm::gap_word_t(bitval);
3237 *pcurr = 0;
3238 unsigned bit_idx = 0;
3239
3240 const unsigned vCAP = 64; // 64-bit system
3241 __m256i maskZ = _mm256_set1_epi32(0);
3242
3243 for (; block < block_end; block += 8)
3244 {
3245 unsigned k = 0;
3246 if (!bitval)
3247 {
3248 // check number of trailing 64-bit words using AVX compare
3249 __m256i accA = _mm256_load_si256((__m256i*)block); // 4x u64s
3250 __m256i cmpA = _mm256_cmpeq_epi8(accA, maskZ);
3251 unsigned mask = ~_mm256_movemask_epi8(cmpA);
3252 if (!mask)
3253 {
3254 bit_idx += 256;
3255 continue;
3256 }
3257 unsigned w64_idx = _tzcnt_u32(mask);
3258 k = w64_idx / 8; // 8 byte word offset
3259 bit_idx += k * vCAP;
3260 }
3261
3262 for (; k < 4; ++k)
3263 {
3264 bm::id64_t val = (((bm::id64_t*)block)[k]);
3265
3266 if (!val || val == ~0ull)
3267 {
3268 // branchless if
3269 bool cmp = (bool(bitval) != bool(val));
3270 unsigned mask = ~(cmp - 1u);
3271 *pcurr = mask & (gap_word_t)(bit_idx-cmp);
3272 bitval ^= unsigned(cmp);
3273 unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3274 pcu += mask & sizeof(gap_word_t);
3275 pcurr = reinterpret_cast<gap_word_t*>(pcu);
3276 bit_idx += vCAP;
3277 continue;
3278 } // if
3279
3280
3281 // process "0100011" word
3282 //
3283 unsigned bits_consumed = 0;
3284 do
3285 {
3286 unsigned tz = 1u;
3287 if (bitval != (val & tz))
3288 {
3289 bitval ^= tz;
3290 *pcurr++ = (gap_word_t)(bit_idx-tz);
3291
3292 BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3293 BM_ASSERT(pcurr != end);
3294 }
3295 else // match, find the next idx
3296 {
3297 tz = (unsigned)_tzcnt_u64(bitval ? ~val : val);
3298 }
3299
3300 bool cmp = ((bits_consumed+=tz) < vCAP);
3301 bit_idx += tz;
3302 val >>= tz;
3303
3304 if (!val)
3305 {
3306 tz = ~(cmp - 1u); // generate 0xFFFF or 0x0000 mask
3307 *pcurr = tz & (gap_word_t)(bit_idx-cmp);
3308 bitval ^= unsigned(cmp);
3309 bit_idx += tz & (vCAP - bits_consumed);
3310 unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3311 pcu += tz & sizeof(gap_word_t);
3312 pcurr = reinterpret_cast<gap_word_t*>(pcu);
3313
3314 BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3315 BM_ASSERT(pcurr != end);
3316 break;
3317 }
3318 } while (1);
3319 } // for k
3320
3321 } // for block < end
3322
3323 *pcurr = (gap_word_t)(bit_idx-1);
3324 unsigned len = (unsigned)(pcurr - dest);
3325 *dest = (gap_word_t)((*dest & 7) + (len << 3));
3326 return len;
3327}
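
// Editorial usage sketch (not part of bmavx2.h): converting a 64K bit-block into
// a GAP (run-length) vector. The destination capacity below is an illustrative
// assumption; as the assertions above imply, the caller must know the block
// converts within 'dest_len' GAP words.
inline
unsigned bit_to_gap_usage_sketch(const unsigned* BMRESTRICT bit_block)
{
    const unsigned dest_capacity = 1280;        // assumed capacity, illustration only
    gap_word_t gap_buf[dest_capacity];
    return bm::avx2_bit_to_gap(gap_buf, bit_block, dest_capacity);
}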
3328
3329/**
3330 Build partial XOR product of 2 bit-blocks using digest mask
3331
3332 @param target_block - target := block ^ xor_block
3333 @param block - arg1
3334 @param xor_block - arg2
3335 @param digest - mask for each block wave to XOR (1) or just copy (0)
3336
3337 @ingroup AVX2
3338 @internal
3339*/
3340inline
3341void avx2_bit_block_xor(bm::word_t* target_block,
3342                        const bm::word_t* block, const bm::word_t* xor_block,
3343 bm::id64_t digest)
3344{
3345 for (unsigned i = 0; i < bm::block_waves; ++i)
3346 {
3347 const bm::id64_t mask = (1ull << i);
3348 unsigned off = (i * bm::set_block_digest_wave_size);
3349 const __m256i* sub_block = (__m256i*) (block + off);
3350 __m256i* t_sub_block = (__m256i*)(target_block + off);
3351
3352 if (digest & mask) // XOR filtered sub-block
3353 {
3354 const __m256i* xor_sub_block = (__m256i*) (xor_block + off);
3355 __m256i mA, mB, mC, mD;
3356 mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3357 _mm256_load_si256(xor_sub_block));
3358 mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3359 _mm256_load_si256(xor_sub_block+1));
3360 mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3361 _mm256_load_si256(xor_sub_block+2));
3362 mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3363 _mm256_load_si256(xor_sub_block+3));
3364
3365 _mm256_store_si256(t_sub_block, mA);
3366 _mm256_store_si256(t_sub_block+1, mB);
3367 _mm256_store_si256(t_sub_block+2, mC);
3368 _mm256_store_si256(t_sub_block+3, mD);
3369 }
3370 else // just copy source
3371 {
3372 _mm256_store_si256(t_sub_block , _mm256_load_si256(sub_block));
3373 _mm256_store_si256(t_sub_block+1, _mm256_load_si256(sub_block+1));
3374 _mm256_store_si256(t_sub_block+2, _mm256_load_si256(sub_block+2));
3375 _mm256_store_si256(t_sub_block+3, _mm256_load_si256(sub_block+3));
3376 }
3377 } // for i
3378}
3379
3380
3381/**
3382 Build partial XOR product of 2 bit-blocks using digest mask
3383
3384 @param target_block - target ^= xor_block
3385 @param xor_block - arg1
3386 @param digest - mask for each block wave to XOR (1)
3387
3388 @ingroup AVX2
3389 @internal
3390*/
3391inline
3392void avx2_bit_block_xor_2way(bm::word_t* target_block,
3393                             const bm::word_t* xor_block,
3394 bm::id64_t digest) BMNOEXCEPT
3395{
3396 while (digest)
3397 {
3398 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
3399 unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
3400 unsigned off = wave * bm::set_block_digest_wave_size;
3401
3402 const __m256i* sub_block = (const __m256i*) (xor_block + off);
3403 __m256i* t_sub_block = (__m256i*)(target_block + off);
3404
3405 __m256i mA, mB, mC, mD;
3406 mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3407 _mm256_load_si256(t_sub_block));
3408 mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3409 _mm256_load_si256(t_sub_block+1));
3410 mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3411 _mm256_load_si256(t_sub_block+2));
3412 mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3413 _mm256_load_si256(t_sub_block+3));
3414
3415 _mm256_store_si256(t_sub_block, mA);
3416 _mm256_store_si256(t_sub_block+1, mB);
3417 _mm256_store_si256(t_sub_block+2, mC);
3418 _mm256_store_si256(t_sub_block+3, mD);
3419
3420 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
3421 } // while
3422
3423}
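
// Editorial sketch (not part of bmavx2.h) of the digest convention used above:
// bit 'i' of the 64-bit digest selects wave 'i', i.e. a stride of
// bm::set_block_digest_wave_size words; iterating set bits with BLSI/BLSR visits
// only the non-empty waves. 'wave_offsets' must have room for up to 64 entries.
inline
void for_each_digest_wave_sketch(bm::id64_t digest,
                                 unsigned* wave_offsets, unsigned* count)
{
    unsigned n = 0;
    for (bm::id64_t d = digest; d; d = bm::bmi_bslr_u64(d))     // clear lowest set bit
    {
        bm::id64_t t = bm::bmi_blsi_u64(d);                     // isolate lowest set bit
        unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);        // index of that bit
        wave_offsets[n++] = wave * bm::set_block_digest_wave_size; // word offset
    }
    *count = n;
}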
3424
3425
3426
3427#ifdef __GNUG__
3428#pragma GCC diagnostic pop
3429#endif
3430
3431
3432#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
3433 avx2_xor_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3434
3435#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
3436 avx2_andnot_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3437
3438#define VECT_BITCOUNT(first, last) \
3439 avx2_bit_count((__m256i*) (first), (__m256i*) (last))
3440
3441#define VECT_BITCOUNT_AND(first, last, mask) \
3442 avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3443
3444#define VECT_BITCOUNT_OR(first, last, mask) \
3445 avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3446
3447#define VECT_BITCOUNT_XOR(first, last, mask) \
3448 avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3449
3450#define VECT_BITCOUNT_SUB(first, last, mask) \
3451 avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3452
3453#define VECT_INVERT_BLOCK(first) \
3454 avx2_invert_block((__m256i*)first);
3455
3456#define VECT_AND_BLOCK(dst, src) \
3457 avx2_and_block((__m256i*) dst, (const __m256i*) (src))
3458
3459#define VECT_AND_DIGEST(dst, src) \
3460 avx2_and_digest((__m256i*) dst, (const __m256i*) (src))
3461
3462#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
3463 avx2_and_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3464
3465#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
3466 avx2_and_or_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3467
3468#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
3469 avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
3470
3471#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
3472 avx2_and_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3473
3474#define VECT_OR_BLOCK(dst, src) \
3475 avx2_or_block((__m256i*) dst, (__m256i*) (src))
3476
3477#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
3478 avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3479
3480#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
3481 avx2_or_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3482
3486#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
3487 avx2_or_block_5way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2), (__m256i*) (src3), (__m256i*) (src4))
3488
3489#define VECT_SUB_BLOCK(dst, src) \
3490 avx2_sub_block((__m256i*) dst, (__m256i*) (src))
3491
3492#define VECT_SUB_DIGEST(dst, src) \
3493 avx2_sub_digest((__m256i*) dst, (const __m256i*) (src))
3494
3495#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
3496 avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3497
3498#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
3499 avx2_sub_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
3500
3501#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
3502 avx2_sub_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3503
3504#define VECT_XOR_BLOCK(dst, src) \
3505 avx2_xor_block((__m256i*) dst, (__m256i*) (src))
3506
3507#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
3508 avx2_xor_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3509
3510#define VECT_COPY_BLOCK(dst, src) \
3511 avx2_copy_block((__m256i*) dst, (__m256i*) (src))
3512
3513#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
3514 avx2_copy_block_unalign((__m256i*) dst, (__m256i*) (src))
3515
3516#define VECT_STREAM_BLOCK(dst, src) \
3517 avx2_stream_block((__m256i*) dst, (__m256i*) (src))
3518
3519#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
3520 avx2_stream_block_unalign((__m256i*) dst, (__m256i*) (src))
3521
3522#define VECT_SET_BLOCK(dst, value) \
3523 avx2_set_block((__m256i*) dst, (value))
3524
3525#define VECT_IS_ZERO_BLOCK(dst) \
3526 avx2_is_all_zero((__m256i*) dst)
3527
3528#define VECT_IS_ONE_BLOCK(dst) \
3529 avx2_is_all_one((__m256i*) dst)
3530
3531#define VECT_IS_DIGEST_ZERO(start) \
3532 avx2_is_digest_zero((__m256i*)start)
3533
3534#define VECT_BLOCK_SET_DIGEST(dst, val) \
3535 avx2_block_set_digest((__m256i*)dst, val)
3536
3537#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
3538 avx2_lower_bound_scan_u32(arr, target, from, to)
3539
3540#define VECT_SHIFT_L1(b, acc, co) \
3541 avx2_shift_l1((__m256i*)b, acc, co)
3542
3543#define VECT_SHIFT_R1(b, acc, co) \
3544 avx2_shift_r1((__m256i*)b, acc, co)
3545
3546#define VECT_SHIFT_R1_AND(b, co, m, digest) \
3547 avx2_shift_r1_and((__m256i*)b, co, (__m256i*)m, digest)
3548
3549#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
3550 avx2_idx_arr_block_lookup(idx, size, nb, start)
3551
3552#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
3553 avx2_set_block_bits3(block, idx, start, stop)
3554
3555#define VECT_BLOCK_CHANGE(block, size) \
3556 avx2_bit_block_calc_change((__m256i*)block, size)
3557
3558#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
3559 avx2_bit_block_calc_xor_change((__m256i*)block, (__m256i*)xor_block, size, gc, bc)
3560
3561#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
3562 avx2_bit_block_calc_change_bc((__m256i*)block, gc, bc)
3563
3564#define VECT_BIT_TO_GAP(dest, src, dest_len) \
3565 avx2_bit_to_gap(dest, src, dest_len)
3566
3567#define VECT_BIT_FIND_FIRST(src1, off, pos) \
3568 avx2_bit_find_first((__m256i*) src1, off, pos)
3569
3570#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
3571 avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)
3572
3573#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
3574 avx2_bit_block_xor(t, src, src_xor, d)
3575
3576#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
3577 avx2_bit_block_xor_2way(t, src_xor, d)
3578
3579#define VECT_GAP_BFIND(buf, pos, is_set) \
3580 avx2_gap_bfind(buf, pos, is_set)
3581
3582#define VECT_GAP_TEST(buf, pos) \
3583 avx2_gap_test(buf, pos)
3584
3585
3586#define VECT_BIT_COUNT_DIGEST(blk, d) \
3587 avx2_bit_block_count(blk, d)
3588
3589
3590} // namespace
3591
3592
3593
3594
3595#endif
#define BM_AVX2_POPCNT_PROLOG
Definition bmavx2.h:140
#define BM_CSA256(h, l, a, b, c)
Definition bmavx2.h:117
#define BM_AVX2_BIT_COUNT(ret, v)
Definition bmavx2.h:124
Definitions(internal).
#define BMRESTRICT
Definition bmdef.h:203
#define BMNOEXCEPT
Definition bmdef.h:82
#define BM_ALIGN32
Definition bmdef.h:292
#define BMFORCEINLINE
Definition bmdef.h:213
#define BM_ASSERT
Definition bmdef.h:139
#define BM_ALIGN32ATTR
Definition bmdef.h:293
Bit manipulation primitives (internal).
unsigned avx2_xor_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
XOR block against another dst ^= *src.
Definition bmavx2.h:1108
bool avx2_sub_digest(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
SUB (AND NOT) block digest stride dst &= ~*src.
Definition bmavx2.h:1250
bool avx2_and_digest_5way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2, const __m256i *BMRESTRICT src3, const __m256i *BMRESTRICT src4)
AND block digest stride.
Definition bmavx2.h:659
bool avx2_or_block_5way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2, const __m256i *BMRESTRICT src3, const __m256i *BMRESTRICT src4)
OR array elements against another 4 arrays dst |= *src1 | src2.
Definition bmavx2.h:1039
void avx2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
Definition bmavx2.h:3392
void avx2_bit_block_calc_change_bc(const __m256i *BMRESTRICT block, unsigned *gcount, unsigned *bcount)
Definition bmavx2.h:2251
bm::id_t avx2_bit_count_sub(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
AND NOT bit count for two aligned bit-blocks.
Definition bmavx2.h:413
bool avx2_or_block_3way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
OR array elements against another 2 arrays dst |= *src1 | src2.
Definition bmavx2.h:987
bm::id_t avx2_bit_count_xor(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
XOR bit count for two aligned bit-blocks.
Definition bmavx2.h:368
bool avx2_and_digest(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND block digest stride dst &= *src.
Definition bmavx2.h:543
BMFORCEINLINE bool avx2_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
Definition bmavx2.h:1805
void avx2_copy_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy dst = *src.
Definition bmavx2.h:1503
BMFORCEINLINE bool avx2_test_all_one_wave(const void *ptr)
check if wave of pointers is all 0xFF (all bits set)
Definition bmavx2.h:1791
bool avx2_shift_r1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift right by 1
Definition bmavx2.h:1903
bool avx2_is_digest_zero(const __m256i *BMRESTRICT block)
check if digest stride is all zero bits
Definition bmavx2.h:1738
bool avx2_or_arr_unal(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end)
OR array elements against another unaligned array dst |= *src.
Definition bmavx2.h:888
bool avx2_sub_digest_3way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
SUB block digest stride.
Definition bmavx2.h:1392
void avx2_bit_block_calc_xor_change(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT xor_block, unsigned size, unsigned *BMRESTRICT gcount, unsigned *BMRESTRICT bcount)
Definition bmavx2.h:2154
bool avx2_bit_find_first(const __m256i *BMRESTRICT block, unsigned off, unsigned *pos)
Find first bit set.
Definition bmavx2.h:2394
bool avx2_and_or_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
AND-OR block digest stride 2 way dst |= *src1 & *src2.
Definition bmavx2.h:604
bool avx2_bit_find_first_diff(const __m256i *BMRESTRICT block1, const __m256i *BMRESTRICT block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
Definition bmavx2.h:2336
unsigned avx2_bit_to_gap(gap_word_t *BMRESTRICT dest, const unsigned *BMRESTRICT block, unsigned dest_len)
Convert bit block to GAP block.
Definition bmavx2.h:3227
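A hedged conversion sketch; the gap_buf capacity is illustrative and the reading of a zero return as "did not fit into dest_len" is an assumption.

    BM_ALIGN32 bm::word_t blk[bm::set_block_size] BM_ALIGN32ATTR = {0};
    blk[0] = 0xF0u;                       // one short run of set bits
    bm::gap_word_t gap_buf[128];          // illustrative capacity
    unsigned gap_len = bm::avx2_bit_to_gap(gap_buf, blk, 128);
    if (!gap_len)
    {
        // assumed meaning: block not GAP-compressible within dest_len, keep it as a bit-block
    }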
BMFORCEINLINE void avx2_set_block(__m256i *BMRESTRICT dst, bm::word_t value)
AVX2 block memset dst = value.
Definition bmavx2.h:1477
bool avx2_shift_r1_and(__m256i *BMRESTRICT block, bm::word_t co1, const __m256i *BMRESTRICT mask_block, bm::id64_t *BMRESTRICT digest)
fused block shift right by 1 plus AND
Definition bmavx2.h:1959
void avx2_copy_block_unalign(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy (unaligned SRC) dst = *src.
Definition bmavx2.h:1545
bool avx2_or_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
OR array elements against another array dst |= *src.
Definition bmavx2.h:835
bool avx2_and_digest_3way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
AND block digest stride.
Definition bmavx2.h:727
void avx2_stream_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 non-temporal (stream) block copy dst = *src.
Definition bmavx2.h:1589
unsigned avx2_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to scan.
Definition bmavx2.h:3057
bm::id_t avx2_bit_count_and(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
AND bit count for two aligned bit-blocks.
Definition bmavx2.h:290
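A short sketch of computing an intersection cardinality without materializing the AND result; the aligned buffers are illustrative.

    BM_ALIGN32 bm::word_t blk1[bm::set_block_size] BM_ALIGN32ATTR = {0};
    BM_ALIGN32 bm::word_t blk2[bm::set_block_size] BM_ALIGN32ATTR = {0};
    blk1[10] = blk2[10] = 0x3u;   // two common bits
    bm::id_t and_cnt = bm::avx2_bit_count_and(
                            (const __m256i*)blk1,
                            (const __m256i*)(blk1 + bm::set_block_size),
                            (const __m256i*)blk2);
    // and_cnt is expected to be 2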
void avx2_andnot_arr_2_mask(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end, bm::word_t mask)
Invert array elements and AND them with the specified mask dst = ~*src & mask.
Definition bmavx2.h:472
void avx2_block_set_digest(__m256i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
Definition bmavx2.h:1752
unsigned avx2_sub_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND-NOT (SUB) array elements against another array dst &= ~*src.
Definition bmavx2.h:1204
unsigned avx2_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set)
Hybrid binary search, starts as binary, then switches to scan.
Definition bmavx2.h:2939
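A hedged sketch; get_gap_block() is a hypothetical accessor for an existing GAP block, and the interpretation of is_set and of the return value is an assumption.

    const bm::gap_word_t* gap_blk = get_gap_block();  // hypothetical accessor, not part of the library
    unsigned is_set;
    unsigned gap_idx = bm::avx2_gap_bfind(gap_blk, 100 /* pos */, &is_set);
    // assumed: is_set != 0 means bit 100 is ON; gap_idx indexes the run that covers position 100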
bool avx2_is_all_zero(const __m256i *BMRESTRICT block)
check if block is all zero bits
Definition bmavx2.h:1708
BMFORCEINLINE bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
Definition bmavx2.h:1816
bool avx2_and_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
AND block digest stride 2 way dst = *src1 & *src2.
Definition bmavx2.h:573
bool avx2_sub_digest_5way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2, const __m256i *BMRESTRICT src3, const __m256i *BMRESTRICT src4)
SUB block digest stride.
Definition bmavx2.h:1310
bm::id_t avx2_bit_block_count(const bm::word_t *const block, bm::id64_t digest)
Calculate population count based on digest.
Definition bmavx2.h:232
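A hedged sketch of a digest-driven popcount; the all-ones digest and the assumption that one digest bit flags one of bm::block_waves waves are illustrative.

    BM_ALIGN32 bm::word_t blk[bm::set_block_size] BM_ALIGN32ATTR = {0};
    bm::id64_t digest = ~0ull;   // all waves flagged as present (illustrative)
    bm::id_t cnt = bm::avx2_bit_block_count(blk, digest);   // 0 for an all-zero block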
void avx2_stream_block_unalign(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 non-temporal (stream) block copy (unaligned SRC) dst = *src.
Definition bmavx2.h:1631
BMFORCEINLINE bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all the same (NULL or FULL)
Definition bmavx2.h:1829
unsigned avx2_lower_bound_scan_u32(const unsigned *BMRESTRICT arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
Definition bmavx2.h:3068
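A small sketch, assuming from/to form an inclusive index range.

    unsigned arr[] = { 1, 5, 9, 17, 22, 31, 40, 59 };
    unsigned cnt = (unsigned)(sizeof(arr) / sizeof(arr[0]));
    unsigned idx = bm::avx2_lower_bound_scan_u32(arr, 18 /* target */, 0, cnt - 1);
    // idx is expected to point at 22, the first element >= 18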
void avx2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
Definition bmavx2.h:3341
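A hedged sketch; the digest value and the buffers are illustrative, assuming one digest bit flags one block wave and only flagged waves are written to the target.

    BM_ALIGN32 bm::word_t t_blk[bm::set_block_size] BM_ALIGN32ATTR = {0};
    BM_ALIGN32 bm::word_t blk[bm::set_block_size]   BM_ALIGN32ATTR = {0};
    BM_ALIGN32 bm::word_t x_blk[bm::set_block_size] BM_ALIGN32ATTR = {0};
    bm::id64_t digest = 0x5ull;   // illustrative: only waves 0 and 2 carry differences
    bm::avx2_bit_block_xor(t_blk, blk, x_blk, digest);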
bool avx2_is_all_one(const __m256i *BMRESTRICT block)
check if block is all one bits
Definition bmavx2.h:1767
unsigned avx2_bit_block_calc_change(const __m256i *BMRESTRICT block, unsigned size)
Definition bmavx2.h:2083
unsigned avx2_and_arr_unal(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end)
AND array elements against another array (unaligned) dst &= *src.
Definition bmavx2.h:777
void avx2_invert_block(__m256i *BMRESTRICT dst)
Invert bit-block dst = ~*dst (i.e. dst ^= ~0).
Definition bmavx2.h:1677
void avx2_xor_arr_2_mask(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end, bm::word_t mask)
XOR array elements against the specified mask dst = *src ^ mask.
Definition bmavx2.h:447
unsigned avx2_xor_block_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
3-operand XOR dst = *src1 ^ *src2
Definition bmavx2.h:1154
bool avx2_sub_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
Definition bmavx2.h:1280
unsigned avx2_and_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND array elements against another array dst &= *src.
Definition bmavx2.h:496
bool avx2_shift_l1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift left by 1
Definition bmavx2.h:1842
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
Experimental (test) function to do SIMD vector search in a sorted, growing array.
Definition bmavx2.h:2907
int avx2_cmpge_u32(__m256i vect8, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in a sorted, growing array.
Definition bmavx2.h:2875
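A hedged sketch; the interpretation of the int return as a lane index (negative when no element qualifies) is an assumption.

    __m256i v = _mm256_setr_epi32(2, 4, 8, 16, 32, 64, 128, 256);
    int lane = bm::avx2_cmpge_u32(v, 20);
    // assumed: 'lane' identifies the first element >= 20 (here 32), negative when there is none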
bm::id_t avx2_bit_count(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end)
AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts Using AVX2 Ins...
Definition bmavx2.h:156
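A minimal usage sketch (not from the library docs): counting bits over a full, 32-byte aligned bit-block of bm::set_block_size words; the aligned buffer declaration via BM_ALIGN32/BM_ALIGN32ATTR is illustrative.

    BM_ALIGN32 bm::word_t blk[bm::set_block_size] BM_ALIGN32ATTR = {0};
    blk[0] = 0xFFu;   // 8 bits set
    bm::id_t cnt = bm::avx2_bit_count((const __m256i*)blk,
                                      (const __m256i*)(blk + bm::set_block_size));
    // cnt is expected to be 8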
bool avx2_or_block_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
OR 2 arrays and copy to the destination dst = *src1 | *src2.
Definition bmavx2.h:941
Definition bm.h:78
const unsigned set_block_digest_wave_size
Definition bmconst.h:67
void avx2_set_block_bits3(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
Definition bmavx2.h:2748
unsigned int word_t
Definition bmconst.h:39
unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
Definition bmavx2.h:2492
__m256i avx2_setbit_to256(unsigned i)
Experimental.
Definition bmavx2.h:2841
const unsigned set_block_mask
Definition bmconst.h:57
const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)
Definition bmavx2.h:2453
bm::id_t avx2_bit_count_or(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
Definition bmavx2.h:337
void avx2_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Definition bmavx2.h:2533
const unsigned set_word_shift
Definition bmconst.h:72
const unsigned set_block_size
Definition bmconst.h:55
unsigned long long int id64_t
Definition bmconst.h:35
const unsigned block_waves
Definition bmconst.h:66
unsigned int id_t
Definition bmconst.h:38
BMFORCEINLINE __m256i avx2_setbit_256(__m256i target, __m256i source)
Set bits in an AVX target, by indexes (int4) from the source.
Definition bmavx2.h:2662
BMFORCEINLINE unsigned long long bmi_bslr_u64(unsigned long long w) BMNOEXCEPT
Definition bmutil.h:335
void avx2_set_block_bits2(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
Definition bmavx2.h:2709
unsigned short gap_word_t
Definition bmconst.h:78
const unsigned gap_max_bits
Definition bmconst.h:81
const unsigned set_block_shift
Definition bmconst.h:56
void avx2_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx)
Definition bmavx2.h:3141
const unsigned set_word_mask
Definition bmconst.h:73
BMFORCEINLINE unsigned long long bmi_blsi_u64(unsigned long long w)
Definition bmutil.h:345