#ifndef BMSSE4__H__INCLUDED__
#define BMSSE4__H__INCLUDED__

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

/* MSVC branch: */
#pragma warning( push )
#pragma warning( disable : 4146)

# define _mm_popcnt_u32 __builtin_popcount
# define _mm_popcnt_u64 __builtin_popcountll
# define BM_BSF32 __builtin_ctz
/* non-builtin fallback (alternate #if branch in the full header): */
# define BM_BSF32 bm::bsf_asm32
/* sse4_bit_count(): 64-bit POPCNT path */
    // ...
        count += unsigned( _mm_popcnt_u64(b[0]) +
                           _mm_popcnt_u64(b[1]) +
                           _mm_popcnt_u64(b[2]) +
                           _mm_popcnt_u64(b[3]));
/* ... 32-bit POPCNT path: */
    do
    {
        const unsigned* b = (unsigned*) block;
        count += _mm_popcnt_u32(b[0]) +
                 _mm_popcnt_u32(b[1]) +
                 _mm_popcnt_u32(b[2]) +
                 _mm_popcnt_u32(b[3]);
    } while (++block < block_end);
/* sse42_bit_count_digest() fragment: per-wave popcount */
    const unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
    // ...
        unsigned( _mm_popcnt_u64(src_u->w64[j]) +
                  _mm_popcnt_u64(src_u->w64[j+1]) +
                  _mm_popcnt_u64(src_u->w64[j+2]) +
                  _mm_popcnt_u64(src_u->w64[j+3]));
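
/*
    Digest note: a digest is a 64-bit mask with one bit per 1024-bit wave
    of a bit-block. With the lowest digest bit isolated as t (t = d & -d,
    see bmi_blsi_u64 below), _mm_popcnt_u64(t - 1) is the wave index:
    for t == 0x10, popcnt(0xF) == 4, i.e. wave 4, which starts at word
    4 * bm::set_block_digest_wave_size.
*/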
/* op_xor() */
    unsigned ret = (a ^ b);
    return ret;
/* sse4_bit_count_op(): 64-bit accumulation path */
    BM_ALIGN16 bm::id64_t tcnt[2] BM_ALIGN16ATTR;
    do
    {
        __m128i b = sse2_func(_mm_load_si128(block), _mm_load_si128(mask_block));
        _mm_store_si128((__m128i*)tcnt, b);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));

        b = sse2_func(_mm_load_si128(block+1), _mm_load_si128(mask_block+1));
        _mm_store_si128((__m128i*)tcnt, b);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));

        block += 2; mask_block += 2;
    } while (block < block_end);
/* ... 32-bit extraction path: */
    do
    {
        __m128i tmp0 = _mm_load_si128(block);
        __m128i tmp1 = _mm_load_si128(mask_block);
        __m128i b = sse2_func(tmp0, tmp1);

        count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));

        ++block; ++mask_block;
    } while (block < block_end);
/* sse4_is_all_zero() */
    __m128i w;
    __m128i maskz = _mm_setzero_si128();
    do
    {
        w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz)))
            return false;
        w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz)))
            return false;
        block += 4;
    } while (block < block_end);
    return true;
/* sse4_is_digest_zero() */
    __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
    __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
    wA = _mm_or_si128(wA, wB);
    bool z1 = _mm_test_all_zeros(wA, wA);

    wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
    wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
    wA = _mm_or_si128(wA, wB);
    bool z2 = _mm_test_all_zeros(wA, wA);

    return z1 & z2;
/* sse4_block_set_digest() */
    __m128i mV = _mm_set1_epi32(int(value));
    _mm_store_si128(dst, mV);     _mm_store_si128(dst + 1, mV);
    _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
    _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
    _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
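
/*
    Usage sketch: 'value' is broadcast to every word of the 8x128-bit stride,
    so the two meaningful calls are:

    bm::sse4_block_set_digest(dst, 0u);   // clear the stride
    bm::sse4_block_set_digest(dst, ~0u);  // set the stride to all ones
*/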
/* sse4_and_block(): dst &= *src, returns non-zero if any bits remain */
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    return !_mm_testz_si128(accA, accA);
/* sse4_and_digest(): dst &= *src; returns true if the stride became all zero */
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
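
/*
    Caller sketch (the digest/wave bookkeeping here is hypothetical, not part
    of this function): the return value is true when the updated stride is
    all zero, which lets the caller drop the corresponding digest bit:

    bool all_z = bm::sse4_and_digest(dst_stride, src_stride);
    if (all_z)
        digest &= ~(1ull << wave); // caller-maintained state
*/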
/* sse4_and_digest_2way(): dst = *src1 & *src2 */
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
/* sse4_and_or_digest_2way(): dst |= *src1 & *src2 */
    __m128i m1A, m1B, m1C, m1D;
    __m128i mACC1;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
    bool z1 = _mm_testz_si128(mACC1, mACC1);

    m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
    m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
    m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
    m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
    bool z2 = _mm_testz_si128(mACC1, mACC1);

    m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
    m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
    m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
    m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    return z1 & z2;
/* sse4_and_digest_3way(): dst &= *src1 & *src2 */
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
/* sse4_and_digest_5way(): dst &= *src1 & *src2 & *src3 & *src4 */
    __m128i m1A, m1B, m1C, m1D;
    __m128i m1E, m1F, m1G, m1H;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
    m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
    m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
    m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
    m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
    m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
    m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
/* sse4_sub_digest(): dst &= ~*src */
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
/* sse4_sub_digest_2way(): dst = *src1 & ~*src2 */
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
/* sse4_sub_digest_5way(): dst &= ~(*src1 | *src2 | *src3 | *src4) */
    __m128i m1A, m1B, m1C, m1D;
    __m128i m1E, m1F, m1G, m1H;
    __m128i maskFF = _mm_set1_epi32(~0u);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));

    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));

    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
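
/*
    The maskFF XORs above implement bitwise NOT, so by De Morgan's law
    (~a & ~b) == ~(a | b): the net effect is
    dst &= ~(*src1 | *src2 | *src3 | *src4), computed in a single pass.
*/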
/* sse4_sub_digest_3way(): dst &= ~(*src1 | *src2) */
    __m128i m1A, m1B, m1C, m1D;
    __m128i maskFF = _mm_set1_epi32(~0u);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
/* sse4_is_all_one() */
    __m128i w;
    do
    {
        w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        if (!_mm_test_all_ones(w))
            return false;
        w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        if (!_mm_test_all_ones(w))
            return false;
        block += 4;
    } while (block < block_end);
    return true;
/* sse42_test_all_one_wave() */
    return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));
/* sse42_test_all_zero_wave() */
    __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
    return _mm_testz_si128(w0, w0);
/* sse42_test_all_zero_wave2() */
    __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
    __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
    w0 = _mm_or_si128(w0, w1);
    return _mm_testz_si128(w0, w0);
/* sse42_test_all_eq_wave2() */
    __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
    __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
    w0 = _mm_xor_si128(w0, w1);
    return _mm_testz_si128(w0, w0);
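
/*
    Usage sketch: ptr0/ptr1 each address one 128-bit wave of block pointers
    (two 64-bit pointers per wave); the XOR is zero only when both waves are
    bit-identical, e.g. both NULL or both pointing at the same FULL blocks:

    bool eq = bm::sse42_test_all_eq_wave2(wave0, wave1); // wave0/wave1 assumed
*/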
/* sse42_bit_block_calc_change(): count bit transitions (GAP changes) */
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + size);
    __m128i m1COshft, m2COshft;

    unsigned w0 = *((bm::word_t*)(block));
    unsigned count = 1;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1); // block[i] << 1
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4); // carry lanes move up one int32
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co1 = co2;
        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft); // inject the carry bits
        m2As = _mm_or_si128(m2As, m2COshft);

        co1 = co2;

        m1A = _mm_xor_si128(m1A, m1As); // w ^ (w << 1): transition bits
        m2A = _mm_xor_si128(m2A, m2As);

        /* 64-bit POPCNT path: */
        BM_ALIGN16 bm::id64_t tcnt[2] BM_ALIGN16ATTR;
        _mm_store_si128((__m128i*)tcnt, m1A);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
        _mm_store_si128((__m128i*)tcnt, m2A);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));

        /* ... 32-bit extraction path: */
        bm::id_t m0 = _mm_extract_epi32(m1A, 0);
        bm::id_t m1 = _mm_extract_epi32(m1A, 1);
        bm::id_t m2 = _mm_extract_epi32(m1A, 2);
        bm::id_t m3 = _mm_extract_epi32(m1A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

        m0 = _mm_extract_epi32(m2A, 0);
        m1 = _mm_extract_epi32(m2A, 1);
        m2 = _mm_extract_epi32(m2A, 2);
        m3 = _mm_extract_epi32(m2A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
    }
    count -= (w0 & 1u); // correct the seeded carry at bit 0
    return count;
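
/*
    Scalar model of the kernel above (an illustration, not the shipped
    portable fallback): shift the block left by one bit with carry between
    words, XOR with the original, and popcount the transitions:

    unsigned count = 1, co = 0;
    for (unsigned i = 0; i < bm::set_block_size; ++i)
    {
        bm::word_t w = wblock[i];           // wblock: the block as words
        bm::word_t ws = (w << 1) | co;      // carry-in from the previous word
        co = w >> 31;                       // carry-out
        count += unsigned(_mm_popcnt_u32(w ^ ws)); // 0-1 / 1-0 transitions
    }
    count -= (wblock[0] & 1u); // undo the transition faked by the zero seed
*/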
/* sse42_bit_block_calc_xor_change(): gap/bit stats of (block ^ xor_block) */
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + size);
    __m128i m1COshft, m2COshft;

    BM_ALIGN16 bm::id64_t simd_buf0[2] BM_ALIGN16ATTR;
    BM_ALIGN16 bm::id64_t simd_buf1[2] BM_ALIGN16ATTR;

    unsigned w0 = *((bm::word_t*)(block));
    unsigned gap_count = 1;
    unsigned bit_count = 0;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2, xor_block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);
        __m128i m1B = _mm_load_si128(xor_block);
        __m128i m2B = _mm_load_si128(xor_block+1);

        m1A = _mm_xor_si128(m1A, m1B); // the XOR product itself
        m2A = _mm_xor_si128(m2A, m2B);

        /* bit count, 64-bit path: */
        _mm_store_si128 ((__m128i*)simd_buf0, m1A);
        _mm_store_si128 ((__m128i*)simd_buf1, m2A);
        bit_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
        bit_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));

        /* ... bit count, 32-bit path: */
        {
            bm::id_t m0 = _mm_extract_epi32(m1A, 0);
            bm::id_t m1 = _mm_extract_epi32(m1A, 1);
            bm::id_t m2 = _mm_extract_epi32(m1A, 2);
            bm::id_t m3 = _mm_extract_epi32(m1A, 3);
            bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                                  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

            m0 = _mm_extract_epi32(m2A, 0);
            m1 = _mm_extract_epi32(m2A, 1);
            m2 = _mm_extract_epi32(m2A, 2);
            m3 = _mm_extract_epi32(m2A, 3);
            bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                                  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
        }

        /* carry chain for the GAP change count: */
        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1);
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co1 = co2;
        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft);
        m2As = _mm_or_si128(m2As, m2COshft);

        co1 = co2;

        m1A = _mm_xor_si128(m1A, m1As);
        m2A = _mm_xor_si128(m2A, m2As);

        /* gap count, 64-bit path: */
        _mm_store_si128 ((__m128i*)simd_buf0, m1A);
        _mm_store_si128 ((__m128i*)simd_buf1, m2A);
        gap_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
        gap_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));

        /* ... gap count, 32-bit path: */
        {
            bm::id_t m0 = _mm_extract_epi32(m1A, 0);
            bm::id_t m1 = _mm_extract_epi32(m1A, 1);
            bm::id_t m2 = _mm_extract_epi32(m1A, 2);
            bm::id_t m3 = _mm_extract_epi32(m1A, 3);
            gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                                  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

            m0 = _mm_extract_epi32(m2A, 0);
            m1 = _mm_extract_epi32(m2A, 1);
            m2 = _mm_extract_epi32(m2A, 2);
            m3 = _mm_extract_epi32(m2A, 3);
            gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                                  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
        }
    }
    gap_count -= (w0 & 1u); // correct the seeded carry at bit 0
    *gc = gap_count; *bc = bit_count;
/* sse42_bit_block_calc_change_bc(): gap and bit counts in one pass */
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i m1COshft, m2COshft;

    unsigned w0 = *((bm::word_t*)(block));
    unsigned bit_count = 0;
    unsigned gap_count = 1;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
        bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
        bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
        m0 = _mm_extract_epi64(m2A, 0);
        m1 = _mm_extract_epi64(m2A, 1);
        bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1);
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co1 = co2;
        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft);
        m2As = _mm_or_si128(m2As, m2COshft);

        co1 = co2;

        m1A = _mm_xor_si128(m1A, m1As);
        m2A = _mm_xor_si128(m2A, m2As);

        m0 = _mm_extract_epi64(m1A, 0);
        m1 = _mm_extract_epi64(m1A, 1);
        gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
        m0 = _mm_extract_epi64(m2A, 0);
        m1 = _mm_extract_epi64(m2A, 1);
        gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
    }
    gap_count -= (w0 & 1u); // correct the seeded carry at bit 0
    *gc = gap_count; *bc = bit_count;
/* sse42_bit_find_first_diff(): first bit that differs between two blocks */
    BM_ALIGN16 unsigned simd_buf[4] BM_ALIGN16ATTR;

    const __m128i* block1_end =
        (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
    const __m128i maskZ = _mm_setzero_si128();
    __m128i mA, mB;
    unsigned simd_lane = 0;
    do
    {
        mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
        mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
        __m128i mOR = _mm_or_si128(mA, mB);
        if (!_mm_test_all_zeros(mOR, mOR)) // mismatch in these 2x128 bits
        {
            if (!_mm_test_all_zeros(mA, mA))
            {
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                mask = ~mask; // invert to find the non-zero 32-bit lane
                unsigned bsf = BM_BSF32(mask);
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2; // (bsf / 4)
                unsigned w = simd_buf[widx];
                bsf = BM_BSF32(w); // find first bit != 0
                *pos = (simd_lane * 128) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
            mask = ~mask;
            unsigned bsf = BM_BSF32(mask);
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = BM_BSF32(w);
            *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
            return true;
        }
        simd_lane += 2;
        block1 += 2; block2 += 2;
    } while (block1 < block1_end);
    return false;
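
/*
    Usage sketch:

    unsigned pos;
    if (bm::sse42_bit_find_first_diff(blk_a, blk_b, &pos))
        ...; // pos is the absolute bit index of the first mismatch
*/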
/* sse42_bit_find_first(): first set bit, starting from word offset 'off' */
    BM_ALIGN16 unsigned simd_buf[4] BM_ALIGN16ATTR;

    block = (const __m128i*)((const bm::word_t*)(block) + off);
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
    const __m128i maskZ = _mm_setzero_si128();
    __m128i mA, mB;
    unsigned simd_lane = 0;
    do
    {
        mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
        __m128i mOR = _mm_or_si128(mA, mB);
        if (!_mm_test_all_zeros(mOR, mOR))
        {
            if (!_mm_test_all_zeros(mA, mA))
            {
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                mask = ~mask;
                unsigned bsf = BM_BSF32(mask);
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = BM_BSF32(w);
                *pos = (off * 32) + (simd_lane * 128) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
            mask = ~mask;
            unsigned bsf = BM_BSF32(mask);
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = BM_BSF32(w);
            *pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf;
            return true;
        }
        simd_lane += 2;
        block += 2;
    } while (block < block_end);
    return false;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
/* sse4_gap_find() */
    const unsigned unroll_factor = 8;

    __m128i m1, mz, maskF, maskFL;

    mz = _mm_setzero_si128();
    m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 gap_word_t elements

    maskF = _mm_cmpeq_epi64(mz, mz); // all-ones mask
    maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift: mask the upper half
    int shiftL = (64 - (unroll_factor - size) * 16);
    maskFL = _mm_slli_epi64(maskFL, shiftL); // restrict to the tail past 'size'

    m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~maskFL) & m1
    m1 = _mm_or_si128(m1, maskFL);     // pad the tail with 0xFFFF sentinels

    __m128i mp = _mm_set1_epi16(pos);  // broadcast pos to all elements
    __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1[i] >= pos
    __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // keep only the flag bits
    int mi = _mm_movemask_epi8(c_mask); // collect the flags
    if (unsigned bc = _mm_popcnt_u32(mi)) // number of elements >= pos
        return unroll_factor - bc;        // index of the first such element
    // ...
    m1 = _mm_loadu_si128((__m128i*)(pbuf2));
    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
    mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
    unsigned bc = _mm_popcnt_u32(mi);
    // ...
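
/*
    Unsigned-compare trick used above: _mm_subs_epu16(mp, m1) is a saturating
    subtract, so a lane is zero exactly when m1[i] >= pos; comparing the
    result against zero with _mm_cmpeq_epi16 therefore yields a
    "greater-or-equal" mask even though SSE has no native unsigned 16-bit
    compare.
*/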
/* sse42_gap_bfind(): hybrid binary search over a GAP buffer */
    unsigned start = 1;
    unsigned end = ((*buf) >> 3);
    unsigned size = end - start;

    for (; size >= 64; size = end - start)
    {
        unsigned mid = (start + end) >> 1;
        if (buf[mid] < pos) start = mid + 1; else end = mid;
        if (buf[mid = (start + end) >> 1] < pos) start = mid + 1; else end = mid;
        if (buf[mid = (start + end) >> 1] < pos) start = mid + 1; else end = mid;
        if (buf[mid = (start + end) >> 1] < pos) start = mid + 1; else end = mid;
    }
    for (; size >= 16; size = end - start)
    {
        if (unsigned mid = (start + end) >> 1; buf[mid] < pos) start = mid + 1; else end = mid;
        if (unsigned mid = (start + end) >> 1; buf[mid] < pos) start = mid + 1; else end = mid;
    }
    // ... (switch to linear tail scan)
    const unsigned short* BMRESTRICT pbuf = buf + start;
    if (pbuf[0] >= pos) { }
    else if (pbuf[1] >= pos) { start++; }
    // ...
    *is_set = ((*buf) & 1) ^ ((start - 1) & 1);
    return start;
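
/*
    Usage sketch:

    unsigned is_set;
    unsigned idx = bm::sse42_gap_bfind(gap_buf, pos, &is_set);
    // idx: index of the GAP element whose run covers 'pos'
    // is_set: value (0 or 1) of that run
*/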
/* sse42_gap_test(): same search, returns the bit value directly */
    unsigned start = 1;
    unsigned end = ((*buf) >> 3);
    unsigned size = end - start;

    for (; size >= 64; size = end - start)
    {
        unsigned mid = (start + end) >> 1;
        if (buf[mid] < pos) start = mid + 1; else end = mid;
        if (buf[mid = (start + end) >> 1] < pos) start = mid + 1; else end = mid;
        if (buf[mid = (start + end) >> 1] < pos) start = mid + 1; else end = mid;
        if (buf[mid = (start + end) >> 1] < pos) start = mid + 1; else end = mid;
    }
    for (; size >= 16; size = end - start)
    {
        if (unsigned mid = (start + end) >> 1; buf[mid] < pos) start = mid + 1; else end = mid;
    }
    // ... (linear tail scan)
    const unsigned short* BMRESTRICT pbuf = buf + start;
    if (pbuf[0] >= pos) { }
    else if (pbuf[1] >= pos) { start++; }
    // ...
    BM_ASSERT(buf[start - 1] < pos || (start == 1));
    return ((*buf) & 1) ^ ((--start) & 1);
/* sse42_cmpge_u32(): unsigned >= compare over 4 lanes */
    __m128i mask0x8 = _mm_set1_epi32(0x80000000);
    __m128i mm_val = _mm_set1_epi32(value);

    __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // bias to signed range
    __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8);

    __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
    __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);

    __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
    int mask = _mm_movemask_epi8(cmp_mask_ge);
    // ... (BSF of mask -> lane index)
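
/*
    Worked example (assuming the elided tail returns the first lane index
    with vect4[i] >= value, or a negative value when none qualifies):

    __m128i v = _mm_set_epi32(40, 30, 20, 10); // lanes 0..3 = 10, 20, 30, 40
    int i = bm::sse42_cmpge_u32(v, 25);        // -> 2 (the lane holding 30)
*/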
/* sse42_idx_arr_block_lookup() */
    const unsigned unroll_factor = 8;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);
    idx += start;
    __m128i nbM = _mm_set1_epi32(nb);
    unsigned k;
    for (k = 0; k < len_unr; k += unroll_factor)
    {
        __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
        __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
        __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // block no. per lane
        __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
        if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
            !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
            break;
    }
    for (; k < len; ++k)
        if ((idx[k] >> bm::set_block_shift) != nb)
            break;
    return start + k;
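
/*
    Purpose note: the sorted index array idx[] is scanned 8 elements at a
    time; the loop stops at the first element whose block number
    (idx[k] >> bm::set_block_shift) differs from 'nb', and the scalar tail
    finishes the remainder, so the function reports how far the run of
    indexes belonging to block 'nb' extends.
*/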
/* sse42_set_block_bits(): set bits at idx[start..stop) in the block */
    const unsigned unroll_factor = 4;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);
    idx += start;
    BM_ALIGN16 unsigned mshift_v[4] BM_ALIGN16ATTR;
    BM_ALIGN16 unsigned mword_v[4] BM_ALIGN16ATTR;
    __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
    __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
    unsigned k = 0;
    for (; k < len_unr; k += unroll_factor)
    {
        __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
        __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx & set_block_mask
        __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // word index
        nbitA = _mm_and_si128 (nbitA, sw_mask); // bit position within the word
        _mm_store_si128 ((__m128i*)mshift_v, nbitA);
        // fast path: all 4 bits fall into the same word
        __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0);
        __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA);
        if (_mm_test_all_ones(cmpA))
        {
            unsigned nword = _mm_extract_epi32(nwordA, 0);
            block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
                            | (1u << mshift_v[2]) | (1u << mshift_v[3]);
        }
        else // scatter into (up to) 4 different words
        {
            _mm_store_si128 ((__m128i*)mword_v, nwordA);
            block[mword_v[0]] |= (1u << mshift_v[0]);
            block[mword_v[1]] |= (1u << mshift_v[1]);
            block[mword_v[2]] |= (1u << mshift_v[2]);
            block[mword_v[3]] |= (1u << mshift_v[3]);
        }
    }
    for (; k < len; ++k) // scalar tail
    {
        unsigned n = idx[k];
        unsigned nbit = unsigned(n & bm::set_block_mask);
        unsigned nword = nbit >> bm::set_word_shift;
        nbit &= bm::set_word_mask;
        block[nword] |= (1u << nbit);
    }
/* sse4_bit_block_gather_scatter() */
    const unsigned unroll_factor = 4;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    BM_ALIGN16 unsigned mshift_v[4] BM_ALIGN16ATTR;
    BM_ALIGN16 unsigned mword_v[4] BM_ALIGN16ATTR;

    __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
    __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
    __m128i maskFF = _mm_set1_epi32(~0u);
    __m128i maskZ = _mm_xor_si128(maskFF, maskFF);

    __m128i mask_tmp, mask_0;

    unsigned k = 0;
    unsigned base = start + k;
    __m128i* idx_ptr = (__m128i*)(idx + base);
    __m128i* target_ptr = (__m128i*)(arr + base);
    for (; k < len_unr; k += unroll_factor)
    {
        __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask);
        __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift);

        _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
        _mm_store_si128 ((__m128i*)mword_v, nwordA);

        // build mask_0 = (1 << mshift_v[i]) per lane
        __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u); // lane-0 selector
        __m128i mask1 = _mm_srli_epi32 (maskFF, 31); // broadcast 1
        mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
        mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
        mask_0 = _mm_or_si128 (mask_0, mask_tmp);

        __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
                                        _mm_slli_si128 (am_0, 8));
        mask_tmp = _mm_and_si128 (
                        _mm_slli_epi32(mask1, mshift_v[3]),
                        _mm_slli_si128 (am_0, 12)
                        );
        mask_0 = _mm_or_si128 (mask_0,
                               _mm_or_si128 (mask_2, mask_tmp));
        // equivalent direct construction (alternate branch in the full header):
        mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);

        // gather the four addressed words and test the selected bits
        mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
                                               blk[mword_v[1]], blk[mword_v[0]]),
                                 mask_0);

        mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // 0xFF.. where the bit is 0
        mask_tmp = _mm_xor_si128 (mask_tmp, maskFF);  // invert
        mask_tmp = _mm_srli_epi32 (mask_tmp, 31);     // 0 or 1 per lane
        mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // move to the target bit

        _mm_storeu_si128 (target_ptr,
                          _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));

        ++idx_ptr; ++target_ptr;
        _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
    }
    for (; k < len; ++k)
    {
        // ... (scalar tail)
    }
/* sse42_shift_l1(): shift the whole block left by 1 bit */
    __m128i* block_end =
        (__m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i mAcc = _mm_set1_epi32(0);
    __m128i mMask1 = _mm_set1_epi32(1);

    unsigned co2;
    for (--block_end; block_end >= block; block_end -= 2)
    {
        __m128i m1A = _mm_load_si128(block_end);
        __m128i m2A = _mm_load_si128(block_end-1);

        __m128i m1CO = _mm_and_si128(m1A, mMask1); // bits shifted out
        __m128i m2CO = _mm_and_si128(m2A, mMask1);

        co2 = _mm_extract_epi32(m1CO, 0);

        m1A = _mm_srli_epi32(m1A, 1); // each word: w >> 1
        m2A = _mm_srli_epi32(m2A, 1);

        // carry lanes move down one int32; carry-in enters the top lane
        __m128i m1COshft = _mm_srli_si128 (m1CO, 4);
        __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
        m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
        m1COshft = _mm_slli_epi32(m1COshft, 31);
        m2COshft = _mm_slli_epi32(m2COshft, 31);

        m1A = _mm_or_si128(m1A, m1COshft);
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_extract_epi32(m2CO, 0);

        _mm_store_si128(block_end, m1A);
        _mm_store_si128(block_end-1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);
    }
    *empty_acc = !_mm_testz_si128(mAcc, mAcc);
    return co1;
/* sse42_shift_r1(): shift the whole block right by 1 bit */
    __m128i* block_end =
        (__m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i m1COshft, m2COshft;
    __m128i mAcc = _mm_set1_epi32(0);

    unsigned co2;
    for (; block < block_end; block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31); // bits shifted out
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        m1A = _mm_slli_epi32(m1A, 1); // each word: w << 1
        m2A = _mm_slli_epi32(m2A, 1);

        // carry lanes move up one int32; carry-in enters the bottom lane
        m1COshft = _mm_slli_si128 (m1CO, 4);
        m2COshft = _mm_slli_si128 (m2CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
        m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);

        m1A = _mm_or_si128(m1A, m1COshft);
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_extract_epi32(m2CO, 3);

        _mm_store_si128(block, m1A);
        _mm_store_si128(block+1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);
    }
    *empty_acc = !_mm_testz_si128(mAcc, mAcc);
    return co1;
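
/*
    Scalar model of one word-step of the shift above (bit i of the bitset
    moves to i+1, so every 32-bit word shifts left with carry chaining):

    bm::word_t w = wblock[i];
    bm::word_t co_out = w >> 31;
    wblock[i] = (w << 1) | co_in;
    co_in = co_out;
*/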
/* sse42_shift_r1_and(): shift right by 1 plus AND mask, digest-driven */
    __m128i m1COshft, m2COshft;
    __m128i mAcc = _mm_set1_epi32(0);

    unsigned co2;
    // ... (select the starting wave index from the digest)
    di = unsigned(_mm_popcnt_u64(t - 1));
    // ...
    di += unsigned(_mm_popcnt_u32(t32 - 1));
    // ...
    for (; di < 64 ; ++di)
    {
        // ... (d_base / dmask setup for this wave)
        block = (__m128i*) &wblock[d_base];
        mask_block = (__m128i*) &mblock[d_base];
        mAcc = _mm_xor_si128(mAcc, mAcc); // zero the accumulator
        for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
        {
            __m128i m1A = _mm_load_si128(block);
            __m128i m2A = _mm_load_si128(block+1);

            __m128i m1CO = _mm_srli_epi32(m1A, 31);
            __m128i m2CO = _mm_srli_epi32(m2A, 31);

            co2 = _mm_extract_epi32(m1CO, 3);

            m1A = _mm_slli_epi32(m1A, 1);
            m2A = _mm_slli_epi32(m2A, 1);

            m1COshft = _mm_slli_si128 (m1CO, 4);
            m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

            co1 = co2;
            co2 = _mm_extract_epi32(m2CO, 3);

            m2COshft = _mm_slli_si128 (m2CO, 4);
            m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

            m1A = _mm_or_si128(m1A, m1COshft);
            m2A = _mm_or_si128(m2A, m2COshft);

            co1 = co2;

            m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block));
            m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1));

            mAcc = _mm_or_si128(mAcc, m1A);
            mAcc = _mm_or_si128(mAcc, m2A);

            _mm_store_si128(block, m1A);
            _mm_store_si128(block+1, m2A);
        } // for i
        // ...
        if (_mm_testz_si128(mAcc, mAcc))
        {
            // ... (wave produced nothing)
        }
        // ...
        bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
        d |= (dmask & (w0 << di)); // update the digest
    } // for di
    // ...
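
/*
    Digest-driven note: only waves whose digest bit is set get shifted and
    ANDed against the mask block; the carried bit co1 is ANDed with the mask
    word at the wave boundary (w0) and the digest 'd' is updated when that
    carry survives, so empty waves can be skipped without losing carry state.
*/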
/* sse42_bit_block_xor(): write XOR product of digest-selected waves */
        const __m128i* sub_block = (__m128i*) (block + off);
        __m128i* t_sub_block = (__m128i*)(target_block + off);

        if (/* ... wave selected by the digest ... */)
        {
            const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
            __m128i mA, mB, mC, mD;
            mA = _mm_xor_si128(_mm_load_si128(sub_block),
                               _mm_load_si128(xor_sub_block));
            mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
                               _mm_load_si128(xor_sub_block+1));
            mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
                               _mm_load_si128(xor_sub_block+2));
            mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
                               _mm_load_si128(xor_sub_block+3));

            _mm_store_si128(t_sub_block, mA);
            _mm_store_si128(t_sub_block+1, mB);
            _mm_store_si128(t_sub_block+2, mC);
            _mm_store_si128(t_sub_block+3, mD);

            mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
                               _mm_load_si128(xor_sub_block+4));
            mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
                               _mm_load_si128(xor_sub_block+5));
            mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
                               _mm_load_si128(xor_sub_block+6));
            mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
                               _mm_load_si128(xor_sub_block+7));

            _mm_store_si128(t_sub_block+4, mA);
            _mm_store_si128(t_sub_block+5, mB);
            _mm_store_si128(t_sub_block+6, mC);
            _mm_store_si128(t_sub_block+7, mD);
        }
        else // wave not selected: copy the source verbatim
        {
            _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
            _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
            _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
            _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));

            _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
            _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
            _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
            _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
        }
/* sse42_bit_block_xor_2way(): in-place XOR of digest-selected waves */
        unsigned wave = unsigned(_mm_popcnt_u64(t - 1));
        unsigned off = wave * bm::set_block_digest_wave_size;

        const __m128i* sub_block = (const __m128i*) (xor_block + off);
        __m128i* t_sub_block = (__m128i*)(target_block + off);

        __m128i mA, mB, mC, mD;
        mA = _mm_xor_si128(_mm_load_si128(sub_block),
                           _mm_load_si128(t_sub_block));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
                           _mm_load_si128(t_sub_block+1));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
                           _mm_load_si128(t_sub_block+2));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
                           _mm_load_si128(t_sub_block+3));

        _mm_store_si128(t_sub_block, mA);
        _mm_store_si128(t_sub_block+1, mB);
        _mm_store_si128(t_sub_block+2, mC);
        _mm_store_si128(t_sub_block+3, mD);

        mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
                           _mm_load_si128(t_sub_block+4));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
                           _mm_load_si128(t_sub_block+5));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
                           _mm_load_si128(t_sub_block+6));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
                           _mm_load_si128(t_sub_block+7));

        _mm_store_si128(t_sub_block+4, mA);
        _mm_store_si128(t_sub_block+5, mB);
        _mm_store_si128(t_sub_block+6, mC);
        _mm_store_si128(t_sub_block+7, mD);
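
/*
    Usage note: this 2-way variant XORs target_block with xor_block in place,
    one digest-selected wave at a time; it is the in-place counterpart of
    sse42_bit_block_xor() above, which writes the product to a separate
    target while copying unselected waves verbatim.
*/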
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    sse4_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)first);

#define VECT_AND_BLOCK(dst, src) \
    sse4_and_block((__m128i*) dst, (__m128i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    sse4_and_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    sse4_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
    sse4_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse4_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
    sse4_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

#define VECT_IS_ZERO_BLOCK(dst) \
    sse4_is_all_zero((__m128i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    sse4_is_all_one((__m128i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    sse4_is_digest_zero((__m128i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    sse4_block_set_digest((__m128i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    sse2_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_L1(b, acc, co) \
    sse42_shift_l1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1(b, acc, co) \
    sse42_shift_r1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1_AND(b, co, m, digest) \
    sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    sse42_idx_arr_block_lookup(idx, size, nb, start)

#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
    sse42_set_block_bits(block, idx, start, stop)

#define VECT_BLOCK_CHANGE(block, size) \
    sse42_bit_block_calc_change((__m128i*)block, size)

#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
    sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size, gc, bc)

#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
    sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)

#define VECT_BIT_FIND_FIRST(src, off, pos) \
    sse42_bit_find_first((__m128i*) src, off, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    sse42_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    sse42_bit_block_xor_2way(t, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse42_gap_bfind(buf, pos, is_set)

#define VECT_GAP_TEST(buf, pos) \
    sse42_gap_test(buf, pos)
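
/*
    Dispatch sketch: when the library is built with SSE4.2 optimization
    enabled (BMSSE42OPT), the portable kernels route through these VECT_*
    macros, e.g. (dst/src are hypothetical full bit-block pointers):

    unsigned any = VECT_AND_BLOCK(dst, src); // -> bm::sse4_and_block()
*/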
#pragma GCC diagnostic pop

#pragma warning( pop )
Compute functions for SSE SIMD instruction set (internal).
Bit manipulation primitives (internal).
BMFORCEINLINE bool sse42_test_all_eq_wave2(const void *ptr0, const void *ptr1) BMNOEXCEPT
check if a wave of 2 pointers is the same (NULL or FULL)
bool sse42_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1) BMNOEXCEPT
block shift left by 1
BMFORCEINLINE bool sse42_test_all_zero_wave(const void *ptr) BMNOEXCEPT
check if wave of pointers is all NULL
unsigned sse42_bit_block_calc_change(const __m128i *BMRESTRICT block, unsigned size) BMNOEXCEPT
bm::id_t sse42_bit_count_digest(const bm::word_t *BMRESTRICT block, bm::id64_t digest) BMNOEXCEPT
bool sse42_bit_find_first_diff(const __m128i *BMRESTRICT block1, const __m128i *BMRESTRICT block2, unsigned *pos) BMNOEXCEPT
Find first bit which is different between two bit-blocks.
bool sse42_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1) BMNOEXCEPT
block shift right by 1
void sse42_bit_block_calc_xor_change(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT xor_block, unsigned size, unsigned *BMRESTRICT gc, unsigned *BMRESTRICT bc) BMNOEXCEPT
int sse42_cmpge_u32(__m128i vect4, unsigned value) BMNOEXCEPT
Experimental (test) function to do SIMD vector search (lower bound) in a sorted, growing array.
BMFORCEINLINE bool sse4_is_digest_zero(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if digest stride is all zero bits
BMFORCEINLINE bool sse4_and_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
AND block digest stride dst &= *src.
bool sse4_is_all_zero(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if block is all zero bits
bm::id_t sse4_bit_count(const __m128i *block, const __m128i *block_end) BMNOEXCEPT
BMFORCEINLINE bool sse4_and_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
AND block digest stride 2-way: dst = *src1 & *src2.
void sse42_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
bool sse4_sub_digest_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4) BMNOEXCEPT
SUB block digest stride.
bool sse4_and_or_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
AND-OR block digest stride: dst |= *src1 & *src2.
unsigned sse4_and_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
AND two blocks: dst &= *src.
BMFORCEINLINE bool sse4_sub_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
2-operand SUB (AND NOT) block digest stride: dst = *src1 & ~*src2.
BMFORCEINLINE bool sse4_sub_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
SUB (AND NOT) block digest stride dst &= ~*src.
bool sse4_sub_digest_3way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
SUB block digest stride.
bool sse4_and_digest_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4) BMNOEXCEPT
AND block digest stride.
unsigned sse42_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos) BMNOEXCEPT
Hybrid binary search to test GAP value, starts as binary, then switches to scan.
unsigned sse42_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set) BMNOEXCEPT
Hybrid binary search, starts as binary, then switches to linear scan.
BMFORCEINLINE void sse4_block_set_digest(__m128i *dst, unsigned value) BMNOEXCEPT
set digest stride to 0xFF.. or 0x0 value
bool sse42_shift_r1_and(__m128i *block, bm::word_t co1, const __m128i *BMRESTRICT mask_block, bm::id64_t *digest) BMNOEXCEPT
block shift right by 1 plus AND
bool sse4_is_all_one(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if block is all ONE bits
unsigned sse4_gap_find(const bm::gap_word_t *BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size) BMNOEXCEPT
bool sse42_bit_find_first(const __m128i *BMRESTRICT block, unsigned off, unsigned *pos) BMNOEXCEPT
Find first non-zero bit.
BMFORCEINLINE bool sse42_test_all_zero_wave2(const void *ptr0, const void *ptr1) BMNOEXCEPT
check if 2 waves of pointers are all NULL
bool sse4_and_digest_3way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
AND block digest stride.
void sse42_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
BMFORCEINLINE bool sse42_test_all_one_wave(const void *ptr) BMNOEXCEPT
check if SSE wave is all 0xFFFF...FFFF
void sse42_bit_block_calc_change_bc(const __m128i *BMRESTRICT block, unsigned *gc, unsigned *bc) BMNOEXCEPT
const unsigned set_block_digest_wave_size
const unsigned set_block_mask
BMFORCEINLINE unsigned op_or(unsigned a, unsigned b) BMNOEXCEPT
void sse4_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx) BMNOEXCEPT
bm::id_t sse4_bit_count_op(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT block_end, const __m128i *BMRESTRICT mask_block, Func sse2_func) BMNOEXCEPT
BMFORCEINLINE unsigned op_and(unsigned a, unsigned b) BMNOEXCEPT
const unsigned set_word_shift
void sse42_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop) BMNOEXCEPT
const unsigned set_block_size
unsigned long long int id64_t
const unsigned block_waves
unsigned sse42_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start) BMNOEXCEPT
BMFORCEINLINE unsigned long long bmi_bslr_u64(unsigned long long w) BMNOEXCEPT
unsigned short gap_word_t
const unsigned set_block_shift
const unsigned set_word_mask
BMFORCEINLINE unsigned long long bmi_blsi_u64(unsigned long long w)
BMFORCEINLINE unsigned op_xor(unsigned a, unsigned b) BMNOEXCEPT