/* SWAR population count of 128-bit words, 4 x 32-bit lanes in parallel.
   Classic divide-and-conquer popcount masks:
     mu1 = odd bits, mu2 = 2-bit groups, mu3 = nibbles,
     mu4 = keeps the final 6-bit per-lane sum (max 32 per lane).
   NOTE(review): function signature, the `do {` opener and the declarations
   of mcnt/tmp1/tmp2/block/block_end/tcnt are outside this view; the leading
   numbers on code lines are extraction artifacts, kept verbatim. */
68 const unsigned mu1 = 0x55555555;
69 const unsigned mu2 = 0x33333333;
70 const unsigned mu3 = 0x0F0F0F0F;
71 const unsigned mu4 = 0x0000003F;
/* Broadcast each mask into all four 32-bit lanes. */
74 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
75 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
76 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
77 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
/* mcnt = 0 (x ^ x == 0): running per-lane bit-count accumulator. */
79 mcnt = _mm_xor_si128(m1, m1);
84 __m128i b = _mm_load_si128(block);
/* b = (b & 0x55...) + ((b >> 1) & 0x55...): counts per 2-bit field. */
88 tmp1 = _mm_srli_epi32(b, 1);
89 tmp1 = _mm_and_si128(tmp1, m1);
90 tmp2 = _mm_and_si128(b, m1);
91 b = _mm_add_epi32(tmp1, tmp2);
/* Fold 2-bit counts into 4-bit counts. */
94 tmp1 = _mm_srli_epi32(b, 2);
95 tmp1 = _mm_and_si128(tmp1, m2);
96 tmp2 = _mm_and_si128(b, m2);
97 b = _mm_add_epi32(tmp1, tmp2);
/* Fold 4-bit counts into byte counts (mask once after the add). */
100 tmp1 = _mm_srli_epi32(b, 4);
101 b = _mm_add_epi32(b, tmp1);
102 b = _mm_and_si128(b, m3);
/* Sum bytes into 16-bit halves... */
105 tmp1 = _mm_srli_epi32 (b, 8);
106 b = _mm_add_epi32(b, tmp1);
/* ...then into the low bits of each 32-bit lane; m4 keeps the 6-bit result. */
109 tmp1 = _mm_srli_epi32 (b, 16);
110 b = _mm_add_epi32(b, tmp1);
111 b = _mm_and_si128(b, m4);
113 mcnt = _mm_add_epi32(mcnt, b);
115 }
while (block < block_end);
/* Spill the 4 partial lane counts and sum them for the scalar total. */
119 _mm_store_si128((__m128i*)tcnt, mcnt);
121 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
/* Same SWAR popcount as above, but each source word is first combined with
   the corresponding word of a second (mask) stream via the functor
   `sse2_func` (e.g. AND/OR/XOR) before counting.
   NOTE(review): signature, loop opener and tmp1/tmp2/mcnt declarations are
   outside this view; leading numbers on code lines are extraction artifacts. */
132 const unsigned mu1 = 0x55555555;
133 const unsigned mu2 = 0x33333333;
134 const unsigned mu3 = 0x0F0F0F0F;
135 const unsigned mu4 = 0x0000003F;
/* Broadcast SWAR masks to all lanes. */
138 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
139 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
140 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
141 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
/* mcnt = 0 (x ^ x == 0). */
143 mcnt = _mm_xor_si128(m1, m1);
147 __m128i b = _mm_load_si128(block++);
149 tmp1 = _mm_load_si128(mask_block++);
/* Combine source word with mask word before counting its bits. */
151 b = sse2_func(b, tmp1);
/* 1-bit -> 2-bit field counts. */
154 tmp1 = _mm_srli_epi32(b, 1);
155 tmp1 = _mm_and_si128(tmp1, m1);
156 tmp2 = _mm_and_si128(b, m1);
157 b = _mm_add_epi32(tmp1, tmp2);
/* 2-bit -> 4-bit field counts. */
160 tmp1 = _mm_srli_epi32(b, 2);
161 tmp1 = _mm_and_si128(tmp1, m2);
162 tmp2 = _mm_and_si128(b, m2);
163 b = _mm_add_epi32(tmp1, tmp2);
/* 4-bit -> byte counts. */
166 tmp1 = _mm_srli_epi32(b, 4);
167 b = _mm_add_epi32(b, tmp1);
168 b = _mm_and_si128(b, m3);
/* byte -> 16-bit counts. */
171 tmp1 = _mm_srli_epi32 (b, 8);
172 b = _mm_add_epi32(b, tmp1);
/* 16-bit -> per-lane total, kept in low 6 bits by m4. */
175 tmp1 = _mm_srli_epi32 (b, 16);
176 b = _mm_add_epi32(b, tmp1);
177 b = _mm_and_si128(b, m4);
179 mcnt = _mm_add_epi32(mcnt, b);
181 }
while (block < block_end);
/* Reduce the 4 lane accumulators to the scalar total. */
184 _mm_store_si128((__m128i*)tcnt, mcnt);
186 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
/* Copies one 8x128-bit sub-block (at word offset `off`) into the target,
   XOR-ing it against a companion xor_block on the first path, or copying
   verbatim on the second path (L93-L100).
   Work is done in two 4-register batches (mA..mD) to expose ILP.
   NOTE(review): the branch condition separating the two paths is outside
   this view; leading numbers on code lines are extraction artifacts. */
298 const __m128i* sub_block = (__m128i*) (block + off);
299 __m128i* t_sub_block = (__m128i*)(target_block + off);
/* --- path 1: target = source ^ xor_block --- */
303 const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
304 __m128i mA, mB, mC, mD;
305 mA = _mm_xor_si128(_mm_load_si128(sub_block),
306 _mm_load_si128(xor_sub_block));
307 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
308 _mm_load_si128(xor_sub_block+1));
309 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
310 _mm_load_si128(xor_sub_block+2));
311 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
312 _mm_load_si128(xor_sub_block+3));
313 
314 _mm_store_si128(t_sub_block, mA);
315 _mm_store_si128(t_sub_block+1, mB);
316 _mm_store_si128(t_sub_block+2, mC);
317 _mm_store_si128(t_sub_block+3, mD);
/* second half of the sub-block (registers reused) */
319 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
320 _mm_load_si128(xor_sub_block+4));
321 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
322 _mm_load_si128(xor_sub_block+5));
323 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
324 _mm_load_si128(xor_sub_block+6));
325 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
326 _mm_load_si128(xor_sub_block+7));
327 
328 _mm_store_si128(t_sub_block+4, mA);
329 _mm_store_si128(t_sub_block+5, mB);
330 _mm_store_si128(t_sub_block+6, mC);
331 _mm_store_si128(t_sub_block+7, mD);
/* --- path 2: straight 8x128-bit copy, no XOR --- */
336 _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
337 _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
338 _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
339 _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
340 
341 _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
342 _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
343 _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
344 _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
/* In-place XOR accumulate: target_block[off..off+7] ^= xor_block[off..off+7]
   (128-bit words), in two 4-register batches.
   NOTE(review): enclosing function signature is outside this view; leading
   numbers on code lines are extraction artifacts, kept verbatim. */
370 const __m128i* sub_block = (
const __m128i*) (xor_block + off);
371 __m128i* t_sub_block = (__m128i*)(target_block + off);
372 
373 __m128i mA, mB, mC, mD;
374 mA = _mm_xor_si128(_mm_load_si128(sub_block),
375 _mm_load_si128(t_sub_block));
376 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
377 _mm_load_si128(t_sub_block+1));
378 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
379 _mm_load_si128(t_sub_block+2));
380 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
381 _mm_load_si128(t_sub_block+3));
/* write back first half */
383 _mm_store_si128(t_sub_block, mA);
384 _mm_store_si128(t_sub_block+1, mB);
385 _mm_store_si128(t_sub_block+2, mC);
386 _mm_store_si128(t_sub_block+3, mD);
/* second half (registers reused) */
388 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
389 _mm_load_si128(t_sub_block+4));
390 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
391 _mm_load_si128(t_sub_block+5));
392 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
393 _mm_load_si128(t_sub_block+6));
394 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
395 _mm_load_si128(t_sub_block+7));
396 
397 _mm_store_si128(t_sub_block+4, mA);
398 _mm_store_si128(t_sub_block+5, mB);
399 _mm_store_si128(t_sub_block+6, mC);
400 _mm_store_si128(t_sub_block+7, mD);
/* dst &= src over 8x128-bit words, with zero detection per 4-word half:
   z1/z2 are true when that half of the result is all zero
   (cmpeq_epi8 against zero + movemask == 0xFFFF).
   NOTE(review): signature and final use of z1/z2 are outside this view;
   leading numbers on code lines are extraction artifacts. */
418 __m128i m1A, m1B, m1C, m1D;
419 const __m128i maskz = _mm_setzero_si128();
/* first half: AND and store back */
421 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
422 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
423 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
424 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
425 
426 _mm_store_si128(dst+0, m1A);
427 _mm_store_si128(dst+1, m1B);
428 _mm_store_si128(dst+2, m1C);
429 _mm_store_si128(dst+3, m1D);
/* OR-reduce the 4 results; zero iff all four words are zero */
431 m1A = _mm_or_si128(m1A, m1B);
432 m1C = _mm_or_si128(m1C, m1D);
433 m1A = _mm_or_si128(m1A, m1C);
434 
435 bool z1 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
/* second half: same AND / store / zero-test */
437 m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
438 m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
439 m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
440 m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
441 
442 _mm_store_si128(dst+4, m1A);
443 _mm_store_si128(dst+5, m1B);
444 _mm_store_si128(dst+6, m1C);
445 _mm_store_si128(dst+7, m1D);
446 
447 m1A = _mm_or_si128(m1A, m1B);
448 m1C = _mm_or_si128(m1C, m1D);
449 m1A = _mm_or_si128(m1A, m1C);
450 
451 bool z2 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
/* dst |= (src1 & src2) over 8x128-bit words.
   z1/z2 test whether the AND *product* (before the OR into dst) is all
   zero for each 4-word half — i.e. whether the OR contributed anything.
   NOTE(review): signature, mACC1 declaration and use of z1/z2 are outside
   this view; leading numbers on code lines are extraction artifacts. */
468 __m128i m1A, m1B, m1C, m1D;
469 
470 const __m128i maskz = _mm_setzero_si128();
/* first half: AND product */
472 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
473 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
474 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
475 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
/* zero-test the product before merging into dst */
477 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
478 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
/* OR product into dst and store */
480 m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
481 m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
482 m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
483 m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);
484 
485 _mm_store_si128(dst+0, m1A);
486 _mm_store_si128(dst+1, m1B);
487 _mm_store_si128(dst+2, m1C);
488 _mm_store_si128(dst+3, m1D);
/* second half: same sequence */
491 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
492 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
493 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
494 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
495 
496 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
497 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
498 
499 m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
500 m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
501 m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
502 m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);
503 
504 _mm_store_si128(dst+4, m1A);
505 _mm_store_si128(dst+5, m1B);
506 _mm_store_si128(dst+6, m1C);
507 _mm_store_si128(dst+7, m1D);
/* 5-operand AND: dst &= (src1 & src2) & (src3 & src4), 8x128-bit words,
   with per-half zero detection in z1/z2.
   NOTE(review): signature and use of z1/z2 are outside this view; leading
   numbers on code lines are extraction artifacts, kept verbatim. */
525 __m128i m1A, m1B, m1C, m1D;
526 __m128i m1E, m1F, m1G, m1H;
/* first half: pairwise ANDs of the four sources */
528 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
529 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
530 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
531 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
532 
533 m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
534 m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
535 m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
536 m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
/* combine the two pairs */
538 m1A = _mm_and_si128(m1A, m1E);
539 m1B = _mm_and_si128(m1B, m1F);
540 m1C = _mm_and_si128(m1C, m1G);
541 m1D = _mm_and_si128(m1D, m1H);
/* fold in the destination and store */
543 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
544 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
545 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
546 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
547 
548 _mm_store_si128(dst+0, m1A);
549 _mm_store_si128(dst+1, m1B);
550 _mm_store_si128(dst+2, m1C);
551 _mm_store_si128(dst+3, m1D);
/* OR-reduce and zero-test this half */
553 m1A = _mm_or_si128(m1A, m1B);
554 m1C = _mm_or_si128(m1C, m1D);
555 m1A = _mm_or_si128(m1A, m1C);
556 
557 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
/* second half: identical sequence on words 4..7 */
559 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
560 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
561 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
562 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
563 
564 m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
565 m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
566 m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
567 m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
568 
569 m1A = _mm_and_si128(m1A, m1E);
570 m1B = _mm_and_si128(m1B, m1F);
571 m1C = _mm_and_si128(m1C, m1G);
572 m1D = _mm_and_si128(m1D, m1H);
573 
574 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
575 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
576 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
577 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
578 
579 _mm_store_si128(dst+4, m1A);
580 _mm_store_si128(dst+5, m1B);
581 _mm_store_si128(dst+6, m1C);
582 _mm_store_si128(dst+7, m1D);
583 
584 m1A = _mm_or_si128(m1A, m1B);
585 m1C = _mm_or_si128(m1C, m1D);
586 m1A = _mm_or_si128(m1A, m1C);
587 
588 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
/* dst &= (src1 & src2) over 8x128-bit words with per-half zero detection.
   NOTE(review): embedded line numbers jump 609->621 and 640->652 — lines
   are missing from this view at those points; signature and use of z1/z2
   are also outside the view. Code lines kept byte-identical. */
603 __m128i m1A, m1B, m1C, m1D;
/* first half: src1 & src2 */
606 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
607 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
608 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
609 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
/* fold in dst and store back */
621 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
622 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
623 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
624 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
625 
626 _mm_store_si128(dst+0, m1A);
627 _mm_store_si128(dst+1, m1B);
628 _mm_store_si128(dst+2, m1C);
629 _mm_store_si128(dst+3, m1D);
/* OR-reduce and zero-test this half */
631 m1A = _mm_or_si128(m1A, m1B);
632 m1C = _mm_or_si128(m1C, m1D);
633 m1A = _mm_or_si128(m1A, m1C);
634 
635 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
/* second half: words 4..7 */
637 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
638 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
639 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
640 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
641 
652 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
653 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
654 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
655 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
656 
657 _mm_store_si128(dst+4, m1A);
658 _mm_store_si128(dst+5, m1B);
659 _mm_store_si128(dst+6, m1C);
660 _mm_store_si128(dst+7, m1D);
661 
662 m1A = _mm_or_si128(m1A, m1B);
663 m1C = _mm_or_si128(m1C, m1D);
664 m1A = _mm_or_si128(m1A, m1C);
665 
666 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
/* dst = src1 & src2 (pure 2-in/1-out AND, dst not read) over 8x128-bit
   words, with per-half zero detection in z1/z2.
   NOTE(review): signature and use of z1/z2 are outside this view; leading
   numbers on code lines are extraction artifacts, kept verbatim. */
685 __m128i m1A, m1B, m1C, m1D;
/* first half */
687 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
688 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
689 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
690 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
691 
692 _mm_store_si128(dst+0, m1A);
693 _mm_store_si128(dst+1, m1B);
694 _mm_store_si128(dst+2, m1C);
695 _mm_store_si128(dst+3, m1D);
/* OR-reduce and zero-test */
697 m1A = _mm_or_si128(m1A, m1B);
698 m1C = _mm_or_si128(m1C, m1D);
699 m1A = _mm_or_si128(m1A, m1C);
700 
701 const __m128i maskz = _mm_setzero_si128();
702 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* second half */
704 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
705 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
706 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
707 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
708 
709 _mm_store_si128(dst+4, m1A);
710 _mm_store_si128(dst+5, m1B);
711 _mm_store_si128(dst+6, m1C);
712 _mm_store_si128(dst+7, m1D);
713 
714 m1A = _mm_or_si128(m1A, m1B);
715 m1C = _mm_or_si128(m1C, m1D);
716 m1A = _mm_or_si128(m1A, m1C);
717 
718 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* dst &= ~src (set subtraction) over 8x128-bit words; _mm_andnot_si128
   computes (~first) & second, so the src operand comes first.
   z1/z2 flag an all-zero result per 4-word half.
   NOTE(review): signature and use of z1/z2 are outside this view; leading
   numbers on code lines are extraction artifacts, kept verbatim. */
734 __m128i m1A, m1B, m1C, m1D;
735 const __m128i maskz = _mm_setzero_si128();
/* first half: dst & ~src */
737 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
738 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
739 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
740 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
741 
742 _mm_store_si128(dst+0, m1A);
743 _mm_store_si128(dst+1, m1B);
744 _mm_store_si128(dst+2, m1C);
745 _mm_store_si128(dst+3, m1D);
/* OR-reduce and zero-test */
747 m1A = _mm_or_si128(m1A, m1B);
748 m1C = _mm_or_si128(m1C, m1D);
749 m1A = _mm_or_si128(m1A, m1C);
750 
751 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* second half */
753 m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
754 m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
755 m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
756 m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
757 
758 _mm_store_si128(dst+4, m1A);
759 _mm_store_si128(dst+5, m1B);
760 _mm_store_si128(dst+6, m1C);
761 _mm_store_si128(dst+7, m1D);
762 
763 m1A = _mm_or_si128(m1A, m1B);
764 m1C = _mm_or_si128(m1C, m1D);
765 m1A = _mm_or_si128(m1A, m1C);
766 
767 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* dst = src1 & ~src2 (set subtraction into a third buffer) over 8x128-bit
   words; z1/z2 flag an all-zero result per half.
   NOTE(review): signature and use of z1/z2 are outside this view; leading
   numbers on code lines are extraction artifacts, kept verbatim. */
784 __m128i m1A, m1B, m1C, m1D;
785 const __m128i maskz = _mm_setzero_si128();
/* first half: src1 & ~src2 (andnot negates its FIRST argument) */
787 m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
788 m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
789 m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
790 m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
791 
792 _mm_store_si128(dst+0, m1A);
793 _mm_store_si128(dst+1, m1B);
794 _mm_store_si128(dst+2, m1C);
795 _mm_store_si128(dst+3, m1D);
/* OR-reduce and zero-test */
797 m1A = _mm_or_si128(m1A, m1B);
798 m1C = _mm_or_si128(m1C, m1D);
799 m1A = _mm_or_si128(m1A, m1C);
800 
801 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* second half */
803 m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
804 m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
805 m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
806 m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
807 
808 _mm_store_si128(dst+4, m1A);
809 _mm_store_si128(dst+5, m1B);
810 _mm_store_si128(dst+6, m1C);
811 _mm_store_si128(dst+7, m1D);
812 
813 m1A = _mm_or_si128(m1A, m1B);
814 m1C = _mm_or_si128(m1C, m1D);
815 m1A = _mm_or_si128(m1A, m1C);
816 
817 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* 5-operand: dst &= ~src1 & ~src2 & ~src3 & ~src4 over 8x128-bit words.
   Inversion is done by XOR with an all-ones mask (maskFF); z1/z2 flag an
   all-zero result per half.
   NOTE(review): signature and use of z1/z2 are outside this view; leading
   numbers on code lines are extraction artifacts, kept verbatim. */
834 __m128i m1A, m1B, m1C, m1D;
835 __m128i m1E, m1F, m1G, m1H;
/* all-ones mask used to complement each loaded word (x ^ ~0 == ~x) */
836 __m128i maskFF = _mm_set1_epi32(~0u);
/* first half: ~src1 & ~src2 and ~src3 & ~src4 */
838 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
839 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
840 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
841 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
842 
843 m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
844 m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
845 m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
846 m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));
/* combine the two pairs */
848 m1A = _mm_and_si128(m1A, m1E);
849 m1B = _mm_and_si128(m1B, m1F);
850 m1C = _mm_and_si128(m1C, m1G);
851 m1D = _mm_and_si128(m1D, m1H);
/* fold in the destination and store */
853 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
854 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
855 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
856 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
857 
858 _mm_store_si128(dst+0, m1A);
859 _mm_store_si128(dst+1, m1B);
860 _mm_store_si128(dst+2, m1C);
861 _mm_store_si128(dst+3, m1D);
/* OR-reduce and zero-test this half */
863 m1A = _mm_or_si128(m1A, m1B);
864 m1C = _mm_or_si128(m1C, m1D);
865 m1A = _mm_or_si128(m1A, m1C);
866 
867 const __m128i maskz = _mm_setzero_si128();
868 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* second half: identical sequence on words 4..7 */
870 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
871 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
872 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
873 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
874 
875 m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
876 m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
877 m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
878 m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));
879 
880 m1A = _mm_and_si128(m1A, m1E);
881 m1B = _mm_and_si128(m1B, m1F);
882 m1C = _mm_and_si128(m1C, m1G);
883 m1D = _mm_and_si128(m1D, m1H);
884 
885 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
886 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
887 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
888 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
889 
890 _mm_store_si128(dst+4, m1A);
891 _mm_store_si128(dst+5, m1B);
892 _mm_store_si128(dst+6, m1C);
893 _mm_store_si128(dst+7, m1D);
894 
895 m1A = _mm_or_si128(m1A, m1B);
896 m1C = _mm_or_si128(m1C, m1D);
897 m1A = _mm_or_si128(m1A, m1C);
898 
899 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* dst &= ~src1 & ~src2 over 8x128-bit words (inversion via XOR with
   all-ones); z1/z2 flag an all-zero result per half.
   NOTE(review): embedded line numbers jump 922->934 and 954->966 — lines
   are missing from this view at those points; signature and use of z1/z2
   are also outside the view. Code lines kept byte-identical. */
915 __m128i m1A, m1B, m1C, m1D;
916 
917 __m128i maskFF = _mm_set1_epi32(~0u);
/* first half: ~src1 & ~src2 */
919 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
920 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
921 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
922 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
/* fold in dst and store */
934 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
935 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
936 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
937 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
938 
939 _mm_store_si128(dst+0, m1A);
940 _mm_store_si128(dst+1, m1B);
941 _mm_store_si128(dst+2, m1C);
942 _mm_store_si128(dst+3, m1D);
/* OR-reduce and zero-test this half */
944 m1A = _mm_or_si128(m1A, m1B);
945 m1C = _mm_or_si128(m1C, m1D);
946 m1A = _mm_or_si128(m1A, m1C);
947 
948 const __m128i maskz = _mm_setzero_si128();
949 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* second half: words 4..7 */
951 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
952 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
953 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
954 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
955 
966 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
967 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
968 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
969 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
970 
971 _mm_store_si128(dst+4, m1A);
972 _mm_store_si128(dst+5, m1B);
973 _mm_store_si128(dst+6, m1C);
974 _mm_store_si128(dst+7, m1D);
975 
976 m1A = _mm_or_si128(m1A, m1B);
977 m1C = _mm_or_si128(m1C, m1D);
978 m1A = _mm_or_si128(m1A, m1C);
979 
980 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
/* Shift the whole block left by 1 bit with carry propagation across
   32-bit lanes and across 128-bit words (two words per iteration).
   Per word: capture the top bit of each lane (srli 31), shift lanes left,
   byte-shift the carries one lane up (slli_si128 by 4), clear lane 0 with
   mMask0, inject the incoming carry (co1/co2) into lane 0, then OR in.
   mAcc accumulates all result words so z1 can report an all-zero block.
   NOTE(review): signature, co1/co2 declarations, the closing of the for
   loop and use of z1 are outside this view; leading numbers on code lines
   are extraction artifacts, kept verbatim. */
1114 __m128i* block_end =
1116 __m128i m1COshft, m2COshft;
1117 __m128i mAcc = _mm_set1_epi32(0);
/* keeps lanes 1..3 after the byte shift; lane 0 receives the carry-in */
1119 __m128i mMask0 = _mm_set_epi32(-1,-1,-1, 0);
1121 
1122 for (;block < block_end; block += 2)
1124 __m128i m1A = _mm_load_si128(block);
1125 __m128i m2A = _mm_load_si128(block+1);
/* top bit of every 32-bit lane = carry-out of that lane */
1127 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1128 __m128i m2CO = _mm_srli_epi32(m2A, 31);
/* carry out of word 1's highest lane feeds word 2's lane 0 */
1130 co2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m1CO, 0xFF));
1131 
1132 m1A = _mm_slli_epi32(m1A, 1);
1133 m2A = _mm_slli_epi32(m2A, 1);
/* move per-lane carries one lane toward the high end */
1135 m1COshft = _mm_slli_si128 (m1CO, 4);
1136 m2COshft = _mm_slli_si128 (m2CO, 4);
1137 
1138 m1COshft = _mm_and_si128(m1COshft, mMask0);
1139 m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(0, 0, 0, co1));
1140 
1141 m2COshft = _mm_and_si128(m2COshft, mMask0);
1142 m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(0, 0, 0, co2));
1143 
1144 m1A = _mm_or_si128(m1A, m1COshft);
1145 m2A = _mm_or_si128(m2A, m2COshft);
/* carry out of word 2's highest lane becomes next iteration's carry-in */
1147 co1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m2CO, 0xFF));
1148 
1149 _mm_store_si128(block, m1A);
1150 _mm_store_si128(block+1, m2A);
/* accumulate for the final all-zero test */
1152 mAcc = _mm_or_si128(mAcc, m1A);
1153 mAcc = _mm_or_si128(mAcc, m2A);
1154 
1155 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
/* Shift the whole block right by 1 bit with carry propagation, walking
   the block backwards two 128-bit words at a time.
   Per word: capture bit 0 of each lane (AND with mMask1), shift lanes
   right, byte-shift the carries one lane down (srli_si128 by 4), clear
   the top lane with mMask0, inject the incoming carry (co1/co2) into the
   top lane, promote carries to bit 31 (slli 31), then OR in.
   mAcc accumulates all results so z1 can report an all-zero block.
   NOTE(review): signature, co1/co2 declarations, the loop closing and use
   of z1 are outside this view; leading numbers are extraction artifacts. */
1167 __m128i* block_end =
1169 __m128i mAcc = _mm_set1_epi32(0);
1170 __m128i mMask1 = _mm_set1_epi32(1);
/* keeps lanes 0..2 after the byte shift; lane 3 receives the carry-in */
1171 __m128i mMask0 = _mm_set_epi32(0, -1, -1, -1);
1173 
1174 for (--block_end; block_end >= block; block_end -= 2)
1176 __m128i m1A = _mm_load_si128(block_end);
1177 __m128i m2A = _mm_load_si128(block_end-1);
/* bit 0 of every lane = carry-out of that lane */
1179 __m128i m1CO = _mm_and_si128(m1A, mMask1);
1180 __m128i m2CO = _mm_and_si128(m2A, mMask1);
/* carry out of the higher word's lane 0 feeds the lower word's top lane */
1182 co2 = _mm_cvtsi128_si32 (m1CO);
1183 
1184 m1A = _mm_srli_epi32(m1A, 1);
1185 m2A = _mm_srli_epi32(m2A, 1);
/* move per-lane carries one lane toward the low end */
1187 __m128i m1COshft = _mm_srli_si128 (m1CO, 4);
1188 __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1189 
1192 m1COshft = _mm_and_si128(m1COshft, mMask0);
1193 m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(co1, 0, 0, 0));
1194 m2COshft = _mm_and_si128(m2COshft, mMask0);
1195 m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(co2, 0, 0, 0));
/* a carry entering from above lands in bit 31 of the lane below */
1198 m1COshft = _mm_slli_epi32(m1COshft, 31);
1199 m2COshft = _mm_slli_epi32(m2COshft, 31);
1200 
1201 m1A = _mm_or_si128(m1A, m1COshft);
1202 m2A = _mm_or_si128(m2A, m2COshft);
/* carry out of the lower word feeds the next (lower) iteration */
1204 co1 = _mm_cvtsi128_si32 (m2CO);
1205 
1206 _mm_store_si128(block_end, m1A);
1207 _mm_store_si128(block_end-1, m2A);
/* accumulate for the final all-zero test */
1209 mAcc = _mm_or_si128(mAcc, m1A);
1210 mAcc = _mm_or_si128(mAcc, m2A);
1212 
1213 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
/* Combined pass: SWAR popcount of the block (accumulated in mcnt, written
   to *bit_count) while simultaneously counting bit transitions across the
   scalar word stream (returned in `count`, GAP-length style).
   Transition logic per 32-bit word w0:
     - tmp2 = b ^ (b >> 1) stored to tcnt exposes intra-word transitions;
     - `count -= !(w_prev ^ (w0 & 1))` cancels a transition when the word
       boundary does not actually change value;
     - `w_prev = (w0 >> w_shift)` keeps the word's top bit for the next
       boundary check; `count -= !w_prev; w_prev ^= w_prev;` handles the
       zero-word shortcut (w_prev also reset to 0).
   NOTE(review): signature, declarations of mcnt/tmp1/tmp2/tcnt/w0/w_prev,
   and large stretches between the embedded line numbers are outside this
   view — the above reading of the transition bookkeeping is a reviewer
   interpretation, to be confirmed against the full file. Code lines are
   kept byte-identical; leading numbers are extraction artifacts. */
1225 const unsigned mu1 = 0x55555555;
1226 const unsigned mu2 = 0x33333333;
1227 const unsigned mu3 = 0x0F0F0F0F;
1228 const unsigned mu4 = 0x0000003F;
/* SWAR popcount masks broadcast to all lanes */
1231 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
1232 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
1233 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
1234 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
/* mcnt = 0 (x ^ x == 0) */
1236 mcnt = _mm_xor_si128(m1, m1);
/* 4 x 32-bit words per 128-bit block element */
1241 int count = (int)(block_end - block)*4;
1242 
1244 const int w_shift =
sizeof(w) * 8 - 1;
1245 bool first_word =
true;
/* prime w_prev with the first word's top bit */
1253 count -= (w_prev = (w0 >> w_shift));
1263 __m128i b = _mm_load_si128(block);
/* b ^ (b >> 1): marks positions where adjacent bits differ */
1266 tmp1 = _mm_srli_epi32(b, 1);
1267 tmp2 = _mm_xor_si128(b, tmp1);
1268 _mm_store_si128((__m128i*)tcnt, tmp2);
/* standard SWAR popcount of b (tmp1 still holds b >> 1) */
1276 tmp1 = _mm_and_si128(tmp1, m1);
1277 tmp2 = _mm_and_si128(b, m1);
1278 b = _mm_add_epi32(tmp1, tmp2);
1279 
1281 tmp1 = _mm_srli_epi32(b, 2);
1282 tmp1 = _mm_and_si128(tmp1, m2);
1283 tmp2 = _mm_and_si128(b, m2);
1284 b = _mm_add_epi32(tmp1, tmp2);
1285 
1287 tmp1 = _mm_srli_epi32(b, 4);
1288 b = _mm_add_epi32(b, tmp1);
1289 b = _mm_and_si128(b, m3);
1290 
1292 tmp1 = _mm_srli_epi32 (b, 8);
1293 b = _mm_add_epi32(b, tmp1);
1294 
1296 tmp1 = _mm_srli_epi32 (b, 16);
1297 b = _mm_add_epi32(b, tmp1);
1298 b = _mm_and_si128(b, m4);
1299 
1300 mcnt = _mm_add_epi32(mcnt, b);
/* scalar boundary bookkeeping for each of the 4 words of this block */
1321 count -= !(w_prev ^ (w0 & 1));
1322 count -= w_prev = (w0 >> w_shift);
1326 count -= !w_prev; w_prev ^= w_prev;
1331 
1332 count -= !(w_prev ^ (w0 & 1));
1333 count -= w_prev = (w0 >> w_shift);
1337 count -= !w_prev; w_prev ^= w_prev;
1341 
1342 count -= !(w_prev ^ (w0 & 1));
1343 count -= w_prev = (w0 >> w_shift);
1347 count -= !w_prev; w_prev ^= w_prev;
1351 
1352 count -= !(w_prev ^ (w0 & 1));
1353 count -= w_prev = (w0 >> w_shift);
1357 count -= !w_prev; w_prev ^= w_prev;
1359 
1360 }
while (++block < block_end);
/* reduce the SIMD popcount lanes and return the transition count */
1362 _mm_store_si128((__m128i*)tcnt, mcnt);
1363 *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
1364 
1365 return unsigned(count);
/* Vectorized lower-bound scan over a sorted array of 16-bit values:
   find the first element >= pos. First handles the size<=2 prefix in
   scalar, then loads 8 uint16 at a time; lanes beyond `size` are forced
   to 0xFFFF (maskFL) so they always compare >= pos.
   The >= comparison uses a saturating-subtract trick:
   _mm_subs_epu16(pos, x) == 0  <=>  x >= pos.
   NOTE(review): signature, the loop over pbuf2, bsr_i computation from
   `mi`, and several lines between embedded numbers are outside this view;
   leading numbers on code lines are extraction artifacts, kept verbatim. */
1385 const unsigned unroll_factor = 8;
/* scalar fast path for the first up-to-3 elements */
1388 if (pbuf[0] >= pos) { size = 0; }
1389 else if (pbuf[1] >= pos) { size = 1; }
1390 else { size = 2;
BM_ASSERT(pbuf[2] >= pos); }
1393 
1394 __m128i m1, mz, maskF, maskFL;
1395 
1396 mz = _mm_setzero_si128();
1397 m1 = _mm_loadu_si128((__m128i*)(pbuf));
/* maskF = all ones (x == x per lane); maskFL masks the tail lanes */
1399 maskF = _mm_cmpeq_epi32(mz, mz);
1400 maskFL = _mm_slli_si128(maskF, 4 * 2);
1401 int shiftL = (64 - (unroll_factor - size) * 16);
1402 maskFL = _mm_slli_epi64(maskFL, shiftL);
/* force tail lanes to 0xFFFF so they compare as >= pos */
1404 m1 = _mm_andnot_si128(maskFL, m1);
1405 m1 = _mm_or_si128(m1, maskFL);
/* mge_mask: 0xFFFF in every 16-bit lane where element >= pos */
1407 __m128i mp = _mm_set1_epi16(pos);
1408 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
1409 int mi = _mm_movemask_epi8(mge_mask);
/* continue scanning subsequent 8-element chunks */
1423 m1 = _mm_loadu_si128((__m128i*)(pbuf2));
1424 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
1425 mi = _mm_movemask_epi8(mge_mask);
/* translate the highest matching lane back to an array index */
1429 return size - (unroll_factor - bsr_i);