BitMagic-C++
xsample05.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample05.cpp
20*/
21
22/*! \file xsample05.cpp
23 \brief Example: Example on how to use bit-transposed string sparse vector
24
25 Illustrates how to build a sparse vector, serialize it to disk,
26 load back and do search or binary hybrid search.
27
28 \sa bm::str_sparse_vector<>
29
30*/
31
32#include <iostream>
33#include <sstream>
34#include <chrono>
35#include <regex>
36#include <time.h>
37#include <stdio.h>
38
39
40#include <vector>
41#include <map>
42#include <chrono>
43#include <map>
44#include <utility>
45#include <algorithm>
46#include <random>
47
48using namespace std;
49
50//#define BMAVX2OPT
51
52#include "bm.h"
53#include "bmalgo.h"
54#include "bmserial.h"
55#include "bmrandom.h"
56#include "bmstrsparsevec.h"
57#include "bmsparsevec_algo.h"
58#include "bmsparsevec_serial.h"
59#include "bmalgo_similarity.h"
60#include "bmsparsevec_util.h"
61
62
63#include "bmdbg.h"
64#include "bmtimer.h"
65#include "bmundef.h" /* clear the pre-proc defines from BM */
66
67/// Print help
68static
70{
71 std::cerr
72 << "BitMagic Dictionary Search Sample (c) 2018" << std::endl
73 << "-idict file-name -- input set file to parse" << std::endl
74 << "-svout spase vector output -- sparse vector name to save" << std::endl
75 << "-svin sparse vector input -- sparse vector file name to load " << std::endl
76 << "-remap -- re-mapping of string characters " << std::endl
77 << "-xor -- use XOR compression filtering" << std::endl
78 << "-diag -- run diagnostics" << std::endl
79 << "-bench -- run benchmarks" << std::endl
80 << "-timing -- collect timings" << std::endl
81 ;
82}
83
84
85// Arguments
86//
87std::string sv_out_name;
88std::string sv_in_name;
89std::string i_name;
90bool is_diag = false;
91bool is_timing = false;
92bool is_bench = false;
93bool is_remap = false;
94bool is_xor = false;
95
96// Parse command line arguments
97static
98int parse_args(int argc, char *argv[])
99{
100 for (int i = 1; i < argc; ++i)
101 {
102 std::string arg = argv[i];
103 if ((arg == "-h") || (arg == "--help"))
104 {
105 show_help();
106 return 0;
107 }
108
109 if (arg == "-svout" || arg == "--svout")
110 {
111 if (i + 1 < argc)
112 {
113 sv_out_name = argv[++i];
114 }
115 else
116 {
117 std::cerr << "Error: -svout requires file name" << std::endl;
118 return 1;
119 }
120 continue;
121 }
122
123 if (arg == "-svin" || arg == "--svin")
124 {
125 if (i + 1 < argc)
126 {
127 sv_in_name = argv[++i];
128 }
129 else
130 {
131 std::cerr << "Error: -svin requires file name" << std::endl;
132 return 1;
133 }
134 continue;
135 }
136
137 if (arg == "-idict" || arg == "--idict" )
138 {
139 if (i + 1 < argc)
140 {
141 i_name = argv[++i];
142 }
143 else
144 {
145 std::cerr << "Error: -idict requires file name" << std::endl;
146 return 1;
147 }
148 continue;
149 }
150
151 if (arg == "-remap" || arg == "--remap" || arg == "-r" || arg == "--r")
152 {
153 is_remap = true;
154 continue;
155 }
156 if (arg == "-xor" || arg == "--xor" || arg == "-x" || arg == "--x")
157 {
158 is_xor = true;
159 continue;
160 }
161 if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
162 {
163 is_diag = true;
164 continue;
165 }
166 if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
167 {
168 is_timing = true;
169 continue;
170 }
171 if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
172 {
173 is_bench = true;
174 continue;
175 }
176
177 std::cerr << "Error: unknown argument: " << arg << std::endl;
178 return 1;
179
180
181 } // for i
182 return 0;
183}
184
185
186// Global types
187//
189typedef vector<string> string_vector;
190
191
192
194
195/// Parse the input file and extract dictionary values.
196///
197static
198int load_dict_report(const std::string& fname, string_vector& str_vec)
199{
200 bm::chrono_taker tt1(cout, "1. parse input data ", 1, &timing_map);
201
202 std::ifstream fin(fname.c_str(), std::ios::in);
203 if (!fin.good())
204 return -1;
205
206 std::string line;
207 std::regex reg("[|]");
208 std::sregex_token_iterator it_end;
209
210 string trim_chars("\" ");
211
212 for (unsigned i = 0; std::getline(fin, line); ++i)
213 {
214 if (line.empty() || !isdigit(line.front()))
215 continue;
216
217 // regex based tokenizer
218 std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
219 std::vector<std::string> line_vec(it, it_end);
220 if (line_vec.empty())
221 continue;
222 try
223 {
224 // trip the extra chars
225 string& col13 = line_vec.at(13);
226 col13.erase(0, col13.find_first_not_of(trim_chars));
227 col13.erase(col13.find_last_not_of(trim_chars) + 1);
228
229 if (!col13.empty())
230 str_vec.emplace_back(col13);
231 }
232 catch (std::exception&)
233 {
234 // just ignore (not ideal, but ok for a sketch ETL)
235 }
236 if (i % 10000 == 0)
237 {
238 cout << "\rReading input file: " << i << flush;
239 }
240 } // for
241 cout << endl;
242 return 0;
243}
244
245/// Compare STL vector with bit-transposed container to check correctness
246///
247static
248void check_sparse(const str_sparse_vect& str_sv, const string_vector& str_vec)
249{
250 if (str_vec.size() != str_sv.size())
251 throw runtime_error("Error. size() comparison failed!");
252 string s;
253 for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
254 {
255 str_sv.get(i, s);
256 const string& s_control = str_vec[i];
257 if (s != s_control)
258 {
259 cout << "idx=" << i << s << "!=" << s_control << endl;
260 throw runtime_error("Error. element comparison failed!");
261 }
262 } // for
263 std::cout << "Check ok. Dictionary size = " << str_sv.size() << std:: endl;
264}
265
266const unsigned benchmark_max = 50000; // benchmark sampling size
267
268/// Sample a few random terms out of collection
269static
270void pick_benchmark_set(string_vector& bench_vec, string_vector& bench_vec_not_found, const string_vector& str_vec)
271{
272 std::random_device rand_dev;
273 std::mt19937 gen(rand_dev()); // mersenne_twister_engine
274 std::uniform_int_distribution<unsigned> rand_dis(0, unsigned(str_vec.size()-1)); // generate uniform numebrs for [1, vector_max]
275
276 bm::bvector<> bv;
277
278 bench_vec.resize(0);
279 for (unsigned i = 0; i < benchmark_max; ++i)
280 {
281 unsigned idx;
282 while (true)
283 {
284 idx = unsigned(rand_dis(gen));
285 if (bv.test(idx)) // make sure benchmark example is not repeated
286 continue;
287 if (idx < str_vec.size())
288 bench_vec.push_back(str_vec[idx]);
289 break;
290 }
291 bv.set(idx); // mark as set
292
293 // generate not-found case by modifying a letter in an existing sample
294 {
295 string str_nf = str_vec[idx];
296 string::reverse_iterator rit = str_nf.rbegin();
297 string::reverse_iterator rit_end = str_nf.rend();
298 for (; rit != rit_end; ++rit)
299 {
300 char ch = *rit;
301 int a = rand() % 26 + int('A'); // generate random letter
302 ch = char(a);
303 *rit = ch;
304 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), str_nf);
305 if (it == str_vec.end() || *it != str_nf) // check if not found
306 {
307 bench_vec_not_found.push_back(str_nf);
308 break;
309 }
310 } // for rit
311 }
312
313 } // for
314 cout << endl;
315}
316
317static
318void run_benchmark(const str_sparse_vect& str_sv, const string_vector& str_vec)
319{
320 string_vector bench_vec;
321 string_vector bench_vec_not_found; // vector for impossible dictionary items
322
323 pick_benchmark_set(bench_vec, bench_vec_not_found, str_vec);
324
325 bm::bvector<> bv1, bv2, bv3, bv4;
326
327 cout << "Picked " << bench_vec.size() << " / "
328 << bench_vec_not_found.size() << " samples. Running benchmarks."
329 << endl;
330
331 unsigned bench_size = unsigned(bench_vec.size());
332 {
333 {
334 bm::chrono_taker tt1(cout, "3. std::lower_bound() search", bench_size, &timing_map);
335 for (const string& term : bench_vec)
336 {
337 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), term);
338 if (it != str_vec.end())
339 {
340 string_vector::size_type idx =
341 string_vector::size_type(std::distance(str_vec.begin(), it));
342 bv1.set(unsigned(idx));
343 }
344 } // for
345 }
346 {
347 bm::chrono_taker tt2(cout, "3a. std::lower_bound() search (empty)", bench_size, &timing_map);
348 for (const string& term : bench_vec_not_found)
349 {
350 auto p = std::lower_bound(str_vec.begin(), str_vec.end(), term);
351 (void) p;
352 } // for
353 }
354 }
355
356 {
357 // construct std::map<> (RB-tree)
358 std::map<string, unsigned> str_map;
359 for (string_vector::size_type i = 0; i < str_vec.size(); ++i)
360 {
361 const string& s = str_vec[i];
362 str_map[s] = unsigned(i);
363 } // for
364 {
365 bm::chrono_taker tt1(cout, "4. std::map<> search", bench_size, &timing_map);
366 for (const string& term : bench_vec)
367 {
368 auto it = str_map.find(term);
369 if (it != str_map.end())
370 {
371 bv2.set(unsigned(it->second));
372 }
373 } // for
374 }
375 {
376 bm::chrono_taker tt2(cout, "4a. std::map<> search (empty)", bench_size, &timing_map);
377 for (const string& term : bench_vec_not_found)
378 {
379 auto it = str_map.find(term);
380 if (it != str_map.end())
381 {
382 cerr << "empty search returned value..." << endl;
383 }
384 } // for
385 }
386 }
387
388 {
390 {
391 bm::chrono_taker tt1(cout, "5. bm::sparse_vector_scanner<> search", bench_size, &timing_map);
392 for (const string& term : bench_vec)
393 {
394 unsigned pos;
395 bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
396 if (found)
397 {
398 bv3.set(pos);
399 }
400 } // for
401 }
402 {
403 bm::chrono_taker tt1(cout, "5a. bm::sparse_vector_scanner<> search (empty)", bench_size, &timing_map);
404 for (const string& term : bench_vec_not_found)
405 {
406 unsigned pos;
407 bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
408 if (found)
409 {
410 cerr << "scanner empty search returned value..." << endl;
411 }
412 } // for
413 }
414 }
415
416 {
418 scanner.bind(str_sv, true); // attach SV as permanent search parameter to share cached values
419
420 {
421 bm::chrono_taker tt1(cout, "6. bm::sparse_vector_scanner<> binary search", bench_size, &timing_map);
422 for (const string& term : bench_vec)
423 {
424 unsigned pos;
425 bool found = scanner.bfind_eq_str(term.c_str(), pos);
426 if (found)
427 {
428 bv4.set(pos);
429 }
430 } // for
431 }
432 {
433 bm::chrono_taker tt2(cout, "6a. bm::sparse_vector_scanner<> binary search (empty)", bench_size, &timing_map);
434 for (const string& term : bench_vec_not_found)
435 {
436 unsigned pos;
437 bool found = scanner.bfind_eq_str(term.c_str(), pos);
438 if (found)
439 {
440 cerr << "scanner empty search returned value..." << endl;
441 }
442 } // for
443 }
444 }
445
446 // various integrity checks
447 //
448 int cmp = bv1.compare(bv2);
449 if (cmp != 0)
450 throw runtime_error("Error. RB-search mismatch!");
451 cmp = bv1.compare(bv3);
452 if (cmp != 0)
453 throw runtime_error("Error. scanner mismatch!");
454
455 cmp = bv1.compare(bv4);
456 if (cmp != 0)
457 throw runtime_error("Error. binary scanner mismatch!");
458
459 if (bv1.count() != bench_size)
460 throw runtime_error("Error. Search result missing elements!");
461
462
463}
464
465
466int main(int argc, char *argv[])
467{
468 if (argc < 3)
469 {
470 show_help();
471 return 1;
472 }
473
474 string_vector str_vec; // dictionary vector (STL)
475 str_sparse_vect str_sv; // bit-transposed sparse vector
476
477 try
478 {
479 auto ret = parse_args(argc, argv);
480 if (ret != 0)
481 {
482 return ret;
483 }
484 if (!i_name.empty())
485 {
486 auto res = load_dict_report(i_name, str_vec);
487 if (res != 0)
488 {
489 return res;
490 }
491 cout << "Loaded " << str_vec.size() << " dictionary names." << endl;
492
493 std::sort(str_vec.begin(), str_vec.end());
494 }
495
496 if (str_vec.size()) // load the sparse vector
497 {
498 bm::chrono_taker tt1(cout, "2. build sparse vector", 1, &timing_map);
499 {
500 // use insert iterator to load vector (faster than push-back)
501 //
502 auto bi = str_sv.get_back_inserter();
503 for (const string& term : str_vec)
504 bi = term;
505 bi.flush();
506 }
507
508 // build remapped (dense) succinct vector
509 // (this should be final), no more edits in this form
510 if (is_remap)
511 {
512 str_sparse_vect str_sv_remap;
513 str_sv_remap.remap_from(str_sv);
514 str_sv.swap(str_sv_remap);
515 }
516
518 str_sv.optimize(tb); // memory optimization after load
519 }
520
521 if (!sv_in_name.empty())
522 {
523 {
524 bm::chrono_taker tt1(cout, "8. Load sparse vector", 1, &timing_map);
525 file_load_svector(str_sv, sv_in_name);
526 }
527 if (str_sv.empty())
528 {
529 cout << "Input vector empty!" << endl;
530 exit(1);
531 }
532 if (str_vec.empty())
533 {
534 for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
535 {
536 string s;
537 str_sv.get(i, s);
538 str_vec.emplace_back(std::move(s));
539 } // for
540 }
541 }
542
543 // save SV vector to file
544 if (!sv_out_name.empty() && !str_sv.empty())
545 {
546 bm::chrono_taker tt1(cout, "7. Save sparse vector", 1, &timing_map);
547 file_save_svector(str_sv, sv_out_name, 0, is_xor);
548
549 str_sparse_vect str_sv_control;
550 file_load_svector(str_sv_control, sv_out_name);
551 bool eq = str_sv.equal(str_sv_control);
552 if (!eq)
553 {
554 cerr << "Serialization control failed" << endl;
555 assert(0); exit(1);
556 }
557 }
558
559
560
561 if (is_diag)
562 {
563 if (!str_sv.empty())
564 {
565 print_svector_stat(cout,str_sv, false);
566 }
567
568 if (str_vec.size())
569 {
570 size_t total_size = 0;
571 for (const string& term : str_vec)
572 {
573 total_size += term.size();
574 }
575 cout << "String dictionary size: "
576 << total_size / 1024 << "KB (" << total_size / (1024*1024) << "MB)"
577 << endl;
578 }
579
580 if (str_sv.size() && str_vec.size())
581 {
582 cout << "Run full comparison check..." << endl;
583 check_sparse(str_sv, str_vec); // run a paranoiya check
584 cout << "Ok" << endl;
585 }
586 }
587
588 if (is_bench) // run set of benchmarks
589 {
590 run_benchmark(str_sv, str_vec);
591 }
592
593 if (is_timing) // print all collected timings
594 {
595 std::cout << std::endl << "Performance:" << std::endl;
597 }
598 }
599 catch (std::exception& ex)
600 {
601 std::cerr << "Error:" << ex.what() << std::endl;
602 return 1;
603 }
604
605 return 0;
606}
607
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition bm.h:47
Algorithms for bvector<> (main include).
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Serialization for sparse_vector<>.
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal).
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition bm.h:115
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
Definition bm.h:1502
size_type size() const BMNOEXCEPT
Returns bvector's capacity (number of bits it can store).
Definition bm.h:1300
size_type count() const BMNOEXCEPT
population count (count of ON bits)
Definition bm.h:2401
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition bm.h:4188
void resize(size_type new_size)
Change size of the bvector.
Definition bm.h:2463
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition bm.h:3744
Utility class to collect performance measurements and statistics.
Definition bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition bmtimer.h:150
algorithms for sparse_vector scan/search
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
bool bfind_eq_str(const SV &sv, const value_type *str, size_type &pos)
binary find first sparse vector element (string).
bool find_eq_str(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements (string)
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
bool empty() const
return true if vector is empty
bool equal(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
void remap_from(const str_sparse_vector &str_sv, octet_freq_matrix_type *omatrix=0)
Build remapping profile and load content from another sparse vector Remapped vector likely saves memo...
size_type size() const
return size of the vector
size_type get(size_type idx, value_type *str, size_type buf_size) const BMNOEXCEPT
get specified element
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
void swap(size_type idx1, size_type idx2)
swap two vector elements between each other
std::random_device rand_dev
Definition sample11.cpp:51
std::uniform_int_distribution rand_dis(1, int(vector_max))
bm::chrono_taker ::duration_map_type timing_map
Definition sample11.cpp:46
std::mt19937 gen(rand_dev())
int main(void)
Definition sample1.cpp:43
bool is_bench
std::string sv_in_name
bool is_diag
std::string sv_out_name
bool is_timing
static int parse_args(int argc, char *argv[])
Definition xsample05.cpp:98
static void show_help()
Print help.
Definition xsample05.cpp:69
vector< string > string_vector
static int load_dict_report(const std::string &fname, string_vector &str_vec)
Parse the input file and extract dictionary values.
static void run_benchmark(const str_sparse_vect &str_sv, const string_vector &str_vec)
static void pick_benchmark_set(string_vector &bench_vec, string_vector &bench_vec_not_found, const string_vector &str_vec)
Sample a few random terms out of collection.
static void check_sparse(const str_sparse_vect &str_sv, const string_vector &str_vec)
Compare STL vector with bit-transposed container to check correctness.
bool is_xor
Definition xsample05.cpp:94
std::string i_name
Definition xsample05.cpp:89
const unsigned benchmark_max
bool is_remap
Definition xsample05.cpp:93
bm::str_sparse_vector< char, bm::bvector<>, 64 > str_sparse_vect