Use of bvector<> for k-mer fingerprint K should be short, no minimizers are used here (the approach can be used to create a scheme with minimizers).
Use of bvector<> for k-mer fingerprint K should be short, no minimizers are used here (the approach can be used to create a scheme with minimizers).This example:
#include <assert.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <map>
#include <algorithm>
#include <utility>
#include <future>
#include <thread>
#include <mutex>
#include <atomic>
#include "bm64.h"
#include "bmdbg.h"
#include "dna_finger.h"
using namespace std;
#include "cmd_args.h"
static
{
seq_vect.resize(0);
std::ifstream fin(fname.c_str(), std::ios::in);
if (!fin.good())
return -1;
std::string line;
for (unsigned i = 0; std::getline(fin, line); ++i)
{
if (line.empty() ||
line.front() == '>')
continue;
for (std::string::iterator it = line.begin(); it != line.end(); ++it)
seq_vect.push_back(*it);
}
return 0;
}
inline
{
switch (bp)
{
case 'A':
dna_code = 0;
break;
case 'T':
dna_code = 1;
break;
case 'G':
dna_code = 2;
break;
case 'C':
dna_code = 3;
break;
default:
return false;
}
return true;
}
inline
size_t pos, unsigned k_size,
{
unsigned shift = 0;
dna += pos;
for (size_t i = 0; i < k_size; ++i)
{
char bp = dna[i];
if (!valid)
return false;
k_acc |= (dna_code << shift);
shift += 2;
}
k_mer = k_acc;
return true;
}
inline
{
static char lut[] = { 'A', 'T', 'G', 'C', 'N', '$' };
if (code < 5)
return lut[code];
assert(0);
return 'N';
}
inline
{
dna.resize(k_size);
for (size_t i = 0; i < k_size; ++i)
{
unsigned dna_code = unsigned(kmer_code & 3);
dna[i] = bp;
kmer_code >>= 2;
}
assert(!kmer_code);
}
inline
size_t pos, unsigned k_size,
{
for (size_t i = 0; i < k_size; ++i)
{
char bp = dna[pos+i];
unsigned dna_code = unsigned(k_mer & 3ull);
if (bp != bp_c)
{
if (bp == 'N' && bp_c != 'A')
{
cerr << bp << " " << bp_c << endl;
cerr << "Error! N code mismatch at pos = " << pos+i
<< endl;
exit(1);
}
}
k_mer >>= 2;
}
if (k_mer)
{
cerr << "Error! non-zero k-mer remainder at " << pos << endl;
exit(1);
}
}
template<typename VECT>
{
std::sort(vect.begin(), vect.end());
auto last = std::unique(vect.begin(), vect.end());
vect.erase(last, vect.end());
}
template<typename VECT, typename COUNT_VECT>
{
if (!vect.size())
return;
std::sort(vect.begin(), vect.end());
typename VECT::value_type prev = vect[0];
typename COUNT_VECT::value_type cnt = 1;
auto vsize = vect.size();
size_t i = 1;
for (; i < vsize; ++i)
{
auto v = vect[i];
if (v == prev)
{
++cnt;
continue;
}
cvect.inc_not_null(prev, cnt);
prev = v; cnt = 1;
}
cvect.inc_not_null(prev, cnt);
assert(cvect.in_sync());
}
template<typename BV>
unsigned k_size,
bool check)
{
bv.clear();
bv.init();
if (seq_vect.empty())
return;
const char* dna_str = &seq_vect[0];
std::vector<bm::id64_t> k_buf;
k_buf.reserve(chunk_size);
{
vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
vector_char_type::size_type pos = 0;
bool valid = false;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
const unsigned k_shift = (k_size-1) * 2;
if (valid)
{
for (++pos; pos < dna_sz; ++pos)
{
if (!valid)
{
pos += k_size;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
continue;
}
k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
k_buf.push_back(k_mer_code);
if (check)
{
assert(valid);
assert(k_check == k_mer_code);
}
if (k_buf.size() >= chunk_size)
{
if (k_buf.size())
{
k_buf.resize(0);
bv.optimize();
}
float pcnt = float(pos) / float(dna_sz);
pcnt *= 100;
cout << "\r" << unsigned(pcnt) << "% of " << dna_sz
<< " (" << (pos+1) <<") "
<< flush;
}
}
}
if (k_buf.size())
{
cout << "Unique k-mers: " << k_buf.size() << endl;
}
}
bv.optimize();
}
inline
unsigned k_size,
{
if (seq_vect.empty())
return;
const char* dna_str = &seq_vect[0];
std::vector<bm::id64_t> k_buf;
k_buf.reserve(chunk_size);
vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
vector_char_type::size_type pos = 0;
bool valid = false;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
const unsigned k_shift = (k_size-1) * 2;
if (valid)
{
for (++pos; pos < dna_sz; ++pos)
{
if (!valid)
{
pos += k_size;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
continue;
}
k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
k_buf.push_back(k_mer_code);
if (k_buf.size() == chunk_size)
{
k_buf.resize(0);
float pcnt = float(pos) / float(dna_sz);
pcnt *= 100;
cout << "\r" << unsigned(pcnt) << "% of " << dna_sz
<< " (" << (pos+1) <<") "
<< flush;
}
}
}
}
template<typename BV>
{
public:
unsigned k_size,
: m_seq_vect(seq_vect), m_k_size(k_size), m_from(from), m_to(to),
m_kmer_counts(kmer_counts)
{}
: m_seq_vect(func.m_seq_vect), m_k_size(func.m_k_size),
m_from(func.m_from), m_to(func.m_to),
m_kmer_counts(func.m_kmer_counts)
{}
{
const bvector_type* bv_null = m_kmer_counts.get_null_bvector();
kmer_counts_part.sync();
if (m_seq_vect.empty())
return;
const char* dna_str = &m_seq_vect[0];
std::vector<bm::id64_t> k_buf;
k_buf.reserve(chunk_size);
const auto k_size = m_k_size;
vector_char_type::size_type dna_sz = m_seq_vect.size()-(m_k_size-1);
vector_char_type::size_type pos = 0;
bool valid = false;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
if (k_mer_code >= m_from && k_mer_code <= m_to)
k_buf.push_back(k_mer_code);
break;
}
}
const unsigned k_shift = (k_size-1) * 2;
if (valid)
{
for (++pos; pos < dna_sz; ++pos)
{
if (!valid)
{
pos += k_size;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
if (k_mer_code >= m_from && k_mer_code <= m_to)
k_buf.push_back(k_mer_code);
break;
}
}
continue;
}
k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
if (k_mer_code >= m_from && k_mer_code <= m_to)
{
k_buf.push_back(k_mer_code);
if (k_buf.size() == chunk_size)
{
k_buf.resize(0);
}
}
}
}
{
static std::mutex mtx_counts_lock;
std::lock_guard<std::mutex> guard(mtx_counts_lock);
m_kmer_counts.merge_not_null(kmer_counts_part);
}
}
private:
unsigned m_k_size;
};
template<typename BV>
unsigned k_size,
unsigned concurrency)
{
if (split_rank < concurrency || concurrency < 2)
{
return;
}
std::vector<std::future<void> > futures;
futures.reserve(concurrency);
for (size_t k = 0; k < pair_vect.size(); ++k)
{
futures.emplace_back(std::async(std::launch::async,
pair_vect[k].first,
pair_vect[k].second,
kmer_counts)));
}
for (auto& e : futures)
{
unsigned m = 0;
while(1)
{
std::future_status status = e.wait_for(std::chrono::seconds(60));
if (status == std::future_status::ready)
break;
cout << "\r" << ++m << " min" << flush;
}
}
cout << endl;
}
template<typename BV>
{
std::string kmer_str;
typename BV::size_type cnt = 0;
typename BV::enumerator en = bv_kmers.first();
for ( ;en.valid(); ++en, ++cnt)
{
auto k_mer_code = *en;
typename BV::size_type count =
dna_scanner.FindCount(kmer_str);
assert(count);
kmer_counts.
set(k_mer_code, (
unsigned)count);
if ((cnt % 1000) == 0)
cout << "\r" << cnt << flush;
}
}
template<typename DNA_Scan>
{
public:
: m_parent_scanner(parent_scanner), m_bv_kmers(bv_kmers),
m_from(from), m_to(to), m_kmer_counts(kmer_counts)
{}
: m_parent_scanner(func.m_parent_scanner), m_bv_kmers(func.m_bv_kmers),
m_from(func.m_from), m_to(func.m_to), m_kmer_counts(func.m_kmer_counts)
{}
{
std::string kmer_str;
static std::mutex mtx_counts_lock;
for ( ;en.
valid(); ++en, ++cnt)
{
auto k_mer_code = *en;
if (k_mer_code > m_to)
break;
m_Agg.reset();
m_Agg.set_compute_count(true);
for (size_t i = 0; i < kmer_str.size(); ++i)
{
const bvector_type& bv_mask = m_parent_scanner.GetVector(kmer_str[i]);
m_Agg.add(&bv_mask);
}
m_Agg.combine_shift_right_and(bv_search_res);
{
std::lock_guard<std::mutex> guard(mtx_counts_lock);
m_kmer_counts.set(k_mer_code, unsigned(search_count));
assert(m_kmer_counts.in_sync());
}
}
}
private:
const DNA_Scan& m_parent_scanner;
typename DNA_Scan::aggregator_type m_Agg;
};
template<typename BV>
unsigned concurrency)
{
if (split_rank < concurrency || concurrency < 2)
{
return;
}
std::vector<std::future<void> > futures;
futures.reserve(concurrency);
for (size_t k = 0; k < pair_vect.size(); ++k)
{
futures.emplace_back(std::async(std::launch::async,
pair_vect[k].first,
pair_vect[k].second,
kmer_counts)));
}
for (auto& e : futures)
{
unsigned long long c_prev = 0;
while(1)
{
std::future_status status = e.wait_for(std::chrono::seconds(60));
if (status == std::future_status::ready)
break;
auto delta = c - c_prev;
c_prev = c;
auto remain_cnt = cnt - c;
auto remain_min = remain_cnt / delta;
cout << "\r" << c << ": progress per minute=" << delta;
if (remain_min < 120)
{
cout << " wait for " << remain_min << "m " << flush;
}
else
{
auto remain_h = remain_min / 60;
cout << " wait for " << remain_h << "h " << flush;
}
}
}
cout << endl;
}
static
{
auto en = bv_null->
first();
{
auto kmer_code = *en;
auto kmer_count = kmer_counts.
get(kmer_code);
auto mit = hmap.find(kmer_count);
if (mit == hmap.end())
hmap[kmer_count] = 1;
else
mit->second++;
}
}
static
{
ofstream outf;
outf.open(fname, ios::out | ios::trunc );
outf << "kmer count \t number of kmers\n";
auto it = hmap.rbegin(); auto it_end = hmap.rend();
for (; it != it_end; ++it)
{
outf << it->first << "\t" << it->second << endl;
}
}
template<typename BV>
unsigned percent,
unsigned k_size)
{
(void)k_size;
frequent_bv.clear();
if (!percent)
return;
BV bv_found;
auto total_kmers = bv_null->
count();
bm::id64_t target_f_count = (total_kmers * percent) / 100;
auto it = hmap.rbegin();
auto it_end = hmap.rend();
for (; it != it_end; ++it)
{
auto kmer_count = it->first;
scanner.
find_eq(kmer_counts, kmer_count, bv_found);
auto found_cnt = bv_found.count();
(void)found_cnt;
assert(found_cnt);
{
{
auto kmer_code = *en;
unsigned k_count = kmer_counts.
get(kmer_code);
if (k_count == 1)
continue;
if (it->second == 1)
{
assert(k_count == kmer_count);
}
frequent_bv.set(kmer_code);
if (kmer_count >= target_f_count)
{
frequent_bv.optimize();
return;
}
target_f_count -= 1;
}
}
}
frequent_bv.optimize();
}
int main(
int argc,
char *argv[])
{
try
{
if (ret != 0)
{
cerr << "cmd-line parse error. " << endl;
return ret;
}
{
if (res != 0)
return res;
std::cout << "FASTA sequence size=" << seq_vect.size() << std::endl;
}
if (seq_vect.size())
{
cout <<
"k-mer generation for k=" <<
ik_size << endl;
cout <<
"Found " << bv_kmers.
count() <<
" k-mers." << endl;
{
bm::print_bvector_stat(cout,bv_kmers);
size_t blob_size = bm::compute_serialization_size(bv_kmers);
cout << "DNA 2-bit coded FASTA size=" << seq_vect.size()/4 << endl;
cout << " Compressed size=" << blob_size << endl;
}
}
{
bm::SaveBVector(
ikd_name.c_str(), bv_kmers);
}
if (seq_vect.size())
{
}
if (seq_vect.size() &&
{
rsc_kmer_counts.sync();
rsc_kmer_counts2.sync();
cout << " Searching for k-mer counts..." << endl;
{
cout << " ... using bm::aggregator<>" << endl;
rsc_kmer_counts.optimize();
}
{
cout << " ... using std::sort() and count" << endl;
rsc_kmer_counts2.optimize();
{
if (res)
{
cerr << "Error: Count vector save failed!" << endl;
exit(1);
}
}
{
bool eq = rsc_kmer_counts.equal(rsc_kmer_counts2);
if(!eq)
{
rsc_kmer_counts2,
idx);
auto v1 = rsc_kmer_counts.get(idx);
auto v2 = rsc_kmer_counts2.get(idx);
cerr << "Mismatch at idx=" << idx << " v1=" << v1 << " v2=" << v2 << endl;
assert(found); (void)found;
assert(eq);
cerr << "Integrity check failed!" << endl;
exit(1);
}
}
}
{
}
{
}
{
}
{
}
cout << "Found frequent k-mers: " << bv_freq.count() << endl;
{
}
}
{
std::cout << std::endl << "Performance:" << std::endl;
}
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include).
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal).
pre-processor un-defines to avoid global space pollution (internal)
k-mer counting job functor class using bm::aggregator<>
DNA_Scan::bvector_type bvector_type
Counting_JobFunctor(const DNA_Scan &parent_scanner, const bvector_type &bv_kmers, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
bvector_type::size_type size_type
void operator()()
Main logic (functor).
Utility for keeping all DNA finger print vectors and search using various techniques.
Functor to process job batch (task).
void operator()()
Main logic (functor).
SortCounting_JobFunctor(const vector_char_type &seq_vect, unsigned k_size, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
bvector_type::size_type size_type
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_sub(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand SUB : this := bv1 MINUS bv2 SUBtraction is also known as AND NOT
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
bvector_size_type size_type
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Rank-Select compressed sparse vector.
sparse_vector_u32::size_type size_type
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
value_type get(size_type idx) const BMNOEXCEPT
get specified element without bounds checking
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values (or NULL).
sparse_vector_u32::bvector_type bvector_type
algorithms for sparse_vector scan/search
void find_eq(const SV &sv, value_type value, bvector_type &bv_out)
find all sparse vector elements EQ to search value
succinct sparse vector with runtime compression using bit-slicing / transposition method
@ BM_SORTED
input set is sorted (ascending order)
@ BM_GAP
GAP compression is ON.
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
unsigned long long int id64_t
bm::chrono_taker ::duration_map_type timing_map
bm::bvector ::size_type bv_size_type
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
static int parse_args(int argc, char *argv[])
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
std::vector< char > vector_char_type
char int2DNA(unsigned code)
Translate integer code to DNA letter.
DNA_FingerprintScanner< bm::bvector<> > dna_scanner_type
void validate_k_mer(const char *dna, size_t pos, unsigned k_size, bm::id64_t k_mer)
QA function to validate if reverse k-mer decode gives the same string.
void compute_frequent_kmers(BV &frequent_bv, const histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts, unsigned percent, unsigned k_size)
Create vector, representing subset of k-mers of high frequency.
void generate_k_mer_bvector(BV &bv, const vector_char_type &seq_vect, unsigned k_size, bool check)
This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector...
void translate_kmer(std::string &dna, bm::id64_t kmer_code, unsigned k_size)
Translate k-mer code into ATGC DNA string.
static void report_hmap(const string &fname, const histogram_map_u32 &hmap)
Save TSV report of k-mer frequences (reverse sorted, most frequent k-mers first).
std::map< unsigned, unsigned > histogram_map_u32
static void compute_kmer_histogram(histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts)
Compute a map of how often each k-mer frequency is observed in the k-mer counts vector.
void count_kmers(const vector_char_type &seq_vect, unsigned k_size, rsc_sparse_vector_u32 &kmer_counts)
k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts
dna_scanner_type dna_scanner
std::string ikd_freq_name
std::string ikd_counts_name
bool get_kmer_code(const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
Calculate k-mer as an unsigned long integer.
std::vector< char > vector_char_type
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
void sort_unique(VECT &vect)
Auxiliary function to do sort+unique on a vactor of ints removes duplicate elements.
void count_kmers_parallel(const BV &bv_kmers, const vector_char_type &seq_vect, rsc_sparse_vector_u32 &kmer_counts, unsigned k_size, unsigned concurrency)
MT k-mer counting.
void sort_count(VECT &vect, COUNT_VECT &cvect)
Auxiliary function to do sort+unique on a vactor of ints and save results in a counts vector.
bool get_DNA_code(char bp, bm::id64_t &dna_code)
std::atomic_ullong k_mer_progress_count(0)