.. _program_listing_file_src_cpp_factorizer.cpp: Program Listing for File factorizer.cpp ======================================= |exhale_lsh| :ref:`Return to documentation for file ` (``src/cpp/factorizer.cpp``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #include "factorizer.hpp" #include #include #include #include #include #include #include namespace noLZSS { using cst_t = sdsl::cst_sada<>; static size_t lcp(cst_t& cst, size_t i, size_t j) { if (i == j) return cst.csa.size() - cst.csa[i]; auto lca = cst.lca(cst.select_leaf(cst.csa.isa[i]+1), cst.select_leaf(cst.csa.isa[j]+1)); return cst.depth(lca); } static cst_t::node_type next_leaf(cst_t& cst, cst_t::node_type lambda, size_t iterations = 1) { assert(cst.is_leaf(lambda)); auto lambda_rank = cst.lb(lambda); for (size_t i = 0; i < iterations; i++) lambda_rank = cst.csa.psi[lambda_rank]; return cst.select_leaf(lambda_rank + 1); } // ---------- genomic utilities ---------- char complement(char c) { switch (c) { case 'A': return 'T'; case 'C': return 'G'; case 'G': return 'C'; case 'T': return 'A'; default: // Handle invalid input, e.g., throw an exception or return a sentinel value throw std::invalid_argument("Invalid nucleotide: " + std::string(1, c)); } } static std::string revcomp(std::string_view s) { std::string r; r.resize(s.size()); for (size_t i = 0, n = s.size(); i < n; ++i) r[i] = complement(s[n-1-i]); return r; } PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector& sequences) { if (sequences.empty()) { return {"", 0, {}}; } // Check if we have too many sequences if (sequences.size() > 125) { throw std::invalid_argument("Too many sequences: maximum 125 sequences supported (due to sentinel character limitations)"); } // Validate sequences contain only valid DNA nucleotides for (size_t i = 0; i < sequences.size(); ++i) { for (char c : sequences[i]) { if (c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'a' && c != 'c' && c != 'g' && c != 't') { throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) + "' found in sequence " + std::to_string(i)); } } } PreparedSequenceResult result; // Calculate total size for reservation size_t total_size = 0; for (const auto& seq : sequences) { total_size += seq.length() * 2; // Original + reverse complement } total_size += sequences.size() * 2; // Add space for sentinels result.prepared_string.reserve(total_size); // Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84) auto get_sentinel = [](size_t index) -> char { char sentinel = 1; size_t count = 0; while (true) { // Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides) if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') { if (count == index) { return sentinel; // Found the sentinel for this index } count++; } sentinel++; if (sentinel == 0) sentinel = 1; // wrap around, skip 0 } }; // First, add original sequences with sentinels for (size_t i = 0; i < sequences.size(); ++i) { // Convert to uppercase std::string seq = sequences[i]; for (char& c : seq) { if (c >= 'a' && c <= 'z') c = c - 'a' + 'A'; } result.prepared_string += seq; // Add sentinel and track its position size_t sentinel_pos = result.prepared_string.length(); char sentinel = get_sentinel(i); result.prepared_string += sentinel; result.sentinel_positions.push_back(sentinel_pos); } result.original_length = result.prepared_string.length(); // Then, add reverse complements with different sentinels for (int i = static_cast(sequences.size()) - 1; i >= 0; --i) { // Convert to uppercase first std::string seq = sequences[i]; for (char& c : seq) { if (c >= 'a' && c <= 'z') c = c - 'a' + 'A'; } // Add reverse complement std::string rc = revcomp(seq); result.prepared_string += rc; // Add sentinel and track its position (offset by sequences.size() to make them unique) size_t sentinel_pos = result.prepared_string.length(); char sentinel = get_sentinel(sequences.size() + (sequences.size() - 1 - i)); result.prepared_string += sentinel; result.sentinel_positions.push_back(sentinel_pos); } return result; } PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector& sequences) { if (sequences.empty()) { return {"", 0, {}}; } // Check if we have too many sequences if (sequences.size() > 250) { throw std::invalid_argument("Too many sequences: maximum 250 sequences supported (due to sentinel character limitations)"); } // Validate sequences contain only valid DNA nucleotides for (size_t i = 0; i < sequences.size(); ++i) { for (char c : sequences[i]) { if (c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'a' && c != 'c' && c != 'g' && c != 't') { throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) + "' found in sequence " + std::to_string(i)); } } } PreparedSequenceResult result; // Calculate total size for reservation size_t total_size = 0; for (const auto& seq : sequences) { total_size += seq.length(); } total_size += sequences.size(); // Add space for sentinels result.prepared_string.reserve(total_size); // Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84) auto get_sentinel = [](size_t index) -> char { char sentinel = 1; size_t count = 0; while (true) { // Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides) if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') { if (count == index) { return sentinel; // Found the sentinel for this index } count++; } sentinel++; if (sentinel == 0) sentinel = 1; // wrap around, skip 0 } }; // Add original sequences with sentinels for (size_t i = 0; i < sequences.size(); ++i) { // Convert to uppercase std::string seq = sequences[i]; for (char& c : seq) { if (c >= 'a' && c <= 'z') c = c - 'a' + 'A'; } result.prepared_string += seq; // Add sentinel and track its position (only between sequences, not after the last one) if (i < sequences.size() - 1) { size_t sentinel_pos = result.prepared_string.length(); char sentinel = get_sentinel(i); result.prepared_string += sentinel; result.sentinel_positions.push_back(sentinel_pos); } } result.original_length = result.prepared_string.length(); return result; } // ---------- generic, sink-driven noLZSS ---------- template static size_t nolzss(cst_t& cst, Sink&& sink) { sdsl::rmq_succinct_sct<> rmq(&cst.csa); const size_t str_len = cst.size() - 1; // the length of the string is the size of the CST minus the sentinel auto lambda = cst.select_leaf(cst.csa.isa[0] + 1); size_t lambda_node_depth = cst.node_depth(lambda); size_t lambda_sufnum = 0; cst_t::node_type v; size_t v_min_leaf_sufnum = 0; size_t u_min_leaf_sufnum = 0; size_t count = 0; while (lambda_sufnum < str_len) { // Compute current factor size_t d = 1; size_t l = 1; while (true) { v = cst.bp_support.level_anc(lambda, lambda_node_depth - d); v_min_leaf_sufnum = cst.csa[rmq(cst.lb(v), cst.rb(v))]; l = cst.depth(v); if (v_min_leaf_sufnum + l - 1 < lambda_sufnum) { u_min_leaf_sufnum = v_min_leaf_sufnum; ++d; continue; } auto u = cst.parent(v); auto u_depth = cst.depth(u); if (v_min_leaf_sufnum == lambda_sufnum) { if (u == cst.root()) { l = 1; Factor factor{static_cast(lambda_sufnum), static_cast(l), static_cast(lambda_sufnum)}; sink(factor); break; } else { l = u_depth; Factor factor{static_cast(lambda_sufnum), static_cast(l), static_cast(u_min_leaf_sufnum)}; sink(factor); break; } } l = std::min(lcp(cst, lambda_sufnum, v_min_leaf_sufnum), (lambda_sufnum - v_min_leaf_sufnum)); if (l <= u_depth) { l = u_depth; Factor factor{static_cast(lambda_sufnum), static_cast(l), static_cast(u_min_leaf_sufnum)}; sink(factor); break; } else { Factor factor{static_cast(lambda_sufnum), static_cast(l), static_cast(v_min_leaf_sufnum)}; sink(factor); break; } } ++count; // Advance to next position lambda = next_leaf(cst, lambda, l); lambda_node_depth = cst.node_depth(lambda); lambda_sufnum = cst.sn(lambda); } return count; } template static size_t nolzss_dna_w_rc(const std::string& T, Sink&& sink) { const size_t n = T.size(); if (n == 0) return 0; // Use prepare_multiple_dna_sequences_w_rc with a single sequence PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc({T}); std::string S = prep_result.prepared_string; size_t original_length = prep_result.original_length; // Use the multiple sequence algorithm return nolzss_multiple_dna_w_rc(S, std::forward(sink)); } template static size_t nolzss_multiple_dna_w_rc(const std::string& S, Sink&& sink) { const size_t N = (S.size() / 2) - 1; if (N == 0) return 0; // Build CST over S cst_t cst; construct_im(cst, S, 1); // Build RMQ inputs aligned to SA: forward starts and RC ends (in T-coords) const uint64_t INF = std::numeric_limits::max()/2ULL; sdsl::int_vector<64> fwd_starts(cst.csa.size(), INF); sdsl::int_vector<64> rc_ends (cst.csa.size(), INF); const size_t T_beg = 0; const size_t T_end = N; // end of original const size_t R_beg = N; // first char of rc const size_t R_end = S.size(); // end of S for (size_t k = 0; k < cst.csa.size(); ++k) { size_t posS = cst.csa[k]; if (posS < T_end) { // suffix starts in T fwd_starts[k] = posS; // 0-based start in T } else if (posS >= R_beg && posS < R_end) { // suffix starts in R size_t jR0 = posS - R_beg; // 0-based start in R size_t endT0 = N - jR0 - 1; // mapped end in T (0-based) rc_ends[k] = endT0; } } sdsl::rmq_succinct_sct<> rmqF(&fwd_starts); sdsl::rmq_succinct_sct<> rmqRcEnd(&rc_ends); // Initialize to the leaf of suffix starting at S position 0 (i.e., T[0]) auto lambda = cst.select_leaf(cst.csa.isa[0] + 1); size_t lambda_node_depth = cst.node_depth(lambda); size_t i = cst.sn(lambda); // suffix start in S, begins at 0 size_t factors = 0; while (i < N) { // only factorize inside T // At factor start i (0-based in T), walk up ancestors and pick best candidate size_t best_len_depth = 0; // best candidate's depth (proxy for length) bool best_is_rc = false; size_t best_fwd_start = 0; // start in T (for FWD) size_t best_rc_end = 0; // end in T (for RC) size_t best_rc_posS = 0; // pos in S where RC candidate suffix starts (for LCP) // Walk from leaf to root via level_anc for (size_t step = 1; step <= lambda_node_depth; ++step) { auto v = cst.bp_support.level_anc(lambda, lambda_node_depth - step); size_t ell = cst.depth(v); if (ell == 0) break; // reached root auto lb = cst.lb(v), rb = cst.rb(v); // Forward candidate (min start in T within v's interval) size_t kF = rmqF(lb, rb); uint64_t jF = fwd_starts[kF]; bool okF = (jF != INF) && (jF + ell - 1 < i); // non-overlap: endF <= i-1 // RC candidate (min END in T within v's interval; monotone with depth) size_t kR = rmqRcEnd(lb, rb); uint64_t endRC = rc_ends[kR]; bool okR = (endRC != INF) && (endRC < i); // endRC <= i-1 if (!okF && !okR) { // deeper nodes can only increase jF and the minimal RC end // -> non-overlap won't become true again for either; stop break; } // Choose the better of the valid candidates at this depth if (okF) { if (ell > best_len_depth || (ell == best_len_depth && !best_is_rc && (jF + ell - 1) < (best_fwd_start + best_len_depth - 1))) { best_len_depth = ell; best_is_rc = false; best_fwd_start = jF; } } if (okR) { size_t posS_R = cst.csa[kR]; // suffix position in S for LCP if (ell > best_len_depth || (ell == best_len_depth && (best_is_rc ? (endRC < best_rc_end) : true))) { best_len_depth = ell; best_is_rc = true; best_rc_end = endRC; best_rc_posS = posS_R; } } } size_t emit_len = 1; uint64_t emit_ref = i; // default for literal if (best_len_depth == 0) { // No previous occurrence (FWD nor RC) — literal of length 1 Factor f{static_cast(i), static_cast(emit_len), static_cast(emit_ref)}; sink(f); ++factors; // Advance lambda = next_leaf(cst, lambda, emit_len); lambda_node_depth = cst.node_depth(lambda); i = cst.sn(lambda); continue; } if (!best_is_rc) { // Finalize FWD with true LCP and non-overlap cap size_t cap = i - best_fwd_start; // i-1 - (best_fwd_start) + 1 size_t L = lcp(cst, i, best_fwd_start); emit_len = std::min(L, cap); emit_ref = static_cast(best_fwd_start); } else { // Finalize RC with true LCP (against suffix in R) size_t L = lcp(cst, i, best_rc_posS); emit_len = L; emit_ref = RC_MASK | static_cast(best_rc_end); // end-anchored + RC flag } // Safety: ensure progress if (emit_len <= 0) { throw std::runtime_error("emit_len must be positive to ensure factorization progress"); } Factor f{static_cast(i), static_cast(emit_len), emit_ref}; sink(f); ++factors; // Advance to next phrase start lambda = next_leaf(cst, lambda, emit_len); lambda_node_depth = cst.node_depth(lambda); i = cst.sn(lambda); } return factors; } // ------------- public wrappers ------------- template size_t factorize_stream(std::string_view text, Sink&& sink) { std::string tmp(text); cst_t cst; construct_im(cst, tmp, 1); return nolzss(cst, std::forward(sink)); } template size_t factorize_file_stream(const std::string& path, Sink&& sink) { // sdsl-lite will automatically add the sentinel when needed cst_t cst; construct(cst, path, 1); return nolzss(cst, std::forward(sink)); } size_t count_factors(std::string_view text) { size_t n = 0; factorize_stream(text, [&](const Factor&){ ++n; }); return n; } size_t count_factors_file(const std::string& path) { size_t n = 0; factorize_file_stream(path, [&](const Factor&){ ++n; }); return n; } std::vector factorize(std::string_view text) { std::vector out; factorize_stream(text, [&](const Factor& f){ out.push_back(f); }); return out; } std::vector factorize_file(const std::string& path, size_t reserve_hint) { std::vector out; if (reserve_hint) out.reserve(reserve_hint); factorize_file_stream(path, [&](const Factor& f){ out.push_back(f); }); return out; } size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path) { // Set up binary output file with buffering std::ofstream os(out_path, std::ios::binary); if (!os) { throw std::runtime_error("Cannot create output file: " + out_path); } std::vector buf(1<<20); // 1 MB buffer for performance os.rdbuf()->pubsetbuf(buf.data(), static_cast(buf.size())); // Collect factors in memory to write header std::vector factors; size_t n = factorize_file_stream(in_path, [&](const Factor& f){ factors.push_back(f); }); // For non-FASTA files, create minimal header with no sequence names or sentinels FactorFileHeader header; header.num_factors = factors.size(); header.num_sequences = 0; // Unknown for raw text files header.num_sentinels = 0; // No sentinels for raw text files header.header_size = sizeof(FactorFileHeader); // Write header os.write(reinterpret_cast(&header), sizeof(header)); // Write factors for (const Factor& f : factors) { os.write(reinterpret_cast(&f), sizeof(Factor)); } return n; } template size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink) { std::string tmp(text); return nolzss_dna_w_rc(tmp, std::forward(sink)); } template size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink) { std::ifstream is(path, std::ios::binary); if (!is) throw std::runtime_error("Cannot open input file: " + path); std::string data((std::istreambuf_iterator(is)), std::istreambuf_iterator()); return nolzss_dna_w_rc(data, std::forward(sink)); } std::vector factorize_dna_w_rc(std::string_view text) { std::vector out; factorize_stream_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); }); return out; } std::vector factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint) { std::vector out; if (reserve_hint) out.reserve(reserve_hint); factorize_file_stream_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); }); return out; } size_t count_factors_dna_w_rc(std::string_view text) { size_t n = 0; factorize_stream_dna_w_rc(text, [&](const Factor&){ ++n; }); return n; } size_t count_factors_file_dna_w_rc(const std::string& path) { size_t n = 0; factorize_file_stream_dna_w_rc(path, [&](const Factor&){ ++n; }); return n; } size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path) { // Set up binary output file with buffering std::ofstream os(out_path, std::ios::binary); if (!os) { throw std::runtime_error("Cannot create output file: " + out_path); } std::vector buf(1<<20); // 1 MB buffer for performance os.rdbuf()->pubsetbuf(buf.data(), static_cast(buf.size())); // Collect factors in memory to write header std::vector factors; size_t n = factorize_file_stream_dna_w_rc(in_path, [&](const Factor& f){ factors.push_back(f); }); // For single DNA sequence files, create minimal header FactorFileHeader header; header.num_factors = factors.size(); header.num_sequences = 1; // Single DNA sequence header.num_sentinels = 0; // No sentinels for single sequence header.header_size = sizeof(FactorFileHeader) + 1; // Empty sequence name // Write header os.write(reinterpret_cast(&header), sizeof(header)); // Write empty sequence name (single null terminator) os.write("\0", 1); // Write factors for (const Factor& f : factors) { os.write(reinterpret_cast(&f), sizeof(Factor)); } return n; } template size_t factorize_stream_multiple_dna_w_rc(std::string_view text, Sink&& sink) { std::string tmp(text); return nolzss_multiple_dna_w_rc(tmp, std::forward(sink)); } template size_t factorize_file_stream_multiple_dna_w_rc(const std::string& path, Sink&& sink) { std::ifstream is(path, std::ios::binary); if (!is) throw std::runtime_error("Cannot open input file: " + path); std::string data((std::istreambuf_iterator(is)), std::istreambuf_iterator()); return nolzss_multiple_dna_w_rc(data, std::forward(sink)); } std::vector factorize_multiple_dna_w_rc(std::string_view text) { std::vector out; factorize_stream_multiple_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); }); return out; } std::vector factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint) { std::vector out; if (reserve_hint) out.reserve(reserve_hint); factorize_file_stream_multiple_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); }); return out; } size_t count_factors_multiple_dna_w_rc(std::string_view text) { size_t n = 0; factorize_stream_multiple_dna_w_rc(text, [&](const Factor&){ ++n; }); return n; } size_t count_factors_file_multiple_dna_w_rc(const std::string& path) { size_t n = 0; factorize_file_stream_multiple_dna_w_rc(path, [&](const Factor&){ ++n; }); return n; } size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path) { // Set up binary output file with buffering std::ofstream os(out_path, std::ios::binary); if (!os) { throw std::runtime_error("Cannot create output file: " + out_path); } std::vector buf(1<<20); // 1 MB buffer for performance os.rdbuf()->pubsetbuf(buf.data(), static_cast(buf.size())); // Collect factors in memory to write header std::vector factors; size_t n = factorize_file_stream_multiple_dna_w_rc(in_path, [&](const Factor& f){ factors.push_back(f); }); // For multiple DNA sequences from text file, we don't know sequence names or sentinels FactorFileHeader header; header.num_factors = factors.size(); header.num_sequences = 0; // Unknown for raw text files with multiple sequences header.num_sentinels = 0; // Cannot identify sentinels without preparation function header.header_size = sizeof(FactorFileHeader); // Write header os.write(reinterpret_cast(&header), sizeof(header)); // Write factors for (const Factor& f : factors) { os.write(reinterpret_cast(&f), sizeof(Factor)); } return n; } } // namespace noLZSS