Program Listing for File factorizer.cpp
↰ Return to documentation for file (src/cpp/factorizer.cpp
)
#include "factorizer.hpp"
#include <sdsl/suffix_trees.hpp>
#include <sdsl/rmq_succinct_sct.hpp>
#include <cassert>
#include <fstream>
#include <iostream>
#include <optional>
#include <limits>
namespace noLZSS {
using cst_t = sdsl::cst_sada<>;
static size_t lcp(cst_t& cst, size_t i, size_t j) {
if (i == j) return cst.csa.size() - cst.csa[i];
auto lca = cst.lca(cst.select_leaf(cst.csa.isa[i]+1), cst.select_leaf(cst.csa.isa[j]+1));
return cst.depth(lca);
}
static cst_t::node_type next_leaf(cst_t& cst, cst_t::node_type lambda, size_t iterations = 1) {
assert(cst.is_leaf(lambda));
auto lambda_rank = cst.lb(lambda);
for (size_t i = 0; i < iterations; i++) lambda_rank = cst.csa.psi[lambda_rank];
return cst.select_leaf(lambda_rank + 1);
}
// ---------- genomic utilities ----------
char complement(char c) {
switch (c) {
case 'A': return 'T';
case 'C': return 'G';
case 'G': return 'C';
case 'T': return 'A';
default:
// Handle invalid input, e.g., throw an exception or return a sentinel value
throw std::invalid_argument("Invalid nucleotide: " + std::string(1, c));
}
}
static std::string revcomp(std::string_view s) {
std::string r; r.resize(s.size());
for (size_t i = 0, n = s.size(); i < n; ++i) r[i] = complement(s[n-1-i]);
return r;
}
PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences) {
if (sequences.empty()) {
return {"", 0, {}};
}
// Check if we have too many sequences
if (sequences.size() > 125) {
throw std::invalid_argument("Too many sequences: maximum 125 sequences supported (due to sentinel character limitations)");
}
// Validate sequences contain only valid DNA nucleotides
for (size_t i = 0; i < sequences.size(); ++i) {
for (char c : sequences[i]) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
c != 'a' && c != 'c' && c != 'g' && c != 't') {
throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
"' found in sequence " + std::to_string(i));
}
}
}
PreparedSequenceResult result;
// Calculate total size for reservation
size_t total_size = 0;
for (const auto& seq : sequences) {
total_size += seq.length() * 2; // Original + reverse complement
}
total_size += sequences.size() * 2; // Add space for sentinels
result.prepared_string.reserve(total_size);
// Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
auto get_sentinel = [](size_t index) -> char {
char sentinel = 1;
size_t count = 0;
while (true) {
// Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
if (count == index) {
return sentinel; // Found the sentinel for this index
}
count++;
}
sentinel++;
if (sentinel == 0) sentinel = 1; // wrap around, skip 0
}
};
// First, add original sequences with sentinels
for (size_t i = 0; i < sequences.size(); ++i) {
// Convert to uppercase
std::string seq = sequences[i];
for (char& c : seq) {
if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
}
result.prepared_string += seq;
// Add sentinel and track its position
size_t sentinel_pos = result.prepared_string.length();
char sentinel = get_sentinel(i);
result.prepared_string += sentinel;
result.sentinel_positions.push_back(sentinel_pos);
}
result.original_length = result.prepared_string.length();
// Then, add reverse complements with different sentinels
for (int i = static_cast<int>(sequences.size()) - 1; i >= 0; --i) {
// Convert to uppercase first
std::string seq = sequences[i];
for (char& c : seq) {
if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
}
// Add reverse complement
std::string rc = revcomp(seq);
result.prepared_string += rc;
// Add sentinel and track its position (offset by sequences.size() to make them unique)
size_t sentinel_pos = result.prepared_string.length();
char sentinel = get_sentinel(sequences.size() + (sequences.size() - 1 - i));
result.prepared_string += sentinel;
result.sentinel_positions.push_back(sentinel_pos);
}
return result;
}
PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences) {
if (sequences.empty()) {
return {"", 0, {}};
}
// Check if we have too many sequences
if (sequences.size() > 250) {
throw std::invalid_argument("Too many sequences: maximum 250 sequences supported (due to sentinel character limitations)");
}
// Validate sequences contain only valid DNA nucleotides
for (size_t i = 0; i < sequences.size(); ++i) {
for (char c : sequences[i]) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
c != 'a' && c != 'c' && c != 'g' && c != 't') {
throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
"' found in sequence " + std::to_string(i));
}
}
}
PreparedSequenceResult result;
// Calculate total size for reservation
size_t total_size = 0;
for (const auto& seq : sequences) {
total_size += seq.length();
}
total_size += sequences.size(); // Add space for sentinels
result.prepared_string.reserve(total_size);
// Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
auto get_sentinel = [](size_t index) -> char {
char sentinel = 1;
size_t count = 0;
while (true) {
// Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
if (count == index) {
return sentinel; // Found the sentinel for this index
}
count++;
}
sentinel++;
if (sentinel == 0) sentinel = 1; // wrap around, skip 0
}
};
// Add original sequences with sentinels
for (size_t i = 0; i < sequences.size(); ++i) {
// Convert to uppercase
std::string seq = sequences[i];
for (char& c : seq) {
if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
}
result.prepared_string += seq;
// Add sentinel and track its position (only between sequences, not after the last one)
if (i < sequences.size() - 1) {
size_t sentinel_pos = result.prepared_string.length();
char sentinel = get_sentinel(i);
result.prepared_string += sentinel;
result.sentinel_positions.push_back(sentinel_pos);
}
}
result.original_length = result.prepared_string.length();
return result;
}
// ---------- generic, sink-driven noLZSS ----------
template<class Sink>
static size_t nolzss(cst_t& cst, Sink&& sink) {
sdsl::rmq_succinct_sct<> rmq(&cst.csa);
const size_t str_len = cst.size() - 1; // the length of the string is the size of the CST minus the sentinel
auto lambda = cst.select_leaf(cst.csa.isa[0] + 1);
size_t lambda_node_depth = cst.node_depth(lambda);
size_t lambda_sufnum = 0;
cst_t::node_type v;
size_t v_min_leaf_sufnum = 0;
size_t u_min_leaf_sufnum = 0;
size_t count = 0;
while (lambda_sufnum < str_len) {
// Compute current factor
size_t d = 1;
size_t l = 1;
while (true) {
v = cst.bp_support.level_anc(lambda, lambda_node_depth - d);
v_min_leaf_sufnum = cst.csa[rmq(cst.lb(v), cst.rb(v))];
l = cst.depth(v);
if (v_min_leaf_sufnum + l - 1 < lambda_sufnum) {
u_min_leaf_sufnum = v_min_leaf_sufnum;
++d; continue;
}
auto u = cst.parent(v);
auto u_depth = cst.depth(u);
if (v_min_leaf_sufnum == lambda_sufnum) {
if (u == cst.root()) {
l = 1;
Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(lambda_sufnum)};
sink(factor);
break;
}
else {
l = u_depth;
Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(u_min_leaf_sufnum)};
sink(factor);
break;
}
}
l = std::min(lcp(cst, lambda_sufnum, v_min_leaf_sufnum),
(lambda_sufnum - v_min_leaf_sufnum));
if (l <= u_depth) {
l = u_depth;
Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(u_min_leaf_sufnum)};
sink(factor);
break;
}
else {
Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(v_min_leaf_sufnum)};
sink(factor);
break;
}
}
++count;
// Advance to next position
lambda = next_leaf(cst, lambda, l);
lambda_node_depth = cst.node_depth(lambda);
lambda_sufnum = cst.sn(lambda);
}
return count;
}
template<class Sink>
static size_t nolzss_dna_w_rc(const std::string& T, Sink&& sink) {
const size_t n = T.size();
if (n == 0) return 0;
// Use prepare_multiple_dna_sequences_w_rc with a single sequence
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc({T});
std::string S = prep_result.prepared_string;
size_t original_length = prep_result.original_length;
// Use the multiple sequence algorithm
return nolzss_multiple_dna_w_rc(S, std::forward<Sink>(sink));
}
template<class Sink>
static size_t nolzss_multiple_dna_w_rc(const std::string& S, Sink&& sink) {
const size_t N = (S.size() / 2) - 1;
if (N == 0) return 0;
// Build CST over S
cst_t cst; construct_im(cst, S, 1);
// Build RMQ inputs aligned to SA: forward starts and RC ends (in T-coords)
const uint64_t INF = std::numeric_limits<uint64_t>::max()/2ULL;
sdsl::int_vector<64> fwd_starts(cst.csa.size(), INF);
sdsl::int_vector<64> rc_ends (cst.csa.size(), INF);
const size_t T_beg = 0;
const size_t T_end = N; // end of original
const size_t R_beg = N; // first char of rc
const size_t R_end = S.size(); // end of S
for (size_t k = 0; k < cst.csa.size(); ++k) {
size_t posS = cst.csa[k];
if (posS < T_end) {
// suffix starts in T
fwd_starts[k] = posS; // 0-based start in T
} else if (posS >= R_beg && posS < R_end) {
// suffix starts in R
size_t jR0 = posS - R_beg; // 0-based start in R
size_t endT0 = N - jR0 - 1; // mapped end in T (0-based)
rc_ends[k] = endT0;
}
}
sdsl::rmq_succinct_sct<> rmqF(&fwd_starts);
sdsl::rmq_succinct_sct<> rmqRcEnd(&rc_ends);
// Initialize to the leaf of suffix starting at S position 0 (i.e., T[0])
auto lambda = cst.select_leaf(cst.csa.isa[0] + 1);
size_t lambda_node_depth = cst.node_depth(lambda);
size_t i = cst.sn(lambda); // suffix start in S, begins at 0
size_t factors = 0;
while (i < N) { // only factorize inside T
// At factor start i (0-based in T), walk up ancestors and pick best candidate
size_t best_len_depth = 0; // best candidate's depth (proxy for length)
bool best_is_rc = false;
size_t best_fwd_start = 0; // start in T (for FWD)
size_t best_rc_end = 0; // end in T (for RC)
size_t best_rc_posS = 0; // pos in S where RC candidate suffix starts (for LCP)
// Walk from leaf to root via level_anc
for (size_t step = 1; step <= lambda_node_depth; ++step) {
auto v = cst.bp_support.level_anc(lambda, lambda_node_depth - step);
size_t ell = cst.depth(v);
if (ell == 0) break; // reached root
auto lb = cst.lb(v), rb = cst.rb(v);
// Forward candidate (min start in T within v's interval)
size_t kF = rmqF(lb, rb);
uint64_t jF = fwd_starts[kF];
bool okF = (jF != INF) && (jF + ell - 1 < i); // non-overlap: endF <= i-1
// RC candidate (min END in T within v's interval; monotone with depth)
size_t kR = rmqRcEnd(lb, rb);
uint64_t endRC = rc_ends[kR];
bool okR = (endRC != INF) && (endRC < i); // endRC <= i-1
if (!okF && !okR) {
// deeper nodes can only increase jF and the minimal RC end
// -> non-overlap won't become true again for either; stop
break;
}
// Choose the better of the valid candidates at this depth
if (okF) {
if (ell > best_len_depth ||
(ell == best_len_depth && !best_is_rc && (jF + ell - 1) < (best_fwd_start + best_len_depth - 1))) {
best_len_depth = ell;
best_is_rc = false;
best_fwd_start = jF;
}
}
if (okR) {
size_t posS_R = cst.csa[kR]; // suffix position in S for LCP
if (ell > best_len_depth ||
(ell == best_len_depth && (best_is_rc ? (endRC < best_rc_end) : true))) {
best_len_depth = ell;
best_is_rc = true;
best_rc_end = endRC;
best_rc_posS = posS_R;
}
}
}
size_t emit_len = 1;
uint64_t emit_ref = i; // default for literal
if (best_len_depth == 0) {
// No previous occurrence (FWD nor RC) — literal of length 1
Factor f{static_cast<uint64_t>(i), static_cast<uint64_t>(emit_len), static_cast<uint64_t>(emit_ref)};
sink(f);
++factors;
// Advance
lambda = next_leaf(cst, lambda, emit_len);
lambda_node_depth = cst.node_depth(lambda);
i = cst.sn(lambda);
continue;
}
if (!best_is_rc) {
// Finalize FWD with true LCP and non-overlap cap
size_t cap = i - best_fwd_start; // i-1 - (best_fwd_start) + 1
size_t L = lcp(cst, i, best_fwd_start);
emit_len = std::min(L, cap);
emit_ref = static_cast<uint64_t>(best_fwd_start);
} else {
// Finalize RC with true LCP (against suffix in R)
size_t L = lcp(cst, i, best_rc_posS);
emit_len = L;
emit_ref = RC_MASK | static_cast<uint64_t>(best_rc_end); // end-anchored + RC flag
}
// Safety: ensure progress
if (emit_len <= 0) {
throw std::runtime_error("emit_len must be positive to ensure factorization progress");
}
Factor f{static_cast<uint64_t>(i), static_cast<uint64_t>(emit_len), emit_ref};
sink(f);
++factors;
// Advance to next phrase start
lambda = next_leaf(cst, lambda, emit_len);
lambda_node_depth = cst.node_depth(lambda);
i = cst.sn(lambda);
}
return factors;
}
// ------------- public wrappers -------------
template<class Sink>
size_t factorize_stream(std::string_view text, Sink&& sink) {
std::string tmp(text);
cst_t cst; construct_im(cst, tmp, 1);
return nolzss(cst, std::forward<Sink>(sink));
}
template<class Sink>
size_t factorize_file_stream(const std::string& path, Sink&& sink) {
// sdsl-lite will automatically add the sentinel when needed
cst_t cst; construct(cst, path, 1);
return nolzss(cst, std::forward<Sink>(sink));
}
size_t count_factors(std::string_view text) {
size_t n = 0;
factorize_stream(text, [&](const Factor&){ ++n; });
return n;
}
size_t count_factors_file(const std::string& path) {
size_t n = 0;
factorize_file_stream(path, [&](const Factor&){ ++n; });
return n;
}
std::vector<Factor> factorize(std::string_view text) {
std::vector<Factor> out;
factorize_stream(text, [&](const Factor& f){ out.push_back(f); });
return out;
}
std::vector<Factor> factorize_file(const std::string& path, size_t reserve_hint) {
std::vector<Factor> out;
if (reserve_hint) out.reserve(reserve_hint);
factorize_file_stream(path, [&](const Factor& f){ out.push_back(f); });
return out;
}
size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path) {
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Collect factors in memory to write header
std::vector<Factor> factors;
size_t n = factorize_file_stream(in_path, [&](const Factor& f){
factors.push_back(f);
});
// For non-FASTA files, create minimal header with no sequence names or sentinels
FactorFileHeader header;
header.num_factors = factors.size();
header.num_sequences = 0; // Unknown for raw text files
header.num_sentinels = 0; // No sentinels for raw text files
header.header_size = sizeof(FactorFileHeader);
// Write header
os.write(reinterpret_cast<const char*>(&header), sizeof(header));
// Write factors
for (const Factor& f : factors) {
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
}
return n;
}
template<class Sink>
size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink) {
std::string tmp(text);
return nolzss_dna_w_rc(tmp, std::forward<Sink>(sink));
}
template<class Sink>
size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink) {
std::ifstream is(path, std::ios::binary);
if (!is) throw std::runtime_error("Cannot open input file: " + path);
std::string data((std::istreambuf_iterator<char>(is)), std::istreambuf_iterator<char>());
return nolzss_dna_w_rc(data, std::forward<Sink>(sink));
}
std::vector<Factor> factorize_dna_w_rc(std::string_view text) {
std::vector<Factor> out;
factorize_stream_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); });
return out;
}
std::vector<Factor> factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint) {
std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
factorize_file_stream_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); });
return out;
}
size_t count_factors_dna_w_rc(std::string_view text) {
size_t n = 0; factorize_stream_dna_w_rc(text, [&](const Factor&){ ++n; }); return n;
}
size_t count_factors_file_dna_w_rc(const std::string& path) {
size_t n = 0; factorize_file_stream_dna_w_rc(path, [&](const Factor&){ ++n; }); return n;
}
size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path) {
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Collect factors in memory to write header
std::vector<Factor> factors;
size_t n = factorize_file_stream_dna_w_rc(in_path, [&](const Factor& f){
factors.push_back(f);
});
// For single DNA sequence files, create minimal header
FactorFileHeader header;
header.num_factors = factors.size();
header.num_sequences = 1; // Single DNA sequence
header.num_sentinels = 0; // No sentinels for single sequence
header.header_size = sizeof(FactorFileHeader) + 1; // Empty sequence name
// Write header
os.write(reinterpret_cast<const char*>(&header), sizeof(header));
// Write empty sequence name (single null terminator)
os.write("\0", 1);
// Write factors
for (const Factor& f : factors) {
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
}
return n;
}
template<class Sink>
size_t factorize_stream_multiple_dna_w_rc(std::string_view text, Sink&& sink) {
std::string tmp(text);
return nolzss_multiple_dna_w_rc(tmp, std::forward<Sink>(sink));
}
template<class Sink>
size_t factorize_file_stream_multiple_dna_w_rc(const std::string& path, Sink&& sink) {
std::ifstream is(path, std::ios::binary);
if (!is) throw std::runtime_error("Cannot open input file: " + path);
std::string data((std::istreambuf_iterator<char>(is)), std::istreambuf_iterator<char>());
return nolzss_multiple_dna_w_rc(data, std::forward<Sink>(sink));
}
std::vector<Factor> factorize_multiple_dna_w_rc(std::string_view text) {
std::vector<Factor> out;
factorize_stream_multiple_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); });
return out;
}
std::vector<Factor> factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint) {
std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
factorize_file_stream_multiple_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); });
return out;
}
size_t count_factors_multiple_dna_w_rc(std::string_view text) {
size_t n = 0; factorize_stream_multiple_dna_w_rc(text, [&](const Factor&){ ++n; }); return n;
}
size_t count_factors_file_multiple_dna_w_rc(const std::string& path) {
size_t n = 0; factorize_file_stream_multiple_dna_w_rc(path, [&](const Factor&){ ++n; }); return n;
}
size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path) {
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Collect factors in memory to write header
std::vector<Factor> factors;
size_t n = factorize_file_stream_multiple_dna_w_rc(in_path, [&](const Factor& f){
factors.push_back(f);
});
// For multiple DNA sequences from text file, we don't know sequence names or sentinels
FactorFileHeader header;
header.num_factors = factors.size();
header.num_sequences = 0; // Unknown for raw text files with multiple sequences
header.num_sentinels = 0; // Cannot identify sentinels without preparation function
header.header_size = sizeof(FactorFileHeader);
// Write header
os.write(reinterpret_cast<const char*>(&header), sizeof(header));
// Write factors
for (const Factor& f : factors) {
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
}
return n;
}
} // namespace noLZSS