Program Listing for File factorizer.cpp
↰ Return to documentation for file (src/cpp/factorizer.cpp)
#include "factorizer.hpp"
#include "factorizer_core.hpp" // Template implementations in detail:: namespace
#include "factorizer_helpers.hpp"
#include "parallel_factorizer.hpp"
#include <sdsl/suffix_trees.hpp>
#include <sdsl/rmq_succinct_sct.hpp>
#include <cassert>
#include <fstream>
#include <iostream>
#include <optional>
#include <limits>
namespace noLZSS {
// Helper functions lcp() and next_leaf() are now in factorizer_helpers.hpp
// ---------- genomic utilities ----------
char complement(char c) {
switch (c) {
case 'A': return 'T';
case 'C': return 'G';
case 'G': return 'C';
case 'T': return 'A';
default:
// Handle invalid input, e.g., throw an exception or return a sentinel value
throw std::invalid_argument("Invalid nucleotide: " + std::string(1, c));
}
}
static std::string revcomp(std::string_view s) {
std::string r; r.resize(s.size());
for (size_t i = 0, n = s.size(); i < n; ++i) r[i] = complement(s[n-1-i]);
return r;
}
PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences) {
if (sequences.empty()) {
return {"", 0, {}};
}
// Count non-empty sequences and warn about empty ones
size_t non_empty_count = 0;
size_t empty_count = 0;
for (const auto& seq : sequences) {
if (!seq.empty()) {
non_empty_count++;
} else {
empty_count++;
}
}
// Warn if any empty sequences were found
if (empty_count > 0) {
std::cerr << "Warning: Skipping " << empty_count << " empty sequence(s) in prepare_multiple_dna_sequences_w_rc" << std::endl;
}
// Check if all sequences were empty
if (non_empty_count == 0) {
throw std::runtime_error("All sequences are empty - cannot prepare for factorization");
}
// Check if we have too many non-empty sequences
if (non_empty_count > 125) {
throw std::invalid_argument("Too many sequences: maximum 125 sequences supported (due to sentinel character limitations)");
}
// Validate sequences contain only valid DNA nucleotides (skip empty sequences)
for (size_t i = 0; i < sequences.size(); ++i) {
if (sequences[i].empty()) continue; // Skip empty sequences
for (char c : sequences[i]) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
c != 'a' && c != 'c' && c != 'g' && c != 't') {
throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
"' found in sequence " + std::to_string(i));
}
}
}
PreparedSequenceResult result;
// Calculate total size for reservation (skip empty sequences)
size_t total_size = 0;
for (const auto& seq : sequences) {
if (!seq.empty()) {
total_size += seq.length() * 2; // Original + reverse complement
}
}
total_size += non_empty_count * 2; // Add space for sentinels
result.prepared_string.reserve(total_size);
// Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
auto get_sentinel = [](size_t index) -> char {
char sentinel = 1;
size_t count = 0;
while (true) {
// Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
if (count == index) {
return sentinel; // Found the sentinel for this index
}
count++;
}
sentinel++;
if (sentinel == 0) sentinel = 1; // wrap around, skip 0
}
};
// First, add original sequences with sentinels (skip empty sequences)
size_t sentinel_idx = 0;
for (size_t i = 0; i < sequences.size(); ++i) {
if (sequences[i].empty()) continue; // Skip empty sequences
// Convert to uppercase
std::string seq = sequences[i];
for (char& c : seq) {
if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
}
result.prepared_string += seq;
// Add sentinel and track its position
size_t sentinel_pos = result.prepared_string.length();
char sentinel = get_sentinel(sentinel_idx);
result.prepared_string += sentinel;
result.sentinel_positions.push_back(sentinel_pos);
sentinel_idx++;
}
result.original_length = result.prepared_string.length();
// Then, add reverse complements with different sentinels (skip empty sequences)
for (int i = static_cast<int>(sequences.size()) - 1; i >= 0; --i) {
if (sequences[i].empty()) continue; // Skip empty sequences
// Convert to uppercase first
std::string seq = sequences[i];
for (char& c : seq) {
if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
}
// Add reverse complement
std::string rc = revcomp(seq);
result.prepared_string += rc;
// Add sentinel and track its position
size_t sentinel_pos = result.prepared_string.length();
char sentinel = get_sentinel(sentinel_idx);
result.prepared_string += sentinel;
result.sentinel_positions.push_back(sentinel_pos);
sentinel_idx++;
}
return result;
}
PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences) {
if (sequences.empty()) {
return {"", 0, {}};
}
// Count non-empty sequences and warn about empty ones
size_t non_empty_count = 0;
size_t empty_count = 0;
for (const auto& seq : sequences) {
if (!seq.empty()) {
non_empty_count++;
} else {
empty_count++;
}
}
// Warn if any empty sequences were found
if (empty_count > 0) {
std::cerr << "Warning: Skipping " << empty_count << " empty sequence(s) in prepare_multiple_dna_sequences_no_rc" << std::endl;
}
// Check if all sequences were empty
if (non_empty_count == 0) {
throw std::runtime_error("All sequences are empty - cannot prepare for factorization");
}
// Check if we have too many non-empty sequences
if (non_empty_count > 250) {
throw std::invalid_argument("Too many sequences: maximum 250 sequences supported (due to sentinel character limitations)");
}
// Validate sequences contain only valid DNA nucleotides (skip empty sequences)
for (size_t i = 0; i < sequences.size(); ++i) {
if (sequences[i].empty()) continue; // Skip empty sequences
for (char c : sequences[i]) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
c != 'a' && c != 'c' && c != 'g' && c != 't') {
throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
"' found in sequence " + std::to_string(i));
}
}
}
PreparedSequenceResult result;
// Calculate total size for reservation (skip empty sequences)
size_t total_size = 0;
for (const auto& seq : sequences) {
if (!seq.empty()) {
total_size += seq.length();
}
}
total_size += (non_empty_count > 0 ? non_empty_count - 1 : 0); // Add space for sentinels (between sequences)
result.prepared_string.reserve(total_size);
// Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
auto get_sentinel = [](size_t index) -> char {
char sentinel = 1;
size_t count = 0;
while (true) {
// Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
if (count == index) {
return sentinel; // Found the sentinel for this index
}
count++;
}
sentinel++;
if (sentinel == 0) sentinel = 1; // wrap around, skip 0
}
};
// Add original sequences with sentinels (skip empty sequences)
size_t sentinel_idx = 0;
size_t processed_count = 0;
for (size_t i = 0; i < sequences.size(); ++i) {
if (sequences[i].empty()) continue; // Skip empty sequences
// Convert to uppercase
std::string seq = sequences[i];
for (char& c : seq) {
if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
}
result.prepared_string += seq;
// Add sentinel and track its position (only between sequences, not after the last one)
processed_count++;
if (processed_count < non_empty_count) {
size_t sentinel_pos = result.prepared_string.length();
char sentinel = get_sentinel(sentinel_idx);
result.prepared_string += sentinel;
result.sentinel_positions.push_back(sentinel_pos);
sentinel_idx++;
}
}
result.original_length = result.prepared_string.length();
return result;
}
// ---------- Core template algorithms ----------
// Template implementations are in factorizer_core.hpp in the detail:: namespace
// This allows them to be used from any compilation unit without code duplication
// ------------- public wrappers -------------
template<class Sink>
size_t factorize_file_stream(const std::string& path, Sink&& sink, size_t start_pos) {
// sdsl-lite will automatically add the sentinel when needed
cst_t cst; construct(cst, path, 1);
return detail::nolzss(cst, std::forward<Sink>(sink), start_pos);
}
size_t count_factors(std::string_view text, size_t start_pos) {
size_t n = 0;
std::string tmp(text);
cst_t cst; construct_im(cst, tmp, 1);
detail::nolzss(cst, [&](const Factor&){ ++n; }, start_pos);
return n;
}
size_t count_factors_file(const std::string& path, size_t start_pos) {
size_t n = 0;
factorize_file_stream(path, [&](const Factor&){ ++n; }, start_pos);
return n;
}
std::vector<Factor> factorize(std::string_view text, size_t start_pos) {
std::vector<Factor> out;
std::string tmp(text);
cst_t cst; construct_im(cst, tmp, 1);
detail::nolzss(cst, [&](const Factor& f){ out.push_back(f); }, start_pos);
return out;
}
std::vector<Factor> factorize_file(const std::string& path, size_t reserve_hint, size_t start_pos) {
std::vector<Factor> out;
if (reserve_hint) out.reserve(reserve_hint);
factorize_file_stream(path, [&](const Factor& f){ out.push_back(f); }, start_pos);
return out;
}
size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path) {
// Get input file size for total_length
std::ifstream infile(in_path, std::ios::binary | std::ios::ate);
if (!infile) {
throw std::runtime_error("Cannot open input file: " + in_path);
}
uint64_t total_length = infile.tellg();
infile.close();
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Stream factors directly to file without collecting in memory
size_t n = factorize_file_stream(in_path, [&](const Factor& f){
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
});
// For non-FASTA files, create minimal footer with no sequence names or sentinels
FactorFileFooter footer;
footer.num_factors = n;
footer.num_sequences = 0; // Unknown for raw text files
footer.num_sentinels = 0; // No sentinels for raw text files
footer.footer_size = sizeof(FactorFileFooter);
footer.total_length = total_length;
// Write footer at the end
os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));
return n;
}
template<class Sink>
size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink) {
std::string tmp(text);
return detail::nolzss_dna_w_rc(tmp, std::forward<Sink>(sink));
}
template<class Sink>
size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink) {
std::ifstream is(path, std::ios::binary);
if (!is) throw std::runtime_error("Cannot open input file: " + path);
std::string data((std::istreambuf_iterator<char>(is)), std::istreambuf_iterator<char>());
return detail::nolzss_dna_w_rc(data, std::forward<Sink>(sink));
}
std::vector<Factor> factorize_dna_w_rc(std::string_view text) {
std::vector<Factor> out;
factorize_stream_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); });
return out;
}
std::vector<Factor> factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint) {
std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
factorize_file_stream_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); });
return out;
}
size_t count_factors_dna_w_rc(std::string_view text) {
size_t n = 0; factorize_stream_dna_w_rc(text, [&](const Factor&){ ++n; }); return n;
}
size_t count_factors_file_dna_w_rc(const std::string& path) {
size_t n = 0; factorize_file_stream_dna_w_rc(path, [&](const Factor&){ ++n; }); return n;
}
size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path) {
// Get input file size for total_length
std::ifstream infile(in_path, std::ios::binary | std::ios::ate);
if (!infile) {
throw std::runtime_error("Cannot open input file: " + in_path);
}
uint64_t total_length = infile.tellg();
infile.close();
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Stream factors directly to file without collecting in memory
size_t n = factorize_file_stream_dna_w_rc(in_path, [&](const Factor& f){
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
});
// Write empty sequence name (single null terminator)
os.write("\0", 1);
// For single DNA sequence files, create minimal footer
FactorFileFooter footer;
footer.num_factors = n;
footer.num_sequences = 1; // Single DNA sequence
footer.num_sentinels = 0; // No sentinels for single sequence
footer.footer_size = sizeof(FactorFileFooter) + 1; // Empty sequence name
footer.total_length = total_length;
// Write footer at the end
os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));
return n;
}
std::vector<Factor> factorize_multiple_dna_w_rc(std::string_view text, size_t start_pos) {
std::vector<Factor> out;
std::string tmp(text);
detail::nolzss_multiple_dna_w_rc(tmp, [&](const Factor& f){ out.push_back(f); }, start_pos);
return out;
}
std::vector<Factor> factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint, size_t start_pos) {
std::ifstream file(path, std::ios::binary);
if (!file) {
throw std::runtime_error("Cannot open file: " + path);
}
std::string text((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
detail::nolzss_multiple_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); }, start_pos);
return out;
}
size_t count_factors_multiple_dna_w_rc(std::string_view text, size_t start_pos) {
size_t n = 0;
std::string tmp(text);
detail::nolzss_multiple_dna_w_rc(tmp, [&](const Factor&){ ++n; }, start_pos);
return n;
}
size_t count_factors_file_multiple_dna_w_rc(const std::string& path, size_t start_pos) {
std::ifstream file(path, std::ios::binary);
if (!file) {
throw std::runtime_error("Cannot open file: " + path);
}
std::string text((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
size_t n = 0;
detail::nolzss_multiple_dna_w_rc(text, [&](const Factor&){ ++n; }, start_pos);
return n;
}
size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path, size_t start_pos) {
// Read input file
std::ifstream infile(in_path, std::ios::binary);
if (!infile) {
throw std::runtime_error("Cannot open input file: " + in_path);
}
std::string text((std::istreambuf_iterator<char>(infile)), std::istreambuf_iterator<char>());
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Calculate total_length from input text length minus start position
uint64_t total_length = text.length() - start_pos;
// Stream factors directly to file without collecting in memory
size_t n = 0;
detail::nolzss_multiple_dna_w_rc(text, [&](const Factor& f){
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
++n;
}, start_pos);
// For multiple DNA sequences from text file, we don't know sequence names or sentinels
FactorFileFooter footer;
footer.num_factors = n;
footer.num_sequences = 0; // Unknown for raw text files with multiple sequences
footer.num_sentinels = 0; // Cannot identify sentinels without preparation function
footer.footer_size = sizeof(FactorFileFooter);
footer.total_length = total_length;
// Write footer at the end
os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));
return n;
}
// Reference sequence factorization functions
std::vector<Factor> factorize_dna_w_reference_seq(const std::string& reference_seq, const std::string& target_seq) {
// Prepare reference and target sequences together
std::vector<std::string> sequences = {reference_seq, target_seq};
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(sequences);
// Calculate the starting position of the target sequence in the prepared string
// The format is: REF[sentinel]TARGET[sentinel]RC(TARGET)[sentinel]RC(REF)[sentinel]
// We want to start factorization from where TARGET begins
size_t target_start_pos = reference_seq.length() + 1; // +1 for the sentinel between ref and target
// Perform factorization starting from target sequence
std::vector<Factor> factors;
detail::nolzss_multiple_dna_w_rc(prep_result.prepared_string, [&](const Factor& f) {
factors.push_back(f);
}, target_start_pos);
return factors;
}
size_t factorize_dna_w_reference_seq_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path) {
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Get factors using the in-memory function
std::vector<Factor> factors = factorize_dna_w_reference_seq(reference_seq, target_seq);
// Calculate total_length from target sequence length (what we're factorizing)
uint64_t total_length = target_seq.length();
// Write factors first
for (const Factor& f : factors) {
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
}
// Create footer for reference+target factorization
FactorFileFooter footer;
footer.num_factors = factors.size();
footer.num_sequences = 2; // Reference + target sequences
footer.num_sentinels = 1; // One sentinel between ref and target (in factorized region)
footer.footer_size = sizeof(FactorFileFooter);
footer.total_length = total_length;
// Write footer at the end
os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));
return factors.size();
}
// General reference sequence factorization functions (no reverse complement)
std::vector<Factor> factorize_w_reference(const std::string& reference_seq, const std::string& target_seq) {
// Concatenate reference and target with a sentinel (use ASCII character 1)
std::string combined = reference_seq + '\x01' + target_seq;
// Calculate the starting position of the target sequence in the combined string
size_t target_start_pos = reference_seq.length() + 1; // +1 for the sentinel
// Perform general factorization starting from target sequence position
std::vector<Factor> factors;
cst_t cst; construct_im(cst, combined, 1);
detail::nolzss(cst, [&](const Factor& f) {
factors.push_back(f);
}, target_start_pos);
return factors;
}
size_t factorize_w_reference_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path) {
// Set up binary output file with buffering
std::ofstream os(out_path, std::ios::binary);
if (!os) {
throw std::runtime_error("Cannot create output file: " + out_path);
}
std::vector<char> buf(1<<20); // 1 MB buffer for performance
os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));
// Get factors using the in-memory function
std::vector<Factor> factors = factorize_w_reference(reference_seq, target_seq);
// Calculate total_length from target sequence length (what we're factorizing)
uint64_t total_length = target_seq.length();
// Write factors first
for (const Factor& f : factors) {
os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
}
// Create footer for reference+target factorization
FactorFileFooter footer;
footer.num_factors = factors.size();
footer.num_sequences = 2; // Reference + target sequences
footer.num_sentinels = 1; // One sentinel between ref and target
footer.footer_size = sizeof(FactorFileFooter);
footer.total_length = total_length;
// Write footer at the end
os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));
return factors.size();
}
// Parallel factorization implementations
size_t parallel_factorize_to_file(::std::string_view text, const ::std::string& output_path, size_t num_threads, size_t start_pos) {
ParallelFactorizer factorizer;
return factorizer.parallel_factorize(text, output_path, num_threads, start_pos);
}
size_t parallel_factorize_file_to_file(const ::std::string& input_path, const ::std::string& output_path, size_t num_threads, size_t start_pos) {
ParallelFactorizer factorizer;
return factorizer.parallel_factorize_file(input_path, output_path, num_threads, start_pos);
}
size_t parallel_factorize_dna_w_rc_to_file(::std::string_view text, const ::std::string& output_path, size_t num_threads) {
ParallelFactorizer factorizer;
return factorizer.parallel_factorize_dna_w_rc(text, output_path, num_threads);
}
size_t parallel_factorize_file_dna_w_rc_to_file(const ::std::string& input_path, const ::std::string& output_path, size_t num_threads) {
ParallelFactorizer factorizer;
return factorizer.parallel_factorize_file_dna_w_rc(input_path, output_path, num_threads);
}
} // namespace noLZSS