Program Listing for File factorizer.cpp

Return to documentation for file (src/cpp/factorizer.cpp)

#include "factorizer.hpp"
#include "factorizer_core.hpp"  // Template implementations in detail:: namespace
#include "factorizer_helpers.hpp"
#include "parallel_factorizer.hpp"
#include <sdsl/suffix_trees.hpp>
#include <sdsl/rmq_succinct_sct.hpp>
#include <cassert>
#include <fstream>
#include <iostream>
#include <optional>
#include <limits>

namespace noLZSS {
// Helper functions lcp() and next_leaf() are now in factorizer_helpers.hpp

// ---------- genomic utilities ----------
char complement(char c) {
    switch (c) {
        case 'A': return 'T';
        case 'C': return 'G';
        case 'G': return 'C';
        case 'T': return 'A';
        default:
            // Handle invalid input, e.g., throw an exception or return a sentinel value
            throw std::invalid_argument("Invalid nucleotide: " + std::string(1, c));
    }
}
static std::string revcomp(std::string_view s) {
    std::string r; r.resize(s.size());
    for (size_t i = 0, n = s.size(); i < n; ++i) r[i] = complement(s[n-1-i]);
    return r;
}

PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences) {
    if (sequences.empty()) {
        return {"", 0, {}};
    }

    // Count non-empty sequences and warn about empty ones
    size_t non_empty_count = 0;
    size_t empty_count = 0;
    for (const auto& seq : sequences) {
        if (!seq.empty()) {
            non_empty_count++;
        } else {
            empty_count++;
        }
    }

    // Warn if any empty sequences were found
    if (empty_count > 0) {
        std::cerr << "Warning: Skipping " << empty_count << " empty sequence(s) in prepare_multiple_dna_sequences_w_rc" << std::endl;
    }

    // Check if all sequences were empty
    if (non_empty_count == 0) {
        throw std::runtime_error("All sequences are empty - cannot prepare for factorization");
    }

    // Check if we have too many non-empty sequences
    if (non_empty_count > 125) {
        throw std::invalid_argument("Too many sequences: maximum 125 sequences supported (due to sentinel character limitations)");
    }

    // Validate sequences contain only valid DNA nucleotides (skip empty sequences)
    for (size_t i = 0; i < sequences.size(); ++i) {
        if (sequences[i].empty()) continue;  // Skip empty sequences
        for (char c : sequences[i]) {
            if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
                c != 'a' && c != 'c' && c != 'g' && c != 't') {
                throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
                                       "' found in sequence " + std::to_string(i));
            }
        }
    }

    PreparedSequenceResult result;

    // Calculate total size for reservation (skip empty sequences)
    size_t total_size = 0;
    for (const auto& seq : sequences) {
        if (!seq.empty()) {
            total_size += seq.length() * 2; // Original + reverse complement
        }
    }
    total_size += non_empty_count * 2; // Add space for sentinels
    result.prepared_string.reserve(total_size);

    // Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
    auto get_sentinel = [](size_t index) -> char {
        char sentinel = 1;
        size_t count = 0;

        while (true) {
            // Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
            if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
                if (count == index) {
                    return sentinel;  // Found the sentinel for this index
                }
                count++;
            }
            sentinel++;
            if (sentinel == 0) sentinel = 1; // wrap around, skip 0
        }
    };

    // First, add original sequences with sentinels (skip empty sequences)
    size_t sentinel_idx = 0;
    for (size_t i = 0; i < sequences.size(); ++i) {
        if (sequences[i].empty()) continue;  // Skip empty sequences

        // Convert to uppercase
        std::string seq = sequences[i];
        for (char& c : seq) {
            if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
        }
        result.prepared_string += seq;

        // Add sentinel and track its position
        size_t sentinel_pos = result.prepared_string.length();
        char sentinel = get_sentinel(sentinel_idx);
        result.prepared_string += sentinel;
        result.sentinel_positions.push_back(sentinel_pos);
        sentinel_idx++;
    }

    result.original_length = result.prepared_string.length();

    // Then, add reverse complements with different sentinels (skip empty sequences)
    for (int i = static_cast<int>(sequences.size()) - 1; i >= 0; --i) {
        if (sequences[i].empty()) continue;  // Skip empty sequences

        // Convert to uppercase first
        std::string seq = sequences[i];
        for (char& c : seq) {
            if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
        }

        // Add reverse complement
        std::string rc = revcomp(seq);
        result.prepared_string += rc;

        // Add sentinel and track its position
        size_t sentinel_pos = result.prepared_string.length();
        char sentinel = get_sentinel(sentinel_idx);
        result.prepared_string += sentinel;
        result.sentinel_positions.push_back(sentinel_pos);
        sentinel_idx++;
    }

    return result;
}

PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences) {
    if (sequences.empty()) {
        return {"", 0, {}};
    }

    // Count non-empty sequences and warn about empty ones
    size_t non_empty_count = 0;
    size_t empty_count = 0;
    for (const auto& seq : sequences) {
        if (!seq.empty()) {
            non_empty_count++;
        } else {
            empty_count++;
        }
    }

    // Warn if any empty sequences were found
    if (empty_count > 0) {
        std::cerr << "Warning: Skipping " << empty_count << " empty sequence(s) in prepare_multiple_dna_sequences_no_rc" << std::endl;
    }

    // Check if all sequences were empty
    if (non_empty_count == 0) {
        throw std::runtime_error("All sequences are empty - cannot prepare for factorization");
    }

    // Check if we have too many non-empty sequences
    if (non_empty_count > 250) {
        throw std::invalid_argument("Too many sequences: maximum 250 sequences supported (due to sentinel character limitations)");
    }

    // Validate sequences contain only valid DNA nucleotides (skip empty sequences)
    for (size_t i = 0; i < sequences.size(); ++i) {
        if (sequences[i].empty()) continue;  // Skip empty sequences
        for (char c : sequences[i]) {
            if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
                c != 'a' && c != 'c' && c != 'g' && c != 't') {
                throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
                                       "' found in sequence " + std::to_string(i));
            }
        }
    }

    PreparedSequenceResult result;

    // Calculate total size for reservation (skip empty sequences)
    size_t total_size = 0;
    for (const auto& seq : sequences) {
        if (!seq.empty()) {
            total_size += seq.length();
        }
    }
    total_size += (non_empty_count > 0 ? non_empty_count - 1 : 0); // Add space for sentinels (between sequences)
    result.prepared_string.reserve(total_size);

    // Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
    auto get_sentinel = [](size_t index) -> char {
        char sentinel = 1;
        size_t count = 0;

        while (true) {
            // Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
            if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
                if (count == index) {
                    return sentinel;  // Found the sentinel for this index
                }
                count++;
            }
            sentinel++;
            if (sentinel == 0) sentinel = 1; // wrap around, skip 0
        }
    };

    // Add original sequences with sentinels (skip empty sequences)
    size_t sentinel_idx = 0;
    size_t processed_count = 0;
    for (size_t i = 0; i < sequences.size(); ++i) {
        if (sequences[i].empty()) continue;  // Skip empty sequences

        // Convert to uppercase
        std::string seq = sequences[i];
        for (char& c : seq) {
            if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
        }
        result.prepared_string += seq;

        // Add sentinel and track its position (only between sequences, not after the last one)
        processed_count++;
        if (processed_count < non_empty_count) {
            size_t sentinel_pos = result.prepared_string.length();
            char sentinel = get_sentinel(sentinel_idx);
            result.prepared_string += sentinel;
            result.sentinel_positions.push_back(sentinel_pos);
            sentinel_idx++;
        }
    }

    result.original_length = result.prepared_string.length();

    return result;
}

// ---------- Core template algorithms ----------
// Template implementations are in factorizer_core.hpp in the detail:: namespace
// This allows them to be used from any compilation unit without code duplication

// ------------- public wrappers -------------

template<class Sink>
size_t factorize_file_stream(const std::string& path, Sink&& sink, size_t start_pos) {
    // sdsl-lite will automatically add the sentinel when needed
    cst_t cst; construct(cst, path, 1);
    return detail::nolzss(cst, std::forward<Sink>(sink), start_pos);
}

size_t count_factors(std::string_view text, size_t start_pos) {
    size_t n = 0;
    std::string tmp(text);
    cst_t cst; construct_im(cst, tmp, 1);
    detail::nolzss(cst, [&](const Factor&){ ++n; }, start_pos);
    return n;
}

size_t count_factors_file(const std::string& path, size_t start_pos) {
    size_t n = 0;
    factorize_file_stream(path, [&](const Factor&){ ++n; }, start_pos);
    return n;
}

std::vector<Factor> factorize(std::string_view text, size_t start_pos) {
    std::vector<Factor> out;
    std::string tmp(text);
    cst_t cst; construct_im(cst, tmp, 1);
    detail::nolzss(cst, [&](const Factor& f){ out.push_back(f); }, start_pos);
    return out;
}

std::vector<Factor> factorize_file(const std::string& path, size_t reserve_hint, size_t start_pos) {
    std::vector<Factor> out;
    if (reserve_hint) out.reserve(reserve_hint);
    factorize_file_stream(path, [&](const Factor& f){ out.push_back(f); }, start_pos);
    return out;
}

size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path) {
    // Get input file size for total_length
    std::ifstream infile(in_path, std::ios::binary | std::ios::ate);
    if (!infile) {
        throw std::runtime_error("Cannot open input file: " + in_path);
    }
    uint64_t total_length = infile.tellg();
    infile.close();

    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Stream factors directly to file without collecting in memory
    size_t n = factorize_file_stream(in_path, [&](const Factor& f){
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    });

    // For non-FASTA files, create minimal footer with no sequence names or sentinels
    FactorFileFooter footer;
    footer.num_factors = n;
    footer.num_sequences = 0;  // Unknown for raw text files
    footer.num_sentinels = 0;  // No sentinels for raw text files
    footer.footer_size = sizeof(FactorFileFooter);
    footer.total_length = total_length;

    // Write footer at the end
    os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));

    return n;
}

template<class Sink>
size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink) {
    std::string tmp(text);
    return detail::nolzss_dna_w_rc(tmp, std::forward<Sink>(sink));
}

template<class Sink>
size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink) {
    std::ifstream is(path, std::ios::binary);
    if (!is) throw std::runtime_error("Cannot open input file: " + path);
    std::string data((std::istreambuf_iterator<char>(is)), std::istreambuf_iterator<char>());
    return detail::nolzss_dna_w_rc(data, std::forward<Sink>(sink));
}

std::vector<Factor> factorize_dna_w_rc(std::string_view text) {
    std::vector<Factor> out;
    factorize_stream_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); });
    return out;
}

std::vector<Factor> factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint) {
    std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
    factorize_file_stream_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); });
    return out;
}

size_t count_factors_dna_w_rc(std::string_view text) {
    size_t n = 0; factorize_stream_dna_w_rc(text, [&](const Factor&){ ++n; }); return n;
}

size_t count_factors_file_dna_w_rc(const std::string& path) {
    size_t n = 0; factorize_file_stream_dna_w_rc(path, [&](const Factor&){ ++n; }); return n;
}

size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path) {
    // Get input file size for total_length
    std::ifstream infile(in_path, std::ios::binary | std::ios::ate);
    if (!infile) {
        throw std::runtime_error("Cannot open input file: " + in_path);
    }
    uint64_t total_length = infile.tellg();
    infile.close();

    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Stream factors directly to file without collecting in memory
    size_t n = factorize_file_stream_dna_w_rc(in_path, [&](const Factor& f){
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    });

    // Write empty sequence name (single null terminator)
    os.write("\0", 1);

    // For single DNA sequence files, create minimal footer
    FactorFileFooter footer;
    footer.num_factors = n;
    footer.num_sequences = 1;  // Single DNA sequence
    footer.num_sentinels = 0;  // No sentinels for single sequence
    footer.footer_size = sizeof(FactorFileFooter) + 1; // Empty sequence name
    footer.total_length = total_length;

    // Write footer at the end
    os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));

    return n;
}

std::vector<Factor> factorize_multiple_dna_w_rc(std::string_view text, size_t start_pos) {
    std::vector<Factor> out;
    std::string tmp(text);
    detail::nolzss_multiple_dna_w_rc(tmp, [&](const Factor& f){ out.push_back(f); }, start_pos);
    return out;
}

std::vector<Factor> factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint, size_t start_pos) {
    std::ifstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("Cannot open file: " + path);
    }
    std::string text((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());

    std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
    detail::nolzss_multiple_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); }, start_pos);
    return out;
}

size_t count_factors_multiple_dna_w_rc(std::string_view text, size_t start_pos) {
    size_t n = 0;
    std::string tmp(text);
    detail::nolzss_multiple_dna_w_rc(tmp, [&](const Factor&){ ++n; }, start_pos);
    return n;
}

size_t count_factors_file_multiple_dna_w_rc(const std::string& path, size_t start_pos) {
    std::ifstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("Cannot open file: " + path);
    }
    std::string text((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());

    size_t n = 0;
    detail::nolzss_multiple_dna_w_rc(text, [&](const Factor&){ ++n; }, start_pos);
    return n;
}

size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path, size_t start_pos) {
    // Read input file
    std::ifstream infile(in_path, std::ios::binary);
    if (!infile) {
        throw std::runtime_error("Cannot open input file: " + in_path);
    }
    std::string text((std::istreambuf_iterator<char>(infile)), std::istreambuf_iterator<char>());

    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Calculate total_length from input text length minus start position
    uint64_t total_length = text.length() - start_pos;

    // Stream factors directly to file without collecting in memory
    size_t n = 0;
    detail::nolzss_multiple_dna_w_rc(text, [&](const Factor& f){
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
        ++n;
    }, start_pos);

    // For multiple DNA sequences from text file, we don't know sequence names or sentinels
    FactorFileFooter footer;
    footer.num_factors = n;
    footer.num_sequences = 0;  // Unknown for raw text files with multiple sequences
    footer.num_sentinels = 0;  // Cannot identify sentinels without preparation function
    footer.footer_size = sizeof(FactorFileFooter);
    footer.total_length = total_length;

    // Write footer at the end
    os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));

    return n;
}

// Reference sequence factorization functions

std::vector<Factor> factorize_dna_w_reference_seq(const std::string& reference_seq, const std::string& target_seq) {
    // Prepare reference and target sequences together
    std::vector<std::string> sequences = {reference_seq, target_seq};
    PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(sequences);

    // Calculate the starting position of the target sequence in the prepared string
    // The format is: REF[sentinel]TARGET[sentinel]RC(TARGET)[sentinel]RC(REF)[sentinel]
    // We want to start factorization from where TARGET begins
    size_t target_start_pos = reference_seq.length() + 1; // +1 for the sentinel between ref and target

    // Perform factorization starting from target sequence
    std::vector<Factor> factors;
    detail::nolzss_multiple_dna_w_rc(prep_result.prepared_string, [&](const Factor& f) {
        factors.push_back(f);
    }, target_start_pos);

    return factors;
}

size_t factorize_dna_w_reference_seq_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path) {
    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Get factors using the in-memory function
    std::vector<Factor> factors = factorize_dna_w_reference_seq(reference_seq, target_seq);

    // Calculate total_length from target sequence length (what we're factorizing)
    uint64_t total_length = target_seq.length();

    // Write factors first
    for (const Factor& f : factors) {
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    }

    // Create footer for reference+target factorization
    FactorFileFooter footer;
    footer.num_factors = factors.size();
    footer.num_sequences = 2;  // Reference + target sequences
    footer.num_sentinels = 1;  // One sentinel between ref and target (in factorized region)
    footer.footer_size = sizeof(FactorFileFooter);
    footer.total_length = total_length;

    // Write footer at the end
    os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));

    return factors.size();
}

// General reference sequence factorization functions (no reverse complement)

std::vector<Factor> factorize_w_reference(const std::string& reference_seq, const std::string& target_seq) {
    // Concatenate reference and target with a sentinel (use ASCII character 1)
    std::string combined = reference_seq + '\x01' + target_seq;

    // Calculate the starting position of the target sequence in the combined string
    size_t target_start_pos = reference_seq.length() + 1; // +1 for the sentinel

    // Perform general factorization starting from target sequence position
    std::vector<Factor> factors;
    cst_t cst; construct_im(cst, combined, 1);
    detail::nolzss(cst, [&](const Factor& f) {
        factors.push_back(f);
    }, target_start_pos);

    return factors;
}

size_t factorize_w_reference_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path) {
    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Get factors using the in-memory function
    std::vector<Factor> factors = factorize_w_reference(reference_seq, target_seq);

    // Calculate total_length from target sequence length (what we're factorizing)
    uint64_t total_length = target_seq.length();

    // Write factors first
    for (const Factor& f : factors) {
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    }

    // Create footer for reference+target factorization
    FactorFileFooter footer;
    footer.num_factors = factors.size();
    footer.num_sequences = 2;  // Reference + target sequences
    footer.num_sentinels = 1;  // One sentinel between ref and target
    footer.footer_size = sizeof(FactorFileFooter);
    footer.total_length = total_length;

    // Write footer at the end
    os.write(reinterpret_cast<const char*>(&footer), sizeof(footer));

    return factors.size();
}

// Parallel factorization implementations

size_t parallel_factorize_to_file(::std::string_view text, const ::std::string& output_path, size_t num_threads, size_t start_pos) {
    ParallelFactorizer factorizer;
    return factorizer.parallel_factorize(text, output_path, num_threads, start_pos);
}

size_t parallel_factorize_file_to_file(const ::std::string& input_path, const ::std::string& output_path, size_t num_threads, size_t start_pos) {
    ParallelFactorizer factorizer;
    return factorizer.parallel_factorize_file(input_path, output_path, num_threads, start_pos);
}

size_t parallel_factorize_dna_w_rc_to_file(::std::string_view text, const ::std::string& output_path, size_t num_threads) {
    ParallelFactorizer factorizer;
    return factorizer.parallel_factorize_dna_w_rc(text, output_path, num_threads);
}

size_t parallel_factorize_file_dna_w_rc_to_file(const ::std::string& input_path, const ::std::string& output_path, size_t num_threads) {
    ParallelFactorizer factorizer;
    return factorizer.parallel_factorize_file_dna_w_rc(input_path, output_path, num_threads);
}

} // namespace noLZSS