Program Listing for File factorizer.cpp

Return to documentation for file (src/cpp/factorizer.cpp)

#include "factorizer.hpp"
#include <sdsl/suffix_trees.hpp>
#include <sdsl/rmq_succinct_sct.hpp>
#include <cassert>
#include <fstream>
#include <iostream>
#include <optional>
#include <limits>

namespace noLZSS {
using cst_t = sdsl::cst_sada<>;

static size_t lcp(cst_t& cst, size_t i, size_t j) {
    if (i == j) return cst.csa.size() - cst.csa[i];
    auto lca = cst.lca(cst.select_leaf(cst.csa.isa[i]+1), cst.select_leaf(cst.csa.isa[j]+1));
    return cst.depth(lca);
}

static cst_t::node_type next_leaf(cst_t& cst, cst_t::node_type lambda, size_t iterations = 1) {
    assert(cst.is_leaf(lambda));
    auto lambda_rank = cst.lb(lambda);
    for (size_t i = 0; i < iterations; i++) lambda_rank = cst.csa.psi[lambda_rank];
    return cst.select_leaf(lambda_rank + 1);
}

// ---------- genomic utilities ----------
char complement(char c) {
    switch (c) {
        case 'A': return 'T';
        case 'C': return 'G';
        case 'G': return 'C';
        case 'T': return 'A';
        default:
            // Handle invalid input, e.g., throw an exception or return a sentinel value
            throw std::invalid_argument("Invalid nucleotide: " + std::string(1, c));
    }
}
static std::string revcomp(std::string_view s) {
    std::string r; r.resize(s.size());
    for (size_t i = 0, n = s.size(); i < n; ++i) r[i] = complement(s[n-1-i]);
    return r;
}

PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences) {
    if (sequences.empty()) {
        return {"", 0, {}};
    }

    // Check if we have too many sequences
    if (sequences.size() > 125) {
        throw std::invalid_argument("Too many sequences: maximum 125 sequences supported (due to sentinel character limitations)");
    }

    // Validate sequences contain only valid DNA nucleotides
    for (size_t i = 0; i < sequences.size(); ++i) {
        for (char c : sequences[i]) {
            if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
                c != 'a' && c != 'c' && c != 'g' && c != 't') {
                throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
                                       "' found in sequence " + std::to_string(i));
            }
        }
    }

    PreparedSequenceResult result;

    // Calculate total size for reservation
    size_t total_size = 0;
    for (const auto& seq : sequences) {
        total_size += seq.length() * 2; // Original + reverse complement
    }
    total_size += sequences.size() * 2; // Add space for sentinels
    result.prepared_string.reserve(total_size);

    // Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
    auto get_sentinel = [](size_t index) -> char {
        char sentinel = 1;
        size_t count = 0;

        while (true) {
            // Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
            if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
                if (count == index) {
                    return sentinel;  // Found the sentinel for this index
                }
                count++;
            }
            sentinel++;
            if (sentinel == 0) sentinel = 1; // wrap around, skip 0
        }
    };

    // First, add original sequences with sentinels
    for (size_t i = 0; i < sequences.size(); ++i) {
        // Convert to uppercase
        std::string seq = sequences[i];
        for (char& c : seq) {
            if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
        }
        result.prepared_string += seq;

        // Add sentinel and track its position
        size_t sentinel_pos = result.prepared_string.length();
        char sentinel = get_sentinel(i);
        result.prepared_string += sentinel;
        result.sentinel_positions.push_back(sentinel_pos);
    }

    result.original_length = result.prepared_string.length();

    // Then, add reverse complements with different sentinels
    for (int i = static_cast<int>(sequences.size()) - 1; i >= 0; --i) {
        // Convert to uppercase first
        std::string seq = sequences[i];
        for (char& c : seq) {
            if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
        }

        // Add reverse complement
        std::string rc = revcomp(seq);
        result.prepared_string += rc;

        // Add sentinel and track its position (offset by sequences.size() to make them unique)
        size_t sentinel_pos = result.prepared_string.length();
        char sentinel = get_sentinel(sequences.size() + (sequences.size() - 1 - i));
        result.prepared_string += sentinel;
        result.sentinel_positions.push_back(sentinel_pos);
    }

    return result;
}

PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences) {
    if (sequences.empty()) {
        return {"", 0, {}};
    }

    // Check if we have too many sequences
    if (sequences.size() > 250) {
        throw std::invalid_argument("Too many sequences: maximum 250 sequences supported (due to sentinel character limitations)");
    }

    // Validate sequences contain only valid DNA nucleotides
    for (size_t i = 0; i < sequences.size(); ++i) {
        for (char c : sequences[i]) {
            if (c != 'A' && c != 'C' && c != 'G' && c != 'T' &&
                c != 'a' && c != 'c' && c != 'g' && c != 't') {
                throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
                                       "' found in sequence " + std::to_string(i));
            }
        }
    }

    PreparedSequenceResult result;

    // Calculate total size for reservation
    size_t total_size = 0;
    for (const auto& seq : sequences) {
        total_size += seq.length();
    }
    total_size += sequences.size(); // Add space for sentinels
    result.prepared_string.reserve(total_size);

    // Generate sentinel characters avoiding 0 and uppercase DNA nucleotides A(65), C(67), G(71), T(84)
    auto get_sentinel = [](size_t index) -> char {
        char sentinel = 1;
        size_t count = 0;

        while (true) {
            // Check if current sentinel is valid (not 0 and not uppercase DNA nucleotides)
            if (sentinel != 0 && sentinel != 'A' && sentinel != 'C' && sentinel != 'G' && sentinel != 'T') {
                if (count == index) {
                    return sentinel;  // Found the sentinel for this index
                }
                count++;
            }
            sentinel++;
            if (sentinel == 0) sentinel = 1; // wrap around, skip 0
        }
    };

    // Add original sequences with sentinels
    for (size_t i = 0; i < sequences.size(); ++i) {
        // Convert to uppercase
        std::string seq = sequences[i];
        for (char& c : seq) {
            if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
        }
        result.prepared_string += seq;

        // Add sentinel and track its position (only between sequences, not after the last one)
        if (i < sequences.size() - 1) {
            size_t sentinel_pos = result.prepared_string.length();
            char sentinel = get_sentinel(i);
            result.prepared_string += sentinel;
            result.sentinel_positions.push_back(sentinel_pos);
        }
    }

    result.original_length = result.prepared_string.length();

    return result;
}


// ---------- generic, sink-driven noLZSS ----------

template<class Sink>
static size_t nolzss(cst_t& cst, Sink&& sink) {
    sdsl::rmq_succinct_sct<> rmq(&cst.csa);
    const size_t str_len = cst.size() - 1; // the length of the string is the size of the CST minus the sentinel

    auto lambda = cst.select_leaf(cst.csa.isa[0] + 1);
    size_t lambda_node_depth = cst.node_depth(lambda);
    size_t lambda_sufnum = 0;

    cst_t::node_type v;
    size_t v_min_leaf_sufnum = 0;
    size_t u_min_leaf_sufnum = 0;

    size_t count = 0;

    while (lambda_sufnum < str_len) {
        // Compute current factor
        size_t d = 1;
        size_t l = 1;
        while (true) {
            v = cst.bp_support.level_anc(lambda, lambda_node_depth - d);
            v_min_leaf_sufnum = cst.csa[rmq(cst.lb(v), cst.rb(v))];
            l = cst.depth(v);

            if (v_min_leaf_sufnum + l - 1 < lambda_sufnum) {
                u_min_leaf_sufnum = v_min_leaf_sufnum;
                ++d; continue;
            }
            auto u = cst.parent(v);
            auto u_depth = cst.depth(u);

            if (v_min_leaf_sufnum == lambda_sufnum) {
                if (u == cst.root()) {
                    l = 1;
                    Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(lambda_sufnum)};
                    sink(factor);
                    break;
                }
                else {
                    l = u_depth;
                    Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(u_min_leaf_sufnum)};
                    sink(factor);
                    break;
                }
            }
            l = std::min(lcp(cst, lambda_sufnum, v_min_leaf_sufnum),
                         (lambda_sufnum - v_min_leaf_sufnum));
            if (l <= u_depth) {
                l = u_depth;
                Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(u_min_leaf_sufnum)};
                sink(factor);
                break;
            }
            else {
                Factor factor{static_cast<uint64_t>(lambda_sufnum), static_cast<uint64_t>(l), static_cast<uint64_t>(v_min_leaf_sufnum)};
                sink(factor);
                break;
            }
        }

        ++count;
        // Advance to next position
        lambda = next_leaf(cst, lambda, l);
        lambda_node_depth = cst.node_depth(lambda);
        lambda_sufnum = cst.sn(lambda);
    }

    return count;
}

template<class Sink>
static size_t nolzss_dna_w_rc(const std::string& T, Sink&& sink) {
    const size_t n = T.size();
    if (n == 0) return 0;

    // Use prepare_multiple_dna_sequences_w_rc with a single sequence
    PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc({T});
    std::string S = prep_result.prepared_string;
    size_t original_length = prep_result.original_length;

    // Use the multiple sequence algorithm
    return nolzss_multiple_dna_w_rc(S, std::forward<Sink>(sink));
}


template<class Sink>
static size_t nolzss_multiple_dna_w_rc(const std::string& S, Sink&& sink) {
    const size_t N = (S.size() / 2) - 1;
    if (N == 0) return 0;

    // Build CST over S
    cst_t cst; construct_im(cst, S, 1);

    // Build RMQ inputs aligned to SA: forward starts and RC ends (in T-coords)
    const uint64_t INF = std::numeric_limits<uint64_t>::max()/2ULL;
    sdsl::int_vector<64> fwd_starts(cst.csa.size(), INF);
    sdsl::int_vector<64> rc_ends   (cst.csa.size(), INF);

    const size_t T_beg = 0;
    const size_t T_end = N;           // end of original
    const size_t R_beg = N;       // first char of rc
    const size_t R_end = S.size();   // end of S

    for (size_t k = 0; k < cst.csa.size(); ++k) {
        size_t posS = cst.csa[k];
        if (posS < T_end) {
            // suffix starts in T
            fwd_starts[k] = posS;     // 0-based start in T
        } else if (posS >= R_beg && posS < R_end) {
            // suffix starts in R
            size_t jR0   = posS - R_beg;         // 0-based start in R
            size_t endT0 = N - jR0 - 1;          // mapped end in T (0-based)
            rc_ends[k] = endT0;
        }
    }
    sdsl::rmq_succinct_sct<> rmqF(&fwd_starts);
    sdsl::rmq_succinct_sct<> rmqRcEnd(&rc_ends);

    // Initialize to the leaf of suffix starting at S position 0 (i.e., T[0])
    auto lambda = cst.select_leaf(cst.csa.isa[0] + 1);
    size_t lambda_node_depth = cst.node_depth(lambda);
    size_t i = cst.sn(lambda); // suffix start in S, begins at 0

    size_t factors = 0;

    while (i < N) { // only factorize inside T
        // At factor start i (0-based in T), walk up ancestors and pick best candidate
        size_t best_len_depth = 0;   // best candidate's depth (proxy for length)
        bool   best_is_rc      = false;
        size_t best_fwd_start  = 0;  // start in T (for FWD)
        size_t best_rc_end     = 0;  // end in T (for RC)
        size_t best_rc_posS    = 0;  // pos in S where RC candidate suffix starts (for LCP)

        // Walk from leaf to root via level_anc
        for (size_t step = 1; step <= lambda_node_depth; ++step) {
            auto v = cst.bp_support.level_anc(lambda, lambda_node_depth - step);
            size_t ell = cst.depth(v);
            if (ell == 0) break; // reached root

            auto lb = cst.lb(v), rb = cst.rb(v);

            // Forward candidate (min start in T within v's interval)
            size_t kF = rmqF(lb, rb);
            uint64_t jF = fwd_starts[kF];
            bool okF = (jF != INF) && (jF + ell - 1 < i); // non-overlap: endF <= i-1

            // RC candidate (min END in T within v's interval; monotone with depth)
            size_t kR = rmqRcEnd(lb, rb);
            uint64_t endRC = rc_ends[kR];
            bool okR = (endRC != INF) && (endRC < i); // endRC <= i-1

            if (!okF && !okR) {
                // deeper nodes can only increase jF and the minimal RC end
                // -> non-overlap won't become true again for either; stop
                break;
            }

            // Choose the better of the valid candidates at this depth
            if (okF) {
                if (ell > best_len_depth ||
                    (ell == best_len_depth && !best_is_rc && (jF + ell - 1) < (best_fwd_start + best_len_depth - 1))) {
                    best_len_depth = ell;
                    best_is_rc     = false;
                    best_fwd_start = jF;
                }
            }
            if (okR) {
                size_t posS_R = cst.csa[kR]; // suffix position in S for LCP
                if (ell > best_len_depth ||
                    (ell == best_len_depth && (best_is_rc ? (endRC < best_rc_end) : true))) {
                    best_len_depth = ell;
                    best_is_rc     = true;
                    best_rc_end    = endRC;
                    best_rc_posS   = posS_R;
                }
            }
        }

        size_t emit_len = 1;
        uint64_t emit_ref = i; // default for literal
        if (best_len_depth == 0) {
            // No previous occurrence (FWD nor RC) — literal of length 1
            Factor f{static_cast<uint64_t>(i), static_cast<uint64_t>(emit_len), static_cast<uint64_t>(emit_ref)};
            sink(f);
            ++factors;

            // Advance
            lambda = next_leaf(cst, lambda, emit_len);
            lambda_node_depth = cst.node_depth(lambda);
            i = cst.sn(lambda);
            continue;
        }

        if (!best_is_rc) {
            // Finalize FWD with true LCP and non-overlap cap
            size_t cap = i - best_fwd_start; // i-1 - (best_fwd_start) + 1
            size_t L   = lcp(cst, i, best_fwd_start);
            emit_len   = std::min(L, cap);
            emit_ref   = static_cast<uint64_t>(best_fwd_start);
        } else {
            // Finalize RC with true LCP (against suffix in R)
            size_t L   = lcp(cst, i, best_rc_posS);
            emit_len   = L;
            emit_ref   = RC_MASK | static_cast<uint64_t>(best_rc_end); // end-anchored + RC flag
        }

        // Safety: ensure progress
        if (emit_len <= 0) {
            throw std::runtime_error("emit_len must be positive to ensure factorization progress");
        }

        Factor f{static_cast<uint64_t>(i), static_cast<uint64_t>(emit_len), emit_ref};
        sink(f);
        ++factors;

        // Advance to next phrase start
        lambda = next_leaf(cst, lambda, emit_len);
        lambda_node_depth = cst.node_depth(lambda);
        i = cst.sn(lambda);
    }

    return factors;
}

// ------------- public wrappers -------------

template<class Sink>
size_t factorize_stream(std::string_view text, Sink&& sink) {
    std::string tmp(text);
    cst_t cst; construct_im(cst, tmp, 1);
    return nolzss(cst, std::forward<Sink>(sink));
}

template<class Sink>
size_t factorize_file_stream(const std::string& path, Sink&& sink) {
    // sdsl-lite will automatically add the sentinel when needed
    cst_t cst; construct(cst, path, 1);
    return nolzss(cst, std::forward<Sink>(sink));
}

size_t count_factors(std::string_view text) {
    size_t n = 0;
    factorize_stream(text, [&](const Factor&){ ++n; });
    return n;
}

size_t count_factors_file(const std::string& path) {
    size_t n = 0;
    factorize_file_stream(path, [&](const Factor&){ ++n; });
    return n;
}

std::vector<Factor> factorize(std::string_view text) {
    std::vector<Factor> out;
    factorize_stream(text, [&](const Factor& f){ out.push_back(f); });
    return out;
}

std::vector<Factor> factorize_file(const std::string& path, size_t reserve_hint) {
    std::vector<Factor> out;
    if (reserve_hint) out.reserve(reserve_hint);
    factorize_file_stream(path, [&](const Factor& f){ out.push_back(f); });
    return out;
}

size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path) {
    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Collect factors in memory to write header
    std::vector<Factor> factors;
    size_t n = factorize_file_stream(in_path, [&](const Factor& f){
        factors.push_back(f);
    });

    // For non-FASTA files, create minimal header with no sequence names or sentinels
    FactorFileHeader header;
    header.num_factors = factors.size();
    header.num_sequences = 0;  // Unknown for raw text files
    header.num_sentinels = 0;  // No sentinels for raw text files
    header.header_size = sizeof(FactorFileHeader);

    // Write header
    os.write(reinterpret_cast<const char*>(&header), sizeof(header));

    // Write factors
    for (const Factor& f : factors) {
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    }

    return n;
}

template<class Sink>
size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink) {
    std::string tmp(text);
    return nolzss_dna_w_rc(tmp, std::forward<Sink>(sink));
}

template<class Sink>
size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink) {
    std::ifstream is(path, std::ios::binary);
    if (!is) throw std::runtime_error("Cannot open input file: " + path);
    std::string data((std::istreambuf_iterator<char>(is)), std::istreambuf_iterator<char>());
    return nolzss_dna_w_rc(data, std::forward<Sink>(sink));
}

std::vector<Factor> factorize_dna_w_rc(std::string_view text) {
    std::vector<Factor> out;
    factorize_stream_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); });
    return out;
}

std::vector<Factor> factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint) {
    std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
    factorize_file_stream_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); });
    return out;
}

size_t count_factors_dna_w_rc(std::string_view text) {
    size_t n = 0; factorize_stream_dna_w_rc(text, [&](const Factor&){ ++n; }); return n;
}

size_t count_factors_file_dna_w_rc(const std::string& path) {
    size_t n = 0; factorize_file_stream_dna_w_rc(path, [&](const Factor&){ ++n; }); return n;
}

size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path) {
    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Collect factors in memory to write header
    std::vector<Factor> factors;
    size_t n = factorize_file_stream_dna_w_rc(in_path, [&](const Factor& f){
        factors.push_back(f);
    });

    // For single DNA sequence files, create minimal header
    FactorFileHeader header;
    header.num_factors = factors.size();
    header.num_sequences = 1;  // Single DNA sequence
    header.num_sentinels = 0;  // No sentinels for single sequence
    header.header_size = sizeof(FactorFileHeader) + 1; // Empty sequence name

    // Write header
    os.write(reinterpret_cast<const char*>(&header), sizeof(header));

    // Write empty sequence name (single null terminator)
    os.write("\0", 1);

    // Write factors
    for (const Factor& f : factors) {
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    }

    return n;
}

template<class Sink>
size_t factorize_stream_multiple_dna_w_rc(std::string_view text, Sink&& sink) {
    std::string tmp(text);
    return nolzss_multiple_dna_w_rc(tmp, std::forward<Sink>(sink));
}

template<class Sink>
size_t factorize_file_stream_multiple_dna_w_rc(const std::string& path, Sink&& sink) {
    std::ifstream is(path, std::ios::binary);
    if (!is) throw std::runtime_error("Cannot open input file: " + path);
    std::string data((std::istreambuf_iterator<char>(is)), std::istreambuf_iterator<char>());
    return nolzss_multiple_dna_w_rc(data, std::forward<Sink>(sink));
}

std::vector<Factor> factorize_multiple_dna_w_rc(std::string_view text) {
    std::vector<Factor> out;
    factorize_stream_multiple_dna_w_rc(text, [&](const Factor& f){ out.push_back(f); });
    return out;
}

std::vector<Factor> factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint) {
    std::vector<Factor> out; if (reserve_hint) out.reserve(reserve_hint);
    factorize_file_stream_multiple_dna_w_rc(path, [&](const Factor& f){ out.push_back(f); });
    return out;
}

size_t count_factors_multiple_dna_w_rc(std::string_view text) {
    size_t n = 0; factorize_stream_multiple_dna_w_rc(text, [&](const Factor&){ ++n; }); return n;
}

size_t count_factors_file_multiple_dna_w_rc(const std::string& path) {
    size_t n = 0; factorize_file_stream_multiple_dna_w_rc(path, [&](const Factor&){ ++n; }); return n;
}

size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path) {
    // Set up binary output file with buffering
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Collect factors in memory to write header
    std::vector<Factor> factors;
    size_t n = factorize_file_stream_multiple_dna_w_rc(in_path, [&](const Factor& f){
        factors.push_back(f);
    });

    // For multiple DNA sequences from text file, we don't know sequence names or sentinels
    FactorFileHeader header;
    header.num_factors = factors.size();
    header.num_sequences = 0;  // Unknown for raw text files with multiple sequences
    header.num_sentinels = 0;  // Cannot identify sentinels without preparation function
    header.header_size = sizeof(FactorFileHeader);

    // Write header
    os.write(reinterpret_cast<const char*>(&header), sizeof(header));

    // Write factors
    for (const Factor& f : factors) {
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    }

    return n;
}

} // namespace noLZSS