Program Listing for File fasta_processor.cpp

Return to documentation for file (src/cpp/fasta_processor.cpp)

#include "fasta_processor.hpp"
#include "parallel_fasta_processor.hpp"  // For parallel implementations
#include "factorizer.hpp"
#include "factorizer_core.hpp"      // Template implementations in detail:: namespace
#include "factorizer_helpers.hpp"  // For cst_t and lcp
#include <sdsl/rmq_succinct_sct.hpp>
#include <sdsl/construct.hpp>
#include <iostream>
#include <algorithm>
#include <set>
#include <fstream>

namespace noLZSS {

static inline bool is_canonical_dna(char c) {
    return c == 'A' || c == 'C' || c == 'G' || c == 'T' ||
           c == 'a' || c == 'c' || c == 'g' || c == 't';
}

static inline char to_upper_ascii(char c) {
    if (c >= 'a' && c <= 'z') {
        return static_cast<char>(c - 'a' + 'A');
    }
    return c;
}

// Helper function to parse FASTA file into individual sequences and IDs
FastaParseResult parse_fasta_sequences_and_ids(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    std::ifstream file(fasta_path);
    if (!file.is_open()) {
        throw std::runtime_error("Cannot open FASTA file: " + fasta_path);
    }

    FastaParseResult result;
    std::string line;
    std::string current_sequence;
    std::string current_id;
    size_t empty_sequence_count = 0;
    size_t ambiguous_removed_count = 0;

    while (std::getline(file, line)) {
        // Remove trailing whitespace
        while (!line.empty() && std::isspace(line.back())) {
            line.pop_back();
        }

        if (line.empty()) {
            continue; // Skip empty lines
        }

        if (line[0] == '>') {
            // Header line - finish previous sequence if exists
            if (!current_id.empty()) {
                if (!current_sequence.empty()) {
                    result.sequences.push_back(current_sequence);
                    result.sequence_ids.push_back(current_id);
                } else {
                    // Warn about empty sequence
                    std::cerr << "Warning: Skipping empty sequence with ID: " << current_id << std::endl;
                    empty_sequence_count++;
                }
                current_sequence.clear();
            }

            // Parse new header to extract ID
            size_t start = 1; // Skip '>'
            while (start < line.size() && std::isspace(line[start])) {
                start++;
            }
            size_t end = start;
            while (end < line.size() && !std::isspace(line[end])) {
                end++;
            }

            if (start < line.size()) {
                current_id = line.substr(start, end - start);
            } else {
                throw std::runtime_error("Empty sequence header in FASTA file");
            }
        } else {
            // Sequence line - append to current sequence
            for (char c : line) {
                if (!std::isspace(c)) {
                    if (is_canonical_dna(c)) {
                        current_sequence += to_upper_ascii(c);
                    } else if (sanitization_mode == FastaDnaSanitizationMode::Strict) {
                        throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
                                               "' found in sequence with ID: " + current_id);
                    } else {
                        ambiguous_removed_count++;
                    }
                }
            }
        }
    }

    // Add the last sequence if it exists
    if (!current_id.empty()) {
        if (!current_sequence.empty()) {
            result.sequences.push_back(current_sequence);
            result.sequence_ids.push_back(current_id);
        } else {
            // Warn about empty sequence
            std::cerr << "Warning: Skipping empty sequence with ID: " << current_id << std::endl;
            empty_sequence_count++;
        }
    }

    // Report summary if any empty sequences were skipped
    if (empty_sequence_count > 0) {
        std::cerr << "Warning: Skipped " << empty_sequence_count << " empty sequence(s) in FASTA file" << std::endl;
    }
    if (sanitization_mode == FastaDnaSanitizationMode::RemoveAmbiguous && ambiguous_removed_count > 0) {
        std::cerr << "Warning: Removed " << ambiguous_removed_count
                  << " ambiguous nucleotide(s) from FASTA input" << std::endl;
    }

    file.close();

    if (result.sequences.empty()) {
        throw std::runtime_error("No valid sequences found in FASTA file");
    }

    return result;
}

// Helper function to identify sentinel factors from factorization results
std::vector<uint64_t> identify_sentinel_factors(const std::vector<Factor>& factors,
                                                const std::vector<size_t>& sentinel_positions) {
    std::vector<uint64_t> sentinel_factor_indices;
    size_t sentinel_idx = 0;  // Current index in sentinel_positions

    for (size_t i = 0; i < factors.size(); ++i) {
        const Factor& f = factors[i];

        // Advance sentinel index past positions that occur before the current factor start
        while (sentinel_idx < sentinel_positions.size() &&
               sentinel_positions[sentinel_idx] < f.start) {
            sentinel_idx++;
        }

        // Check if this factor's start position matches current sentinel position
        if (sentinel_idx < sentinel_positions.size() &&
            f.start == sentinel_positions[sentinel_idx]) {

            // Sanity checks for sentinel factors
            if (f.length != 1) {
                throw std::runtime_error("Sentinel factor has unexpected length: " + std::to_string(f.length));
            }
            if (f.ref != f.start) {
                throw std::runtime_error("Sentinel factor reference mismatch: ref=" +
                                       std::to_string(f.ref) + ", pos=" + std::to_string(f.start));
            }
            sentinel_factor_indices.push_back(i);
            sentinel_idx++;  // Move to next sentinel position
        }
    }

    return sentinel_factor_indices;
}



FastaReferenceTargetResult prepare_ref_target_dna_no_rc_from_fasta(
    const std::string& reference_fasta_path,
    const std::string& target_fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Process reference FASTA file first
    FastaParseResult ref_parse_result = parse_fasta_sequences_and_ids(reference_fasta_path, sanitization_mode);

    // Calculate target start index BEFORE moving sequences
    size_t target_start_index = 0;
    for (const auto& seq : ref_parse_result.sequences) {
        target_start_index += seq.length() + 1; // +1 for sentinel
    }

    size_t num_ref_sequences = ref_parse_result.sequences.size();

    // Process target FASTA file second
    FastaParseResult target_parse_result = parse_fasta_sequences_and_ids(target_fasta_path, sanitization_mode);

    size_t num_target_sequences = target_parse_result.sequences.size();

    // Reserve and move sequences (avoid copying)
    std::vector<std::string> all_original_sequences;
    all_original_sequences.reserve(num_ref_sequences + num_target_sequences);
    all_original_sequences.insert(all_original_sequences.end(),
                                 std::make_move_iterator(ref_parse_result.sequences.begin()),
                                 std::make_move_iterator(ref_parse_result.sequences.end()));
    all_original_sequences.insert(all_original_sequences.end(),
                                 std::make_move_iterator(target_parse_result.sequences.begin()),
                                 std::make_move_iterator(target_parse_result.sequences.end()));

    // Reserve and move IDs
    std::vector<std::string> all_sequence_ids;
    all_sequence_ids.reserve(num_ref_sequences + num_target_sequences);
    all_sequence_ids.insert(all_sequence_ids.end(),
                           std::make_move_iterator(ref_parse_result.sequence_ids.begin()),
                           std::make_move_iterator(ref_parse_result.sequence_ids.end()));
    all_sequence_ids.insert(all_sequence_ids.end(),
                           std::make_move_iterator(target_parse_result.sequence_ids.begin()),
                           std::make_move_iterator(target_parse_result.sequence_ids.end()));

    if (all_original_sequences.empty()) {
        throw std::runtime_error("No valid sequences found in FASTA files");
    }

    // Use prepare_multiple_dna_sequences_no_rc directly with collected sequences
    return {prepare_multiple_dna_sequences_no_rc(all_original_sequences), all_sequence_ids, num_ref_sequences, num_target_sequences, target_start_index};
}

FastaReferenceTargetResult prepare_ref_target_dna_w_rc_from_fasta(
    const std::string& reference_fasta_path,
    const std::string& target_fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Process reference FASTA file first
    FastaParseResult ref_parse_result = parse_fasta_sequences_and_ids(reference_fasta_path, sanitization_mode);

    // Calculate target start index BEFORE moving sequences
    size_t target_start_index = 0;
    for (const auto& seq : ref_parse_result.sequences) {
        target_start_index += seq.length() + 1; // +1 for sentinel
    }

    size_t num_ref_sequences = ref_parse_result.sequences.size();

    // Process target FASTA file second
    FastaParseResult target_parse_result = parse_fasta_sequences_and_ids(target_fasta_path, sanitization_mode);

    size_t num_target_sequences = target_parse_result.sequences.size();

    // Reserve and move sequences (avoid copying)
    std::vector<std::string> all_original_sequences;
    all_original_sequences.reserve(num_ref_sequences + num_target_sequences);
    all_original_sequences.insert(all_original_sequences.end(),
                                 std::make_move_iterator(ref_parse_result.sequences.begin()),
                                 std::make_move_iterator(ref_parse_result.sequences.end()));
    all_original_sequences.insert(all_original_sequences.end(),
                                 std::make_move_iterator(target_parse_result.sequences.begin()),
                                 std::make_move_iterator(target_parse_result.sequences.end()));

    // Reserve and move IDs
    std::vector<std::string> all_sequence_ids;
    all_sequence_ids.reserve(num_ref_sequences + num_target_sequences);
    all_sequence_ids.insert(all_sequence_ids.end(),
                           std::make_move_iterator(ref_parse_result.sequence_ids.begin()),
                           std::make_move_iterator(ref_parse_result.sequence_ids.end()));
    all_sequence_ids.insert(all_sequence_ids.end(),
                           std::make_move_iterator(target_parse_result.sequence_ids.begin()),
                           std::make_move_iterator(target_parse_result.sequence_ids.end()));

    if (all_original_sequences.empty()) {
        throw std::runtime_error("No valid sequences found in FASTA files");
    }

    // Use prepare_multiple_dna_sequences_w_rc directly with collected sequences
    return {prepare_multiple_dna_sequences_w_rc(all_original_sequences), all_sequence_ids, num_ref_sequences, num_target_sequences, target_start_index};
}



FastaFactorizationResult factorize_fasta_multiple_dna_w_rc(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Parse FASTA file into individual sequences with IDs
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);

    // Prepare sequences for factorization (this will validate nucleotides)
    PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(parse_result.sequences);

    // Perform factorization
    std::vector<Factor> factors = factorize_multiple_dna_w_rc(prep_result.prepared_string);

    // Identify sentinel factors using helper function
    std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors, prep_result.sentinel_positions);

    return {factors, sentinel_factor_indices, parse_result.sequence_ids};
}

FastaFactorizationResult factorize_fasta_multiple_dna_no_rc(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Parse FASTA file into individual sequences with IDs
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);

    // Prepare sequences for factorization (this will validate nucleotides)
    PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_no_rc(parse_result.sequences);

    // Perform factorization using regular factorize function
    std::vector<Factor> factors = factorize(prep_result.prepared_string);

    // Identify sentinel factors using helper function
    std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors, prep_result.sentinel_positions);

    return {factors, sentinel_factor_indices, parse_result.sequence_ids};
}

// Note: Template implementations now in factorizer_core.hpp in the detail:: namespace
// No more local duplicates needed!

size_t write_factors_binary_file_fasta_multiple_dna_w_rc(
    const std::string& fasta_path,
    const std::string& out_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Delegate to parallel version with 1 thread
    return parallel_write_factors_binary_file_fasta_multiple_dna_w_rc(fasta_path, out_path, 1, sanitization_mode);
}

size_t write_factors_binary_file_fasta_multiple_dna_no_rc(
    const std::string& fasta_path,
    const std::string& out_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Delegate to parallel version with 1 thread
    return parallel_write_factors_binary_file_fasta_multiple_dna_no_rc(fasta_path, out_path, 1, sanitization_mode);
}

FastaFactorizationResult factorize_dna_rc_w_ref_fasta_files(
    const std::string& reference_fasta_path,
    const std::string& target_fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Process both FASTA files and get prepared sequence with reverse complement
    FastaReferenceTargetResult ref_target_concat_w_rc = prepare_ref_target_dna_w_rc_from_fasta(
        reference_fasta_path, target_fasta_path, sanitization_mode);

    // Perform factorization starting from the target start index
    std::vector<Factor> factors = factorize_multiple_dna_w_rc(ref_target_concat_w_rc.concatinated_sequences.prepared_string,
                                                             ref_target_concat_w_rc.target_start_index);

    // Identify sentinel factors using helper function
    std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors,
                                                                             ref_target_concat_w_rc.concatinated_sequences.sentinel_positions);

    return {factors, sentinel_factor_indices, ref_target_concat_w_rc.sequence_ids};
}


size_t write_factors_dna_w_reference_fasta_files_to_binary(
    const std::string& reference_fasta_path,
    const std::string& target_fasta_path,
    const std::string& out_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Delegate to parallel version with 1 thread
    return parallel_write_factors_dna_w_reference_fasta_files_to_binary(
        reference_fasta_path, target_fasta_path, out_path, 1, sanitization_mode);
}

FastaPerSequenceFactorizationResult factorize_fasta_dna_w_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Delegate to parallel version with 1 thread
    // Parse FASTA file into individual sequences with IDs
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);

    FastaPerSequenceFactorizationResult result;
    result.sequence_ids = std::move(parse_result.sequence_ids);
    result.per_sequence_factors.reserve(parse_result.sequences.size());

    // Process sequentially (equivalent to parallel with 1 thread, but no threading overhead)
    for (const auto& sequence : parse_result.sequences) {
        std::vector<std::string> single_seq = {sequence};
        PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(single_seq);
        std::vector<Factor> factors = factorize_multiple_dna_w_rc(prep_result.prepared_string);
        result.per_sequence_factors.push_back(std::move(factors));
    }

    return result;
}

FastaPerSequenceFactorizationResult factorize_fasta_dna_no_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Delegate to parallel version with 1 thread
    // Parse FASTA file into individual sequences with IDs
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);

    FastaPerSequenceFactorizationResult result;
    result.sequence_ids = std::move(parse_result.sequence_ids);
    result.per_sequence_factors.reserve(parse_result.sequences.size());

    // Process sequentially (equivalent to parallel with 1 thread, but no threading overhead)
    for (const auto& sequence : parse_result.sequences) {
        std::vector<std::string> single_seq = {sequence};
        PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_no_rc(single_seq);
        std::string seq_without_sentinel = prep_result.prepared_string.substr(0, prep_result.prepared_string.length() - 1);
        std::vector<Factor> factors = factorize(seq_without_sentinel);
        result.per_sequence_factors.push_back(std::move(factors));
    }

    return result;
}

size_t write_factors_binary_file_fasta_dna_w_rc_per_sequence(
    const std::string& fasta_path,
    const std::string& out_dir,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Delegate to parallel version with 1 thread
    return parallel_write_factors_binary_file_fasta_dna_w_rc_per_sequence(fasta_path, out_dir, 1, sanitization_mode);
}

size_t write_factors_binary_file_fasta_dna_no_rc_per_sequence(
    const std::string& fasta_path,
    const std::string& out_dir,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Delegate to parallel version with 1 thread
    return parallel_write_factors_binary_file_fasta_dna_no_rc_per_sequence(fasta_path, out_dir, 1, sanitization_mode);
}

FastaPerSequenceCountResult count_factors_fasta_dna_w_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Parse FASTA file into individual sequences
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);

    FastaPerSequenceCountResult result;
    result.sequence_ids = parse_result.sequence_ids;
    result.factor_counts.reserve(parse_result.sequences.size());

    // Count factors for each sequence independently
    for (const auto& sequence : parse_result.sequences) {
        // Prepare single DNA sequence with reverse complement
        std::vector<std::string> single_seq = {sequence};
        PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(single_seq);

        // Count factors using count_factors_multiple_dna_w_rc
        size_t count = count_factors_multiple_dna_w_rc(prep_result.prepared_string);
        result.factor_counts.push_back(count);
        result.total_factors += count;
    }

    return result;
}

FastaPerSequenceCountResult count_factors_fasta_dna_no_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode
) {
    // Parse FASTA file into individual sequences
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);

    FastaPerSequenceCountResult result;
    result.sequence_ids = parse_result.sequence_ids;
    result.factor_counts.reserve(parse_result.sequences.size());

    // Count factors for each sequence independently
    for (const auto& sequence : parse_result.sequences) {
        // Prepare single DNA sequence (no reverse complement)
        std::vector<std::string> single_seq = {sequence};
        PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_no_rc(single_seq);

        // Remove the sentinel at the end
        std::string seq_without_sentinel = prep_result.prepared_string.substr(0, prep_result.prepared_string.length() - 1);

        // Count factors
        size_t count = count_factors(seq_without_sentinel);
        result.factor_counts.push_back(count);
        result.total_factors += count;
    }

    return result;
}

} // namespace noLZSS