Program Listing for File fasta_processor.cpp

Return to documentation for file (src/cpp/fasta_processor.cpp)

#include "fasta_processor.hpp"
#include "factorizer.hpp"
#include <iostream>
#include <algorithm>
#include <set>

namespace noLZSS {

// Helper function to parse FASTA file into individual sequences and IDs
static FastaParseResult parse_fasta_sequences_and_ids(const std::string& fasta_path) {
    std::ifstream file(fasta_path);
    if (!file.is_open()) {
        throw std::runtime_error("Cannot open FASTA file: " + fasta_path);
    }

    FastaParseResult result;
    std::string line;
    std::string current_sequence;
    std::string current_id;

    while (std::getline(file, line)) {
        // Remove trailing whitespace
        while (!line.empty() && std::isspace(line.back())) {
            line.pop_back();
        }

        if (line.empty()) {
            continue; // Skip empty lines
        }

        if (line[0] == '>') {
            // Header line - finish previous sequence if exists
            if (!current_sequence.empty()) {
                result.sequences.push_back(current_sequence);
                result.sequence_ids.push_back(current_id);
                current_sequence.clear();
            }

            // Parse new header to extract ID
            size_t start = 1; // Skip '>'
            while (start < line.size() && std::isspace(line[start])) {
                start++;
            }
            size_t end = start;
            while (end < line.size() && !std::isspace(line[end])) {
                end++;
            }

            if (start < line.size()) {
                current_id = line.substr(start, end - start);
            } else {
                throw std::runtime_error("Empty sequence header in FASTA file");
            }
        } else {
            // Sequence line - append to current sequence
            for (char c : line) {
                if (!std::isspace(c)) {
                    current_sequence += c;
                }
            }
        }
    }

    // Add the last sequence if it exists
    if (!current_sequence.empty()) {
        result.sequences.push_back(current_sequence);
        result.sequence_ids.push_back(current_id);
    }

    file.close();

    if (result.sequences.empty()) {
        throw std::runtime_error("No valid sequences found in FASTA file");
    }

    return result;
}

// Legacy helper function to maintain compatibility with existing code
static std::vector<std::string> parse_fasta_sequences(const std::string& fasta_path) {
    FastaParseResult result = parse_fasta_sequences_and_ids(fasta_path);
    return result.sequences;
}

// Helper function to identify sentinel factors from factorization results
static std::vector<uint64_t> identify_sentinel_factors(const std::vector<Factor>& factors,
                                                      const std::vector<size_t>& sentinel_positions) {
    std::vector<uint64_t> sentinel_factor_indices;
    size_t sentinel_idx = 0;  // Current index in sentinel_positions

    for (size_t i = 0; i < factors.size(); ++i) {
        const Factor& f = factors[i];

        // Check if this factor's start position matches current sentinel position
        if (sentinel_idx < sentinel_positions.size() &&
            f.start == sentinel_positions[sentinel_idx]) {

            // Sanity checks for sentinel factors
            if (f.length != 1) {
                throw std::runtime_error("Sentinel factor has unexpected length: " + std::to_string(f.length));
            }
            if (f.ref != f.start) {
                throw std::runtime_error("Sentinel factor reference mismatch: ref=" +
                                       std::to_string(f.ref) + ", pos=" + std::to_string(f.start));
            }
            sentinel_factor_indices.push_back(i);
            sentinel_idx++;  // Move to next sentinel position
        }
    }

    return sentinel_factor_indices;
}

FastaProcessResult process_nucleotide_fasta(const std::string& fasta_path) {
    std::ifstream file(fasta_path);
    if (!file.is_open()) {
        throw std::runtime_error("Cannot open FASTA file: " + fasta_path);
    }

    FastaProcessResult result;
    result.num_sequences = 0;  // Initialize to 0 to avoid garbage values
    result.sequence.reserve(1024 * 1024); // Start with 1MB reservation

    std::string line;
    std::string current_id;
    size_t current_seq_length = 0;
    size_t current_seq_start = 0;
    bool in_sequence = false;

    // Generate sentinel characters (1-251, avoiding 0, A=65, C=67, G=71, T=84)
    auto get_sentinel = [](size_t seq_index) -> char {
        if (seq_index >= 251) {
            throw std::runtime_error("Too many sequences in FASTA file (max 251)");
        }
        char sentinel = static_cast<char>(seq_index + 1);
        // Skip nucleotide characters
        while (sentinel == 65 || sentinel == 67 || sentinel == 71 || sentinel == 84) { // A, C, G, T
            sentinel++;
            if (sentinel > 251) {
                throw std::runtime_error("Sentinel generation failed - too many sequences");
            }
        }
        return sentinel;
    };

    while (std::getline(file, line)) {
        // Remove trailing whitespace
        while (!line.empty() && std::isspace(line.back())) {
            line.pop_back();
        }

        if (line.empty()) {
            continue; // Skip empty lines
        }

        if (line[0] == '>') {
            // Header line - finish previous sequence if exists
            if (in_sequence && !current_id.empty()) {
                if (current_seq_length > 0) {
                    result.sequence_ids.push_back(current_id);
                    result.sequence_lengths.push_back(current_seq_length);
                    result.sequence_positions.push_back(current_seq_start);
                    result.num_sequences++;

                    // Add sentinel after sequence (except for last sequence)
                    char sentinel = get_sentinel(result.num_sequences - 1);
                    result.sequence.push_back(sentinel);
                }
            }

            // Parse new header
            size_t start = 1; // Skip '>'
            while (start < line.size() && std::isspace(line[start])) {
                start++;
            }
            size_t end = start;
            while (end < line.size() && !std::isspace(line[end])) {
                end++;
            }

            if (start < line.size()) {
                current_id = line.substr(start, end - start);
                current_seq_length = 0;
                current_seq_start = result.sequence.size();
                in_sequence = true;
            } else {
                throw std::runtime_error("Empty sequence header in FASTA file");
            }
        } else if (in_sequence) {
            // Sequence line - process each character
            for (char c : line) {
                if (std::isspace(c)) {
                    continue; // Skip whitespace
                }

                char upper_c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
                if (upper_c == 'A' || upper_c == 'C' || upper_c == 'G' || upper_c == 'T') {
                    result.sequence.push_back(upper_c);
                    current_seq_length++;
                } else {
                    throw std::runtime_error("Invalid nucleotide character '" + std::string(1, c) +
                                           "' in sequence " + current_id);
                }
            }
        }
    }

    // Finish last sequence
    if (in_sequence && !current_id.empty() && current_seq_length > 0) {
        result.sequence_ids.push_back(current_id);
        result.sequence_lengths.push_back(current_seq_length);
        result.sequence_positions.push_back(current_seq_start);
        result.num_sequences++;
    }

    if (result.num_sequences == 0) {
        throw std::runtime_error("No valid sequences found in FASTA file");
    }

    file.close();
    return result;
}

FastaProcessResult process_amino_acid_fasta(const std::string& fasta_path) {
    std::ifstream file(fasta_path);
    if (!file.is_open()) {
        throw std::runtime_error("Cannot open FASTA file: " + fasta_path);
    }

    FastaProcessResult result;
    result.num_sequences = 0;  // Initialize to 0 to avoid garbage values
    result.sequence.reserve(1024 * 1024); // Start with 1MB reservation

    std::string line;
    std::string current_id;
    size_t current_seq_length = 0;
    size_t current_seq_start = 0;
    bool in_sequence = false;

    // Generate sentinel characters (1-251, avoiding 0 and amino acids)
    auto get_sentinel = [](size_t seq_index) -> char {
        if (seq_index >= 235) {  // 256 - 20 amino acids - 1 null = 235 available sentinels
            throw std::runtime_error("Too many sequences in FASTA file (max 235)");
        }

        // Amino acid ASCII values to avoid
        const std::string amino_acids = "ACDEFGHIKLMNPQRSTVWY";
        char sentinel = static_cast<char>(seq_index + 1);

        // Find next available character that's not an amino acid
        while (amino_acids.find(sentinel) != std::string::npos || sentinel == 0) {
            sentinel++;
            if (sentinel > 235) {
                throw std::runtime_error("Sentinel generation failed - no available characters");
            }
        }

        return sentinel;
    };

    // Canonical amino acids (20 standard)
    const std::string valid_aa = "ACDEFGHIKLMNPQRSTVWY";

    while (std::getline(file, line)) {
        // Remove trailing whitespace
        while (!line.empty() && std::isspace(line.back())) {
            line.pop_back();
        }

        if (line.empty()) {
            continue; // Skip empty lines
        }

        if (line[0] == '>') {
            // Header line - finish previous sequence if exists
            if (in_sequence && !current_id.empty()) {
                if (current_seq_length > 0) {
                    result.sequence_ids.push_back(current_id);
                    result.sequence_lengths.push_back(current_seq_length);
                    result.sequence_positions.push_back(current_seq_start);
                    result.num_sequences++;

                    // Add sentinel after sequence (except for last sequence)
                    char sentinel = get_sentinel(result.num_sequences - 1);
                    result.sequence.push_back(sentinel);
                }
            }

            // Parse new header
            size_t start = 1; // Skip '>'
            while (start < line.size() && std::isspace(line[start])) {
                start++;
            }
            size_t end = start;
            while (end < line.size() && !std::isspace(line[end])) {
                end++;
            }

            if (start < line.size()) {
                current_id = line.substr(start, end - start);
                current_seq_length = 0;
                current_seq_start = result.sequence.size();
                in_sequence = true;
            } else {
                throw std::runtime_error("Empty sequence header in FASTA file");
            }
        } else if (in_sequence) {
            // Sequence line - process each character
            for (char c : line) {
                if (std::isspace(c)) {
                    continue; // Skip whitespace
                }

                char upper_c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
                if (valid_aa.find(upper_c) != std::string::npos) {
                    result.sequence.push_back(upper_c);
                    current_seq_length++;
                } else {
                    throw std::runtime_error("Invalid amino acid character '" + std::string(1, c) +
                                           "' in sequence " + current_id +
                                           ". Only canonical amino acids (ACDEFGHIKLMNPQRSTVWY) are allowed.");
                }
            }
        }
    }

    // Finish last sequence
    if (in_sequence && !current_id.empty() && current_seq_length > 0) {
        result.sequence_ids.push_back(current_id);
        result.sequence_lengths.push_back(current_seq_length);
        result.sequence_positions.push_back(current_seq_start);
        result.num_sequences++;
    }

    if (result.num_sequences == 0) {
        throw std::runtime_error("No valid sequences found in FASTA file");
    }

    file.close();
    return result;
}

FastaFactorizationResult factorize_fasta_multiple_dna_w_rc(const std::string& fasta_path) {
    // Parse FASTA file into individual sequences
    std::vector<std::string> sequences = parse_fasta_sequences(fasta_path);

    // Prepare sequences for factorization (this will validate nucleotides)
    PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(sequences);

    // Perform factorization
    std::vector<Factor> factors = factorize_multiple_dna_w_rc(prep_result.prepared_string);

    // Identify sentinel factors using helper function
    std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors, prep_result.sentinel_positions);

    return {factors, sentinel_factor_indices};
}

FastaFactorizationResult factorize_fasta_multiple_dna_no_rc(const std::string& fasta_path) {
    // Parse FASTA file into individual sequences
    std::vector<std::string> sequences = parse_fasta_sequences(fasta_path);

    // Prepare sequences for factorization (this will validate nucleotides)
    PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_no_rc(sequences);

    // Perform factorization using regular factorize function
    std::vector<Factor> factors = factorize(prep_result.prepared_string);

    // Identify sentinel factors using helper function
    std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors, prep_result.sentinel_positions);

    return {factors, sentinel_factor_indices};
}

size_t write_factors_binary_file_fasta_multiple_dna_w_rc(const std::string& fasta_path, const std::string& out_path) {
    // Parse FASTA file into sequences and IDs in one pass
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path);

    // Get factorization result with sentinel information
    FastaFactorizationResult factorization_result = factorize_fasta_multiple_dna_w_rc(fasta_path);

    // Calculate header size
    size_t names_size = 0;
    for (const auto& name : parse_result.sequence_ids) {
        names_size += name.length() + 1;  // +1 for null terminator
    }

    size_t header_size = sizeof(FactorFileHeader) + names_size +
                        factorization_result.sentinel_factor_indices.size() * sizeof(uint64_t);

    // Write to file
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Write header
    FactorFileHeader header;
    header.num_factors = factorization_result.factors.size();
    header.num_sequences = parse_result.sequence_ids.size();
    header.num_sentinels = factorization_result.sentinel_factor_indices.size();
    header.header_size = header_size;

    os.write(reinterpret_cast<const char*>(&header), sizeof(header));

    // Write sequence names
    for (const auto& name : parse_result.sequence_ids) {
        os.write(name.c_str(), name.length() + 1);  // Include null terminator
    }

    // Write sentinel factor indices
    for (uint64_t idx : factorization_result.sentinel_factor_indices) {
        os.write(reinterpret_cast<const char*>(&idx), sizeof(idx));
    }

    // Write factors (existing format)
    for (const Factor& f : factorization_result.factors) {
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    }

    return factorization_result.factors.size();
}

size_t write_factors_binary_file_fasta_multiple_dna_no_rc(const std::string& fasta_path, const std::string& out_path) {
    // Parse FASTA file into sequences and IDs in one pass
    FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path);

    // Get factorization result with sentinel information
    FastaFactorizationResult factorization_result = factorize_fasta_multiple_dna_no_rc(fasta_path);

    // Calculate header size
    size_t names_size = 0;
    for (const auto& name : parse_result.sequence_ids) {
        names_size += name.length() + 1;  // +1 for null terminator
    }

    size_t header_size = sizeof(FactorFileHeader) + names_size +
                        factorization_result.sentinel_factor_indices.size() * sizeof(uint64_t);

    // Write to file
    std::ofstream os(out_path, std::ios::binary);
    if (!os) {
        throw std::runtime_error("Cannot create output file: " + out_path);
    }

    std::vector<char> buf(1<<20); // 1 MB buffer for performance
    os.rdbuf()->pubsetbuf(buf.data(), static_cast<std::streamsize>(buf.size()));

    // Write header
    FactorFileHeader header;
    header.num_factors = factorization_result.factors.size();
    header.num_sequences = parse_result.sequence_ids.size();
    header.num_sentinels = factorization_result.sentinel_factor_indices.size();
    header.header_size = header_size;

    os.write(reinterpret_cast<const char*>(&header), sizeof(header));

    // Write sequence names
    for (const auto& name : parse_result.sequence_ids) {
        os.write(name.c_str(), name.length() + 1);  // Include null terminator
    }

    // Write sentinel factor indices
    for (uint64_t idx : factorization_result.sentinel_factor_indices) {
        os.write(reinterpret_cast<const char*>(&idx), sizeof(idx));
    }

    // Write factors (existing format)
    for (const Factor& f : factorization_result.factors) {
        os.write(reinterpret_cast<const char*>(&f), sizeof(Factor));
    }

    return factorization_result.factors.size();
}

} // namespace noLZSS