Program Listing for File fasta_processor.cpp
↰ Return to documentation for file (src/cpp/fasta_processor.cpp)
#include "fasta_processor.hpp"
#include "parallel_fasta_processor.hpp" // For parallel implementations
#include "factorizer.hpp"
#include "factorizer_core.hpp" // Template implementations in detail:: namespace
#include "factorizer_helpers.hpp" // For cst_t and lcp
#include <sdsl/rmq_succinct_sct.hpp>
#include <sdsl/construct.hpp>
#include <iostream>
#include <algorithm>
#include <set>
#include <fstream>
namespace noLZSS {
static inline bool is_canonical_dna(char c) {
return c == 'A' || c == 'C' || c == 'G' || c == 'T' ||
c == 'a' || c == 'c' || c == 'g' || c == 't';
}
static inline char to_upper_ascii(char c) {
if (c >= 'a' && c <= 'z') {
return static_cast<char>(c - 'a' + 'A');
}
return c;
}
// Helper function to parse FASTA file into individual sequences and IDs
FastaParseResult parse_fasta_sequences_and_ids(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
std::ifstream file(fasta_path);
if (!file.is_open()) {
throw std::runtime_error("Cannot open FASTA file: " + fasta_path);
}
FastaParseResult result;
std::string line;
std::string current_sequence;
std::string current_id;
size_t empty_sequence_count = 0;
size_t ambiguous_removed_count = 0;
while (std::getline(file, line)) {
// Remove trailing whitespace
while (!line.empty() && std::isspace(line.back())) {
line.pop_back();
}
if (line.empty()) {
continue; // Skip empty lines
}
if (line[0] == '>') {
// Header line - finish previous sequence if exists
if (!current_id.empty()) {
if (!current_sequence.empty()) {
result.sequences.push_back(current_sequence);
result.sequence_ids.push_back(current_id);
} else {
// Warn about empty sequence
std::cerr << "Warning: Skipping empty sequence with ID: " << current_id << std::endl;
empty_sequence_count++;
}
current_sequence.clear();
}
// Parse new header to extract ID
size_t start = 1; // Skip '>'
while (start < line.size() && std::isspace(line[start])) {
start++;
}
size_t end = start;
while (end < line.size() && !std::isspace(line[end])) {
end++;
}
if (start < line.size()) {
current_id = line.substr(start, end - start);
} else {
throw std::runtime_error("Empty sequence header in FASTA file");
}
} else {
// Sequence line - append to current sequence
for (char c : line) {
if (!std::isspace(c)) {
if (is_canonical_dna(c)) {
current_sequence += to_upper_ascii(c);
} else if (sanitization_mode == FastaDnaSanitizationMode::Strict) {
throw std::runtime_error("Invalid nucleotide '" + std::string(1, c) +
"' found in sequence with ID: " + current_id);
} else {
ambiguous_removed_count++;
}
}
}
}
}
// Add the last sequence if it exists
if (!current_id.empty()) {
if (!current_sequence.empty()) {
result.sequences.push_back(current_sequence);
result.sequence_ids.push_back(current_id);
} else {
// Warn about empty sequence
std::cerr << "Warning: Skipping empty sequence with ID: " << current_id << std::endl;
empty_sequence_count++;
}
}
// Report summary if any empty sequences were skipped
if (empty_sequence_count > 0) {
std::cerr << "Warning: Skipped " << empty_sequence_count << " empty sequence(s) in FASTA file" << std::endl;
}
if (sanitization_mode == FastaDnaSanitizationMode::RemoveAmbiguous && ambiguous_removed_count > 0) {
std::cerr << "Warning: Removed " << ambiguous_removed_count
<< " ambiguous nucleotide(s) from FASTA input" << std::endl;
}
file.close();
if (result.sequences.empty()) {
throw std::runtime_error("No valid sequences found in FASTA file");
}
return result;
}
// Helper function to identify sentinel factors from factorization results
std::vector<uint64_t> identify_sentinel_factors(const std::vector<Factor>& factors,
const std::vector<size_t>& sentinel_positions) {
std::vector<uint64_t> sentinel_factor_indices;
size_t sentinel_idx = 0; // Current index in sentinel_positions
for (size_t i = 0; i < factors.size(); ++i) {
const Factor& f = factors[i];
// Advance sentinel index past positions that occur before the current factor start
while (sentinel_idx < sentinel_positions.size() &&
sentinel_positions[sentinel_idx] < f.start) {
sentinel_idx++;
}
// Check if this factor's start position matches current sentinel position
if (sentinel_idx < sentinel_positions.size() &&
f.start == sentinel_positions[sentinel_idx]) {
// Sanity checks for sentinel factors
if (f.length != 1) {
throw std::runtime_error("Sentinel factor has unexpected length: " + std::to_string(f.length));
}
if (f.ref != f.start) {
throw std::runtime_error("Sentinel factor reference mismatch: ref=" +
std::to_string(f.ref) + ", pos=" + std::to_string(f.start));
}
sentinel_factor_indices.push_back(i);
sentinel_idx++; // Move to next sentinel position
}
}
return sentinel_factor_indices;
}
FastaReferenceTargetResult prepare_ref_target_dna_no_rc_from_fasta(
const std::string& reference_fasta_path,
const std::string& target_fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Process reference FASTA file first
FastaParseResult ref_parse_result = parse_fasta_sequences_and_ids(reference_fasta_path, sanitization_mode);
// Calculate target start index BEFORE moving sequences
size_t target_start_index = 0;
for (const auto& seq : ref_parse_result.sequences) {
target_start_index += seq.length() + 1; // +1 for sentinel
}
size_t num_ref_sequences = ref_parse_result.sequences.size();
// Process target FASTA file second
FastaParseResult target_parse_result = parse_fasta_sequences_and_ids(target_fasta_path, sanitization_mode);
size_t num_target_sequences = target_parse_result.sequences.size();
// Reserve and move sequences (avoid copying)
std::vector<std::string> all_original_sequences;
all_original_sequences.reserve(num_ref_sequences + num_target_sequences);
all_original_sequences.insert(all_original_sequences.end(),
std::make_move_iterator(ref_parse_result.sequences.begin()),
std::make_move_iterator(ref_parse_result.sequences.end()));
all_original_sequences.insert(all_original_sequences.end(),
std::make_move_iterator(target_parse_result.sequences.begin()),
std::make_move_iterator(target_parse_result.sequences.end()));
// Reserve and move IDs
std::vector<std::string> all_sequence_ids;
all_sequence_ids.reserve(num_ref_sequences + num_target_sequences);
all_sequence_ids.insert(all_sequence_ids.end(),
std::make_move_iterator(ref_parse_result.sequence_ids.begin()),
std::make_move_iterator(ref_parse_result.sequence_ids.end()));
all_sequence_ids.insert(all_sequence_ids.end(),
std::make_move_iterator(target_parse_result.sequence_ids.begin()),
std::make_move_iterator(target_parse_result.sequence_ids.end()));
if (all_original_sequences.empty()) {
throw std::runtime_error("No valid sequences found in FASTA files");
}
// Use prepare_multiple_dna_sequences_no_rc directly with collected sequences
return {prepare_multiple_dna_sequences_no_rc(all_original_sequences), all_sequence_ids, num_ref_sequences, num_target_sequences, target_start_index};
}
FastaReferenceTargetResult prepare_ref_target_dna_w_rc_from_fasta(
const std::string& reference_fasta_path,
const std::string& target_fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Process reference FASTA file first
FastaParseResult ref_parse_result = parse_fasta_sequences_and_ids(reference_fasta_path, sanitization_mode);
// Calculate target start index BEFORE moving sequences
size_t target_start_index = 0;
for (const auto& seq : ref_parse_result.sequences) {
target_start_index += seq.length() + 1; // +1 for sentinel
}
size_t num_ref_sequences = ref_parse_result.sequences.size();
// Process target FASTA file second
FastaParseResult target_parse_result = parse_fasta_sequences_and_ids(target_fasta_path, sanitization_mode);
size_t num_target_sequences = target_parse_result.sequences.size();
// Reserve and move sequences (avoid copying)
std::vector<std::string> all_original_sequences;
all_original_sequences.reserve(num_ref_sequences + num_target_sequences);
all_original_sequences.insert(all_original_sequences.end(),
std::make_move_iterator(ref_parse_result.sequences.begin()),
std::make_move_iterator(ref_parse_result.sequences.end()));
all_original_sequences.insert(all_original_sequences.end(),
std::make_move_iterator(target_parse_result.sequences.begin()),
std::make_move_iterator(target_parse_result.sequences.end()));
// Reserve and move IDs
std::vector<std::string> all_sequence_ids;
all_sequence_ids.reserve(num_ref_sequences + num_target_sequences);
all_sequence_ids.insert(all_sequence_ids.end(),
std::make_move_iterator(ref_parse_result.sequence_ids.begin()),
std::make_move_iterator(ref_parse_result.sequence_ids.end()));
all_sequence_ids.insert(all_sequence_ids.end(),
std::make_move_iterator(target_parse_result.sequence_ids.begin()),
std::make_move_iterator(target_parse_result.sequence_ids.end()));
if (all_original_sequences.empty()) {
throw std::runtime_error("No valid sequences found in FASTA files");
}
// Use prepare_multiple_dna_sequences_w_rc directly with collected sequences
return {prepare_multiple_dna_sequences_w_rc(all_original_sequences), all_sequence_ids, num_ref_sequences, num_target_sequences, target_start_index};
}
FastaFactorizationResult factorize_fasta_multiple_dna_w_rc(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Parse FASTA file into individual sequences with IDs
FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);
// Prepare sequences for factorization (this will validate nucleotides)
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(parse_result.sequences);
// Perform factorization
std::vector<Factor> factors = factorize_multiple_dna_w_rc(prep_result.prepared_string);
// Identify sentinel factors using helper function
std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors, prep_result.sentinel_positions);
return {factors, sentinel_factor_indices, parse_result.sequence_ids};
}
FastaFactorizationResult factorize_fasta_multiple_dna_no_rc(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Parse FASTA file into individual sequences with IDs
FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);
// Prepare sequences for factorization (this will validate nucleotides)
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_no_rc(parse_result.sequences);
// Perform factorization using regular factorize function
std::vector<Factor> factors = factorize(prep_result.prepared_string);
// Identify sentinel factors using helper function
std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors, prep_result.sentinel_positions);
return {factors, sentinel_factor_indices, parse_result.sequence_ids};
}
// Note: Template implementations now in factorizer_core.hpp in the detail:: namespace
// No more local duplicates needed!
size_t write_factors_binary_file_fasta_multiple_dna_w_rc(
const std::string& fasta_path,
const std::string& out_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Delegate to parallel version with 1 thread
return parallel_write_factors_binary_file_fasta_multiple_dna_w_rc(fasta_path, out_path, 1, sanitization_mode);
}
size_t write_factors_binary_file_fasta_multiple_dna_no_rc(
const std::string& fasta_path,
const std::string& out_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Delegate to parallel version with 1 thread
return parallel_write_factors_binary_file_fasta_multiple_dna_no_rc(fasta_path, out_path, 1, sanitization_mode);
}
FastaFactorizationResult factorize_dna_rc_w_ref_fasta_files(
const std::string& reference_fasta_path,
const std::string& target_fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Process both FASTA files and get prepared sequence with reverse complement
FastaReferenceTargetResult ref_target_concat_w_rc = prepare_ref_target_dna_w_rc_from_fasta(
reference_fasta_path, target_fasta_path, sanitization_mode);
// Perform factorization starting from the target start index
std::vector<Factor> factors = factorize_multiple_dna_w_rc(ref_target_concat_w_rc.concatinated_sequences.prepared_string,
ref_target_concat_w_rc.target_start_index);
// Identify sentinel factors using helper function
std::vector<uint64_t> sentinel_factor_indices = identify_sentinel_factors(factors,
ref_target_concat_w_rc.concatinated_sequences.sentinel_positions);
return {factors, sentinel_factor_indices, ref_target_concat_w_rc.sequence_ids};
}
size_t write_factors_dna_w_reference_fasta_files_to_binary(
const std::string& reference_fasta_path,
const std::string& target_fasta_path,
const std::string& out_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Delegate to parallel version with 1 thread
return parallel_write_factors_dna_w_reference_fasta_files_to_binary(
reference_fasta_path, target_fasta_path, out_path, 1, sanitization_mode);
}
FastaPerSequenceFactorizationResult factorize_fasta_dna_w_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Delegate to parallel version with 1 thread
// Parse FASTA file into individual sequences with IDs
FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);
FastaPerSequenceFactorizationResult result;
result.sequence_ids = std::move(parse_result.sequence_ids);
result.per_sequence_factors.reserve(parse_result.sequences.size());
// Process sequentially (equivalent to parallel with 1 thread, but no threading overhead)
for (const auto& sequence : parse_result.sequences) {
std::vector<std::string> single_seq = {sequence};
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(single_seq);
std::vector<Factor> factors = factorize_multiple_dna_w_rc(prep_result.prepared_string);
result.per_sequence_factors.push_back(std::move(factors));
}
return result;
}
FastaPerSequenceFactorizationResult factorize_fasta_dna_no_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Delegate to parallel version with 1 thread
// Parse FASTA file into individual sequences with IDs
FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);
FastaPerSequenceFactorizationResult result;
result.sequence_ids = std::move(parse_result.sequence_ids);
result.per_sequence_factors.reserve(parse_result.sequences.size());
// Process sequentially (equivalent to parallel with 1 thread, but no threading overhead)
for (const auto& sequence : parse_result.sequences) {
std::vector<std::string> single_seq = {sequence};
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_no_rc(single_seq);
std::string seq_without_sentinel = prep_result.prepared_string.substr(0, prep_result.prepared_string.length() - 1);
std::vector<Factor> factors = factorize(seq_without_sentinel);
result.per_sequence_factors.push_back(std::move(factors));
}
return result;
}
size_t write_factors_binary_file_fasta_dna_w_rc_per_sequence(
const std::string& fasta_path,
const std::string& out_dir,
FastaDnaSanitizationMode sanitization_mode
) {
// Delegate to parallel version with 1 thread
return parallel_write_factors_binary_file_fasta_dna_w_rc_per_sequence(fasta_path, out_dir, 1, sanitization_mode);
}
size_t write_factors_binary_file_fasta_dna_no_rc_per_sequence(
const std::string& fasta_path,
const std::string& out_dir,
FastaDnaSanitizationMode sanitization_mode
) {
// Delegate to parallel version with 1 thread
return parallel_write_factors_binary_file_fasta_dna_no_rc_per_sequence(fasta_path, out_dir, 1, sanitization_mode);
}
FastaPerSequenceCountResult count_factors_fasta_dna_w_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Parse FASTA file into individual sequences
FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);
FastaPerSequenceCountResult result;
result.sequence_ids = parse_result.sequence_ids;
result.factor_counts.reserve(parse_result.sequences.size());
// Count factors for each sequence independently
for (const auto& sequence : parse_result.sequences) {
// Prepare single DNA sequence with reverse complement
std::vector<std::string> single_seq = {sequence};
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_w_rc(single_seq);
// Count factors using count_factors_multiple_dna_w_rc
size_t count = count_factors_multiple_dna_w_rc(prep_result.prepared_string);
result.factor_counts.push_back(count);
result.total_factors += count;
}
return result;
}
FastaPerSequenceCountResult count_factors_fasta_dna_no_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode
) {
// Parse FASTA file into individual sequences
FastaParseResult parse_result = parse_fasta_sequences_and_ids(fasta_path, sanitization_mode);
FastaPerSequenceCountResult result;
result.sequence_ids = parse_result.sequence_ids;
result.factor_counts.reserve(parse_result.sequences.size());
// Count factors for each sequence independently
for (const auto& sequence : parse_result.sequences) {
// Prepare single DNA sequence (no reverse complement)
std::vector<std::string> single_seq = {sequence};
PreparedSequenceResult prep_result = prepare_multiple_dna_sequences_no_rc(single_seq);
// Remove the sentinel at the end
std::string seq_without_sentinel = prep_result.prepared_string.substr(0, prep_result.prepared_string.length() - 1);
// Count factors
size_t count = count_factors(seq_without_sentinel);
result.factor_counts.push_back(count);
result.total_factors += count;
}
return result;
}
} // namespace noLZSS