Program Listing for File fasta_processor.hpp
↰ Return to documentation for file (src/cpp/fasta_processor.hpp)
#pragma once
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>
#include "factorizer.hpp"
namespace noLZSS {
/**
 * @brief Selects how DNA read from FASTA input is sanitized.
 *
 * Every function in this header that takes a FASTA path also accepts this
 * mode as an optional trailing parameter defaulting to RemoveAmbiguous.
 *
 * NOTE(review): the exact behavior of each mode is defined by the
 * implementation, which is not visible from this header. The enumerator names
 * suggest that RemoveAmbiguous drops ambiguous nucleotide symbols while Strict
 * rejects input containing them — confirm before relying on it.
 */
enum class FastaDnaSanitizationMode {
RemoveAmbiguous,
Strict
};
/**
 * @brief Return type of parse_fasta_sequences_and_ids().
 *
 * NOTE(review): the two vectors are presumably parallel, i.e.
 * sequence_ids[i] names sequences[i] — confirm against the parser.
 */
struct FastaParseResult {
/// Sequence strings.
std::vector<std::string> sequences;
/// Sequence identifier strings.
std::vector<std::string> sequence_ids;
};
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): judging by the name and signature, this presumably reads the
 * FASTA file at @p fasta_path and returns its sequences together with their
 * identifiers — confirm against the implementation, including how errors
 * (missing file, invalid characters) are reported.
 *
 * @param fasta_path Path to the FASTA file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaParseResult holding the `sequences` and `sequence_ids` vectors.
 */
FastaParseResult parse_fasta_sequences_and_ids(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably returns the indices, within
 * @p factors, of those factors that correspond to the sentinel positions
 * listed in @p sentinel_positions — confirm against the implementation.
 *
 * @param factors Factors to inspect. `Factor` is not declared in this header;
 *        presumably it comes from factorizer.hpp.
 * @param sentinel_positions Sentinel positions, given as size_t values.
 * @return A vector of 64-bit unsigned values (same element type as
 *         FastaFactorizationResult::sentinel_factor_indices).
 */
std::vector<uint64_t> identify_sentinel_factors(const std::vector<Factor>& factors,
const std::vector<size_t>& sentinel_positions);
/**
 * @brief Return type of factorize_fasta_multiple_dna_w_rc(),
 *        factorize_fasta_multiple_dna_no_rc() and
 *        factorize_dna_rc_w_ref_fasta_files().
 */
struct FastaFactorizationResult {
/// Factors produced by the factorization.
std::vector<Factor> factors;
/// NOTE(review): presumably indices into `factors` marking sentinel
/// (sequence-separator) factors — confirm against the implementation.
std::vector<uint64_t> sentinel_factor_indices;
/// Sequence identifier strings.
std::vector<std::string> sequence_ids;
};
/**
 * @brief Return type of factorize_fasta_dna_w_rc_per_sequence() and
 *        factorize_fasta_dna_no_rc_per_sequence().
 *
 * NOTE(review): presumably per_sequence_factors[i] holds the factors of the
 * sequence named sequence_ids[i] — confirm against the implementation.
 */
struct FastaPerSequenceFactorizationResult {
/// One factor list per sequence.
std::vector<std::vector<Factor>> per_sequence_factors;
/// Sequence identifier strings.
std::vector<std::string> sequence_ids;
};
/**
 * @brief Return type of prepare_ref_target_dna_no_rc_from_fasta() and
 *        prepare_ref_target_dna_w_rc_from_fasta().
 *
 * The three size_t members are zero-initialized so that a default-constructed
 * instance never carries indeterminate values; this matches
 * FastaPerSequenceCountResult::total_factors, which is also initialized to 0.
 *
 * NOTE(review): the meaning of each field is defined by the implementation,
 * which is not visible from this header; the per-field notes below are
 * inferred from the names and should be confirmed.
 */
struct FastaReferenceTargetResult {
/// Prepared sequence data. The spelling "concatinated" is kept on purpose:
/// renaming the member would break existing callers.
PreparedSequenceResult concatinated_sequences;
/// Sequence identifier strings.
std::vector<std::string> sequence_ids;
/// Presumably the number of sequences read from the reference FASTA.
size_t num_ref_sequences = 0;
/// Presumably the number of sequences read from the target FASTA.
size_t num_target_sequences = 0;
/// Presumably the position at which target data begins — confirm the unit.
size_t target_start_index = 0;
};
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably reads a reference and a target
 * FASTA file and prepares their DNA for factorization without
 * reverse-complement ("no_rc") handling — confirm against the implementation.
 *
 * @param reference_fasta_path Path to the reference FASTA file (per the name).
 * @param target_fasta_path Path to the target FASTA file (per the name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaReferenceTargetResult.
 */
FastaReferenceTargetResult prepare_ref_target_dna_no_rc_from_fasta(
const std::string& reference_fasta_path,
const std::string& target_fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably reads a reference and a target
 * FASTA file and prepares their DNA for factorization with
 * reverse-complement ("w_rc") handling — confirm against the implementation.
 *
 * @param reference_fasta_path Path to the reference FASTA file (per the name).
 * @param target_fasta_path Path to the target FASTA file (per the name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaReferenceTargetResult.
 */
FastaReferenceTargetResult prepare_ref_target_dna_w_rc_from_fasta(
const std::string& reference_fasta_path,
const std::string& target_fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes all DNA sequences of
 * one FASTA file together, with reverse-complement ("w_rc") matching —
 * confirm against the implementation.
 *
 * @param fasta_path Path to the FASTA file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaFactorizationResult (factors, sentinel factor indices,
 *         sequence IDs).
 */
FastaFactorizationResult factorize_fasta_multiple_dna_w_rc(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes all DNA sequences of
 * one FASTA file together, without reverse-complement ("no_rc") matching —
 * confirm against the implementation.
 *
 * @param fasta_path Path to the FASTA file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaFactorizationResult (factors, sentinel factor indices,
 *         sequence IDs).
 */
FastaFactorizationResult factorize_fasta_multiple_dna_no_rc(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes the target FASTA's DNA
 * against the reference FASTA with reverse-complement ("rc") matching —
 * confirm against the implementation, in particular whether the returned
 * factors cover the reference, the target, or both.
 *
 * @param reference_fasta_path Path to the reference FASTA file (per the name).
 * @param target_fasta_path Path to the target FASTA file (per the name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaFactorizationResult (factors, sentinel factor indices,
 *         sequence IDs).
 */
FastaFactorizationResult factorize_dna_rc_w_ref_fasta_files(
const std::string& reference_fasta_path,
const std::string& target_fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes the DNA of one FASTA
 * file with reverse-complement ("w_rc") matching and writes the factors to a
 * binary file at @p out_path — confirm the file format and the meaning of the
 * return value (likely a factor count) against the implementation.
 *
 * @param fasta_path Path to the input FASTA file (per the parameter name).
 * @param out_path Path of the output file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A size_t value.
 */
size_t write_factors_binary_file_fasta_multiple_dna_w_rc(
const std::string& fasta_path,
const std::string& out_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes the DNA of one FASTA
 * file without reverse-complement ("no_rc") matching and writes the factors
 * to a binary file at @p out_path — confirm the file format and the meaning
 * of the return value (likely a factor count) against the implementation.
 *
 * @param fasta_path Path to the input FASTA file (per the parameter name).
 * @param out_path Path of the output file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A size_t value.
 */
size_t write_factors_binary_file_fasta_multiple_dna_no_rc(
const std::string& fasta_path,
const std::string& out_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes the target FASTA's DNA
 * against the reference FASTA and writes the factors to a binary file at
 * @p out_path — confirm the file format and the meaning of the return value
 * (likely a factor count) against the implementation.
 *
 * @param reference_fasta_path Path to the reference FASTA file (per the name).
 * @param target_fasta_path Path to the target FASTA file (per the name).
 * @param out_path Path of the output file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A size_t value.
 */
size_t write_factors_dna_w_reference_fasta_files_to_binary(const std::string& reference_fasta_path,
const std::string& target_fasta_path,
const std::string& out_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes each sequence of the
 * FASTA file independently, with reverse-complement ("w_rc") matching —
 * confirm against the implementation.
 *
 * @param fasta_path Path to the FASTA file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaPerSequenceFactorizationResult (per-sequence factor lists and
 *         sequence IDs).
 */
FastaPerSequenceFactorizationResult factorize_fasta_dna_w_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes each sequence of the
 * FASTA file independently, without reverse-complement ("no_rc") matching —
 * confirm against the implementation.
 *
 * @param fasta_path Path to the FASTA file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaPerSequenceFactorizationResult (per-sequence factor lists and
 *         sequence IDs).
 */
FastaPerSequenceFactorizationResult factorize_fasta_dna_no_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes each sequence of the
 * FASTA file independently with reverse-complement ("w_rc") matching and
 * writes binary factor output under @p out_dir — confirm the file naming, the
 * file format and the meaning of the return value against the implementation.
 *
 * @param fasta_path Path to the input FASTA file (per the parameter name).
 * @param out_dir Output directory (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A size_t value.
 */
size_t write_factors_binary_file_fasta_dna_w_rc_per_sequence(
const std::string& fasta_path,
const std::string& out_dir,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably factorizes each sequence of the
 * FASTA file independently without reverse-complement ("no_rc") matching and
 * writes binary factor output under @p out_dir — confirm the file naming, the
 * file format and the meaning of the return value against the implementation.
 *
 * @param fasta_path Path to the input FASTA file (per the parameter name).
 * @param out_dir Output directory (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A size_t value.
 */
size_t write_factors_binary_file_fasta_dna_no_rc_per_sequence(
const std::string& fasta_path,
const std::string& out_dir,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Return type of count_factors_fasta_dna_w_rc_per_sequence() and
 *        count_factors_fasta_dna_no_rc_per_sequence().
 *
 * NOTE(review): presumably factor_counts[i] is the factor count of the
 * sequence named sequence_ids[i], and total_factors is their sum — confirm
 * against the implementation.
 */
struct FastaPerSequenceCountResult {
/// Sequence identifier strings.
std::vector<std::string> sequence_ids;
/// One count per sequence.
std::vector<size_t> factor_counts;
/// Starts at 0 in a default-constructed result.
size_t total_factors = 0;
};
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably counts the factors of each
 * sequence in the FASTA file independently, with reverse-complement ("w_rc")
 * matching — confirm against the implementation.
 *
 * @param fasta_path Path to the FASTA file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaPerSequenceCountResult (sequence IDs, per-sequence counts and
 *         a total).
 */
FastaPerSequenceCountResult count_factors_fasta_dna_w_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
/**
 * @brief Declaration only; the definition is not visible in this header.
 *
 * NOTE(review): by its name this presumably counts the factors of each
 * sequence in the FASTA file independently, without reverse-complement
 * ("no_rc") matching — confirm against the implementation.
 *
 * @param fasta_path Path to the FASTA file (per the parameter name).
 * @param sanitization_mode Sanitization mode; defaults to
 *        FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A FastaPerSequenceCountResult (sequence IDs, per-sequence counts and
 *         a total).
 */
FastaPerSequenceCountResult count_factors_fasta_dna_no_rc_per_sequence(
const std::string& fasta_path,
FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);
} // namespace noLZSS