Program Listing for File fasta_processor.hpp

Return to documentation for file (src/cpp/fasta_processor.hpp)

#pragma once
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

#include "factorizer.hpp"

namespace noLZSS {

/**
 * @brief Policy selector passed as `sanitization_mode` to the FASTA functions
 *        declared in this header.
 *
 * NOTE(review): the per-enumerator descriptions are inferred from the names
 * only; the code that interprets them is not part of this header — confirm
 * against the implementation.
 */
enum class FastaDnaSanitizationMode {
    /// Default argument of every `sanitization_mode` parameter in this header.
    /// Presumably ambiguous (non-ACGT) characters are dropped — TODO confirm.
    RemoveAmbiguous,
    /// Presumably ambiguous characters are treated as an error — TODO confirm.
    Strict
};

/**
 * @brief Return type of parse_fasta_sequences_and_ids().
 *
 * Plain aggregate of two string vectors.
 * NOTE(review): presumably `sequences[i]` belongs to `sequence_ids[i]`;
 * the header itself does not enforce equal lengths — confirm.
 */
struct FastaParseResult {
    /// Sequence strings (presumably one entry per FASTA record — confirm).
    std::vector<std::string> sequences;
    /// Identifier strings (presumably taken from the FASTA header lines — confirm).
    std::vector<std::string> sequence_ids;
};

/**
 * @brief Declared to read the FASTA file at @p fasta_path and hand back its
 *        sequences and identifiers as a FastaParseResult.
 *
 * @param fasta_path        Path of the FASTA file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaParseResult with the `sequences` and `sequence_ids` vectors.
 *
 * NOTE(review): declaration only. Error reporting (e.g. unreadable or empty
 * files) is decided by the implementation, which is not visible here.
 */
FastaParseResult parse_fasta_sequences_and_ids(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to derive a list of 64-bit values from a factor list and a
 *        list of sentinel positions.
 *
 * @param factors            Factors to inspect (type `Factor` comes from
 *                           "factorizer.hpp").
 * @param sentinel_positions Positions of sentinels.
 * @return Vector of `uint64_t`.
 *
 * NOTE(review): presumably the returned values are indices into @p factors of
 * the factors that sit on a sentinel position (compare
 * FastaFactorizationResult::sentinel_factor_indices) — confirm against the
 * implementation; ordering requirements on the inputs are not visible here.
 */
std::vector<uint64_t> identify_sentinel_factors(const std::vector<Factor>& factors,
                                                const std::vector<size_t>& sentinel_positions);

/**
 * @brief Return type of the factorize_fasta_multiple_dna_* functions and of
 *        factorize_dna_rc_w_ref_fasta_files().
 *
 * Plain aggregate; no invariants are enforced by the type itself.
 */
struct FastaFactorizationResult {
    /// Factor list (type `Factor` comes from "factorizer.hpp").
    std::vector<Factor> factors;
    /// Presumably indices into `factors` marking sentinel factors — TODO confirm.
    std::vector<uint64_t> sentinel_factor_indices;
    /// Identifier strings of the processed sequences.
    std::vector<std::string> sequence_ids;
};

/**
 * @brief Return type of the factorize_fasta_dna_*_per_sequence functions.
 *
 * Plain aggregate holding one factor list per entry of the outer vector.
 * NOTE(review): presumably `per_sequence_factors[i]` belongs to
 * `sequence_ids[i]`; the type does not enforce equal lengths — confirm.
 */
struct FastaPerSequenceFactorizationResult {
    /// Outer vector: one element per sequence; inner vector: that sequence's factors.
    std::vector<std::vector<Factor>> per_sequence_factors;
    /// Identifier strings of the processed sequences.
    std::vector<std::string> sequence_ids;
};

/**
 * @brief Return type of the prepare_ref_target_dna_*_from_fasta functions.
 *
 * Plain aggregate. The three counters are value-initialized to 0 so that a
 * default-constructed object never exposes indeterminate values (matching
 * FastaPerSequenceCountResult::total_factors in this header). The type is
 * still an aggregate under C++14 and later, so brace initialization by
 * callers keeps working.
 *
 * NOTE(review): the member meanings below are inferred from the names; the
 * code that fills them is not visible in this header — confirm.
 */
struct FastaReferenceTargetResult {
    /// Prepared sequence data (type declared outside this header). The
    /// spelling "concatinated" is part of the public interface and is kept
    /// as-is so existing callers continue to compile.
    PreparedSequenceResult concatinated_sequences;
    /// Identifier strings (presumably reference ids followed by target ids — confirm).
    std::vector<std::string> sequence_ids;
    /// Number of reference sequences; 0 until set.
    size_t num_ref_sequences = 0;
    /// Number of target sequences; 0 until set.
    size_t num_target_sequences = 0;
    /// Presumably the offset at which target data begins — confirm; 0 until set.
    size_t target_start_index = 0;
};

/**
 * @brief Declared to build a FastaReferenceTargetResult from a reference FASTA
 *        file and a target FASTA file ("no_rc" variant).
 *
 * @param reference_fasta_path Path of the reference FASTA file.
 * @param target_fasta_path    Path of the target FASTA file.
 * @param sanitization_mode    Sanitization policy; defaults to
 *                             FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaReferenceTargetResult describing the prepared input.
 *
 * NOTE(review): "no_rc" presumably means the reverse complement is NOT
 * included — confirm. Error behaviour is defined by the implementation,
 * which is not visible here.
 */
FastaReferenceTargetResult prepare_ref_target_dna_no_rc_from_fasta(
    const std::string& reference_fasta_path,
    const std::string& target_fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to build a FastaReferenceTargetResult from a reference FASTA
 *        file and a target FASTA file ("w_rc" variant).
 *
 * @param reference_fasta_path Path of the reference FASTA file.
 * @param target_fasta_path    Path of the target FASTA file.
 * @param sanitization_mode    Sanitization policy; defaults to
 *                             FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaReferenceTargetResult describing the prepared input.
 *
 * NOTE(review): "w_rc" presumably means the reverse complement IS included —
 * confirm. Error behaviour is defined by the implementation, which is not
 * visible here.
 */
FastaReferenceTargetResult prepare_ref_target_dna_w_rc_from_fasta(
    const std::string& reference_fasta_path,
    const std::string& target_fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize the sequences of one FASTA file and return the
 *        combined result ("w_rc" variant).
 *
 * @param fasta_path        Path of the FASTA file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaFactorizationResult (factors, sentinel factor indices, ids).
 *
 * NOTE(review): "w_rc" presumably means reverse-complement aware — confirm.
 * Error behaviour is defined by the implementation, which is not visible here.
 */
FastaFactorizationResult factorize_fasta_multiple_dna_w_rc(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize the sequences of one FASTA file and return the
 *        combined result ("no_rc" variant).
 *
 * @param fasta_path        Path of the FASTA file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaFactorizationResult (factors, sentinel factor indices, ids).
 *
 * NOTE(review): "no_rc" presumably means reverse complements are not
 * considered — confirm. Error behaviour is defined by the implementation,
 * which is not visible here.
 */
FastaFactorizationResult factorize_fasta_multiple_dna_no_rc(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize a target FASTA file with the help of a
 *        reference FASTA file and return the result in memory.
 *
 * @param reference_fasta_path Path of the reference FASTA file.
 * @param target_fasta_path    Path of the target FASTA file.
 * @param sanitization_mode    Sanitization policy; defaults to
 *                             FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaFactorizationResult (factors, sentinel factor indices, ids).
 *
 * NOTE(review): presumably only the target part is factorized while the
 * reference serves as match source, with reverse-complement awareness ("rc")
 * — confirm against the implementation, which is not visible here.
 */
FastaFactorizationResult factorize_dna_rc_w_ref_fasta_files(
    const std::string& reference_fasta_path,
    const std::string& target_fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize one FASTA file and write the factors to a
 *        binary output file ("w_rc" variant).
 *
 * @param fasta_path        Path of the input FASTA file.
 * @param out_path          Path of the output file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A `size_t`; presumably the number of factors written — TODO confirm.
 *
 * NOTE(review): the binary layout and error behaviour are defined by the
 * implementation, which is not visible here.
 */
size_t write_factors_binary_file_fasta_multiple_dna_w_rc(
    const std::string& fasta_path,
    const std::string& out_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize one FASTA file and write the factors to a
 *        binary output file ("no_rc" variant).
 *
 * @param fasta_path        Path of the input FASTA file.
 * @param out_path          Path of the output file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A `size_t`; presumably the number of factors written — TODO confirm.
 *
 * NOTE(review): the binary layout and error behaviour are defined by the
 * implementation, which is not visible here.
 */
size_t write_factors_binary_file_fasta_multiple_dna_no_rc(
    const std::string& fasta_path,
    const std::string& out_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize a target FASTA file with the help of a
 *        reference FASTA file and write the factors to a binary output file.
 *
 * @param reference_fasta_path Path of the reference FASTA file.
 * @param target_fasta_path    Path of the target FASTA file.
 * @param out_path             Path of the output file.
 * @param sanitization_mode    Sanitization policy; defaults to
 *                             FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A `size_t`; presumably the number of factors written — TODO confirm.
 *
 * NOTE(review): looks like the file-writing counterpart of
 * factorize_dna_rc_w_ref_fasta_files() — confirm. The binary layout and error
 * behaviour are defined by the implementation, which is not visible here.
 */
size_t write_factors_dna_w_reference_fasta_files_to_binary(const std::string& reference_fasta_path,
                                                          const std::string& target_fasta_path,
                                                          const std::string& out_path,
                                                          FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous);

/**
 * @brief Declared to factorize a FASTA file and return a separate factor list
 *        for each sequence ("w_rc" variant).
 *
 * @param fasta_path        Path of the FASTA file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaPerSequenceFactorizationResult (per-sequence factors and ids).
 *
 * NOTE(review): presumably each sequence is factorized independently of the
 * others, with reverse-complement awareness — confirm against the
 * implementation, which is not visible here.
 */
FastaPerSequenceFactorizationResult factorize_fasta_dna_w_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize a FASTA file and return a separate factor list
 *        for each sequence ("no_rc" variant).
 *
 * @param fasta_path        Path of the FASTA file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaPerSequenceFactorizationResult (per-sequence factors and ids).
 *
 * NOTE(review): presumably each sequence is factorized independently of the
 * others, without reverse-complement matching — confirm against the
 * implementation, which is not visible here.
 */
FastaPerSequenceFactorizationResult factorize_fasta_dna_no_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize a FASTA file per sequence and write the output
 *        beneath a directory ("w_rc" variant).
 *
 * @param fasta_path        Path of the input FASTA file.
 * @param out_dir           Output directory.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A `size_t`; presumably the total number of factors written —
 *         TODO confirm.
 *
 * NOTE(review): presumably one binary file per sequence is created in
 * @p out_dir; file naming, directory creation and error behaviour are defined
 * by the implementation, which is not visible here.
 */
size_t write_factors_binary_file_fasta_dna_w_rc_per_sequence(
    const std::string& fasta_path,
    const std::string& out_dir,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to factorize a FASTA file per sequence and write the output
 *        beneath a directory ("no_rc" variant).
 *
 * @param fasta_path        Path of the input FASTA file.
 * @param out_dir           Output directory.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return A `size_t`; presumably the total number of factors written —
 *         TODO confirm.
 *
 * NOTE(review): presumably one binary file per sequence is created in
 * @p out_dir; file naming, directory creation and error behaviour are defined
 * by the implementation, which is not visible here.
 */
size_t write_factors_binary_file_fasta_dna_no_rc_per_sequence(
    const std::string& fasta_path,
    const std::string& out_dir,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Return type of the count_factors_fasta_dna_*_per_sequence functions.
 *
 * Plain aggregate: two vectors plus a running total that starts at zero.
 * NOTE(review): presumably `factor_counts[i]` belongs to `sequence_ids[i]`
 * and `total_factors` is the sum of `factor_counts`; the type itself does
 * not enforce either — confirm against the implementation.
 */
struct FastaPerSequenceCountResult {
    /// Identifier strings of the processed sequences.
    std::vector<std::string> sequence_ids;
    /// One count per sequence.
    std::vector<size_t> factor_counts;
    /// Overall count; a default-constructed result reports 0.
    size_t total_factors{0};
};

/**
 * @brief Declared to report factor counts for each sequence of a FASTA file
 *        ("w_rc" variant).
 *
 * @param fasta_path        Path of the FASTA file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaPerSequenceCountResult (ids, per-sequence counts, total).
 *
 * NOTE(review): presumably counts factors without materialising them, with
 * reverse-complement awareness — confirm against the implementation, which is
 * not visible here.
 */
FastaPerSequenceCountResult count_factors_fasta_dna_w_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

/**
 * @brief Declared to report factor counts for each sequence of a FASTA file
 *        ("no_rc" variant).
 *
 * @param fasta_path        Path of the FASTA file.
 * @param sanitization_mode Sanitization policy; defaults to
 *                          FastaDnaSanitizationMode::RemoveAmbiguous.
 * @return FastaPerSequenceCountResult (ids, per-sequence counts, total).
 *
 * NOTE(review): presumably counts factors without materialising them and
 * without reverse-complement matching — confirm against the implementation,
 * which is not visible here.
 */
FastaPerSequenceCountResult count_factors_fasta_dna_no_rc_per_sequence(
    const std::string& fasta_path,
    FastaDnaSanitizationMode sanitization_mode = FastaDnaSanitizationMode::RemoveAmbiguous
);

} // namespace noLZSS