Program Listing for File factorizer.hpp

Return to documentation for file (src/cpp/factorizer.hpp)

#pragma once
#include <vector>
#include <utility>
#include <string_view>
#include <string>
#include <cstdint>

namespace noLZSS {

// Constants and utility functions for reverse complement handling

// Sink-based ("stream") entry points. Only the signatures are visible in this
// header; every NOTE(review) below is an assumption to confirm against the
// definitions.

/// Sink-based factorization entry point that takes its input as a file path.
///
/// @tparam Sink      Type of the sink argument, accepted as a forwarding reference.
///                   NOTE(review): presumably a callable that receives each Factor — confirm.
/// @param path       Path of the input file.
/// @param sink       Caller-supplied sink object.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return A size_t. NOTE(review): presumably the number of factors produced — confirm.
template<class Sink>
size_t factorize_file_stream(const std::string& path, Sink&& sink, size_t start_pos = 0);

/// Sink-based factorization entry point that takes its input as an in-memory view.
///
/// @tparam Sink      Type of the sink argument, accepted as a forwarding reference.
///                   NOTE(review): presumably a callable that receives each Factor — confirm.
/// @param text       Non-owning view of the input; the viewed buffer must outlive the call.
/// @param sink       Caller-supplied sink object.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return A size_t. NOTE(review): presumably the number of factors produced — confirm.
template<class Sink>
size_t factorize_stream(std::string_view text, Sink&& sink, size_t start_pos = 0);
/// Mask selecting the most significant bit (bit 63) of a 64-bit reference value.
/// is_rc() tests this bit and rc_end() clears it.
/// NOTE(review): the name suggests the bit flags a reverse-complement match — confirm.
///
/// Declared `inline` so that all translation units including this header share a
/// single definition instead of each receiving its own internal-linkage copy.
inline constexpr uint64_t RC_MASK = (1ULL << 63);

/// Result bundle returned by the prepare_multiple_dna_sequences_* functions.
///
/// Remains an aggregate, so brace initialization with three values still works.
struct PreparedSequenceResult {
    /// The prepared text. NOTE(review): presumably the input sequences joined with
    /// sentinel characters — confirm against the implementation.
    std::string prepared_string;
    /// Zero-initialized so a default-constructed result never carries an indeterminate value.
    /// NOTE(review): presumably the length of the original (non-augmented) data — confirm.
    size_t original_length = 0;
    /// NOTE(review): presumably the indices of sentinel characters inside
    /// prepared_string — confirm.
    std::vector<size_t> sentinel_positions;
};

/// Fixed-layout footer record: an 8-byte magic tag followed by five 64-bit counters.
/// NOTE(review): presumably written at the end of the binary factor files produced by
/// the write_factors_binary_file* functions — confirm against the implementation.
struct FactorFileFooter {
    char magic[8];           ///< Format tag; always the 8 bytes "noLZSSv2" (no terminating NUL).
    uint64_t num_factors;    ///< Starts at 0.
    uint64_t num_sequences;  ///< Starts at 0.
    uint64_t num_sentinels;  ///< Starts at 0.
    uint64_t footer_size;    ///< Starts at 0.
    uint64_t total_length;   ///< Starts at 0.

    /// Stamps the magic tag and zeroes every counter, so a freshly constructed
    /// footer is always well-formed.
    FactorFileFooter()
        : magic{'n', 'o', 'L', 'Z', 'S', 'S', 'v', '2'},
          num_factors{0},
          num_sequences{0},
          num_sentinels{0},
          footer_size{0},
          total_length{0} {}
};

/// Reports whether the RC_MASK bit (bit 63) is set in `ref`.
/// @param ref  64-bit reference value to inspect.
/// @return true when the masked bit is set, false otherwise.
inline bool is_rc(uint64_t ref) {
    const uint64_t flag_bit = ref & RC_MASK;
    return flag_bit != 0;
}

/// Returns `ref` with the RC_MASK bit (bit 63) cleared; the low 63 bits pass through unchanged.
/// NOTE(review): the name suggests the result is an end position of a
/// reverse-complement reference — confirm against the implementation.
/// @param ref  64-bit reference value.
inline uint64_t rc_end(uint64_t ref) {
    const uint64_t payload_bits = ~RC_MASK;
    return ref & payload_bits;
}

// Utility functions for DNA sequence preparation

/// Builds a PreparedSequenceResult from a list of sequences (declaration only;
/// the definition is not part of this header).
/// NOTE(review): the `_w_rc` suffix suggests reverse-complement data is included
/// in the prepared string — confirm against the implementation.
/// @param sequences  Input sequences; taken by const reference and not modified.
/// @return The prepared string together with its length/sentinel bookkeeping.
PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences);

/// Builds a PreparedSequenceResult from a list of sequences (declaration only;
/// the definition is not part of this header).
/// NOTE(review): the `_no_rc` suffix suggests no reverse-complement data is
/// added — confirm against the implementation.
/// @param sequences  Input sequences; taken by const reference and not modified.
/// @return The prepared string together with its length/sentinel bookkeeping.
PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences);

/// One factor as returned by the factorize* functions: three 64-bit unsigned fields.
/// No constructor or initializers are declared, so a default-initialized Factor
/// has indeterminate field values.
struct Factor {
    // NOTE(review): presumably the position in the input where this factor begins — confirm.
    uint64_t start;
    // NOTE(review): presumably the number of symbols covered by this factor — confirm.
    uint64_t length;
    // NOTE(review): presumably the position of the earlier occurrence being referenced.
    // is_rc()/rc_end() in this header operate on a `ref` value and treat bit 63
    // (RC_MASK) as a flag, so this field likely uses that encoding — confirm.
    uint64_t ref;
};

// Core factorization functions

/// Factorizes in-memory text and returns the factors as a vector (declaration only).
/// @param text       Non-owning view of the input; the viewed buffer must outlive the call.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return The resulting factors.
std::vector<Factor> factorize(std::string_view text, size_t start_pos = 0);

/// File-input counterpart of factorize() (declaration only).
/// @param path          Path of the input file.
/// @param reserve_hint  Optional; defaults to 0.
///                      NOTE(review): presumably a capacity hint for the returned vector — confirm.
/// @param start_pos     Optional; defaults to 0.
///                      NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return The resulting factors.
std::vector<Factor> factorize_file(const std::string& path, size_t reserve_hint = 0, size_t start_pos = 0);

// Counting functions

/// Counting variant for in-memory text: returns a size_t rather than a vector of Factor
/// (declaration only).
/// @param text       Non-owning view of the input; the viewed buffer must outlive the call.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return NOTE(review): presumably the number of factors — confirm.
size_t count_factors(std::string_view text, size_t start_pos = 0);

/// Counting variant for file input (declaration only).
/// @param path       Path of the input file.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return NOTE(review): presumably the number of factors — confirm.
size_t count_factors_file(const std::string& path, size_t start_pos = 0);

// Binary output

/// File-to-file entry point taking an input path and an output path (declaration only).
/// NOTE(review): presumably factorizes `in_path` and writes the factors in binary
/// form to `out_path` — confirm the on-disk format against the implementation.
/// @param in_path   Path of the input file.
/// @param out_path  Path of the output file.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path);

// DNA-aware factorization functions with reverse complement support

// The `_dna_w_rc` family mirrors the plain factorize/count/write functions but
// takes no start_pos parameter. Only signatures are visible here; every
// NOTE(review) is an assumption to confirm against the definitions.

/// In-memory DNA variant returning the factors as a vector.
/// NOTE(review): presumably factors may reference reverse-complement matches,
/// flagged via RC_MASK in Factor::ref — confirm.
/// @param text  Non-owning view of the input; the viewed buffer must outlive the call.
std::vector<Factor> factorize_dna_w_rc(std::string_view text);

/// File-input DNA variant returning the factors as a vector.
/// @param path          Path of the input file.
/// @param reserve_hint  Optional; defaults to 0.
///                      NOTE(review): presumably a capacity hint for the returned vector — confirm.
std::vector<Factor> factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint = 0);

/// In-memory DNA counting variant.
/// @param text  Non-owning view of the input; the viewed buffer must outlive the call.
/// @return NOTE(review): presumably the number of factors — confirm.
size_t count_factors_dna_w_rc(std::string_view text);

/// File-input DNA counting variant.
/// @param path  Path of the input file.
/// @return NOTE(review): presumably the number of factors — confirm.
size_t count_factors_file_dna_w_rc(const std::string& path);

/// File-to-file DNA variant.
/// @param in_path   Path of the input file.
/// @param out_path  Path of the output file.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path);

// Template functions for advanced usage

/// Sink-based DNA variant for in-memory input (declaration only).
/// @tparam Sink  Type of the sink argument, accepted as a forwarding reference.
///               NOTE(review): presumably a callable that receives each Factor — confirm.
/// @param text   Non-owning view of the input; the viewed buffer must outlive the call.
/// @param sink   Caller-supplied sink object.
/// @return NOTE(review): presumably the number of factors produced — confirm.
template<class Sink>
size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink);

/// Sink-based DNA variant for file input (declaration only).
/// @tparam Sink  Type of the sink argument, accepted as a forwarding reference.
///               NOTE(review): presumably a callable that receives each Factor — confirm.
/// @param path   Path of the input file.
/// @param sink   Caller-supplied sink object.
/// @return NOTE(review): presumably the number of factors produced — confirm.
template<class Sink>
size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink);

// Multiple DNA sequences factorization functions with reverse complement support

// The `_multiple_dna_w_rc` family: same shapes as the `_dna_w_rc` functions,
// each with an additional optional start_pos parameter (default 0). Only
// signatures are visible here; every NOTE(review) is an assumption to confirm.
// NOTE(review): the input is presumably text prepared by
// prepare_multiple_dna_sequences_w_rc() — confirm.

/// In-memory variant returning the factors as a vector.
/// @param text       Non-owning view of the input; the viewed buffer must outlive the call.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
std::vector<Factor> factorize_multiple_dna_w_rc(std::string_view text, size_t start_pos = 0);

/// File-input variant returning the factors as a vector.
/// @param path          Path of the input file.
/// @param reserve_hint  Optional; defaults to 0.
///                      NOTE(review): presumably a capacity hint for the returned vector — confirm.
/// @param start_pos     Optional; defaults to 0. NOTE(review): see start_pos above — confirm.
std::vector<Factor> factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint = 0, size_t start_pos = 0);

/// In-memory counting variant.
/// @param text       Non-owning view of the input; the viewed buffer must outlive the call.
/// @param start_pos  Optional; defaults to 0. NOTE(review): see start_pos above — confirm.
/// @return NOTE(review): presumably the number of factors — confirm.
size_t count_factors_multiple_dna_w_rc(std::string_view text, size_t start_pos = 0);

/// File-input counting variant.
/// @param path       Path of the input file.
/// @param start_pos  Optional; defaults to 0. NOTE(review): see start_pos above — confirm.
/// @return NOTE(review): presumably the number of factors — confirm.
size_t count_factors_file_multiple_dna_w_rc(const std::string& path, size_t start_pos = 0);

/// File-to-file variant.
/// @param in_path    Path of the input file.
/// @param out_path   Path of the output file.
/// @param start_pos  Optional; defaults to 0. NOTE(review): see start_pos above — confirm.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path, size_t start_pos = 0);

// Template functions for advanced usage with multiple sequences

/// Sink-based multiple-sequence DNA variant for in-memory input (declaration only).
/// @tparam Sink      Type of the sink argument, accepted as a forwarding reference.
///                   NOTE(review): presumably a callable that receives each Factor — confirm.
/// @param text       Non-owning view of the input; the viewed buffer must outlive the call.
/// @param sink       Caller-supplied sink object.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return NOTE(review): presumably the number of factors produced — confirm.
template<class Sink>
size_t factorize_stream_multiple_dna_w_rc(std::string_view text, Sink&& sink, size_t start_pos = 0);

/// Sink-based multiple-sequence DNA variant for file input (declaration only).
/// @tparam Sink      Type of the sink argument, accepted as a forwarding reference.
///                   NOTE(review): presumably a callable that receives each Factor — confirm.
/// @param path       Path of the input file.
/// @param sink       Caller-supplied sink object.
/// @param start_pos  Optional; defaults to 0.
///                   NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return NOTE(review): presumably the number of factors produced — confirm.
template<class Sink>
size_t factorize_file_stream_multiple_dna_w_rc(const std::string& path, Sink&& sink, size_t start_pos = 0);

// Reference sequence factorization functions

/// Takes a reference sequence and a target sequence and returns a vector of Factor
/// (declaration only).
/// NOTE(review): presumably factorizes `target_seq` with matches allowed into
/// `reference_seq`, including reverse-complement matches — confirm.
/// @param reference_seq  Reference sequence; read-only.
/// @param target_seq     Target sequence; read-only.
std::vector<Factor> factorize_dna_w_reference_seq(const std::string& reference_seq, const std::string& target_seq);

/// Same inputs as factorize_dna_w_reference_seq(), plus an output path; returns a
/// size_t instead of the factors (declaration only).
/// @param reference_seq  Reference sequence; read-only.
/// @param target_seq     Target sequence; read-only.
/// @param out_path       Path of the output file.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t factorize_dna_w_reference_seq_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path);

// General reference sequence factorization functions (no reverse complement)

/// Takes a reference sequence and a target sequence and returns a vector of Factor
/// (declaration only). Per the section heading, this is the general variant
/// without reverse-complement handling.
/// NOTE(review): presumably factorizes `target_seq` with matches allowed into
/// `reference_seq` — confirm.
/// @param reference_seq  Reference sequence; read-only.
/// @param target_seq     Target sequence; read-only.
std::vector<Factor> factorize_w_reference(const std::string& reference_seq, const std::string& target_seq);

/// Same inputs as factorize_w_reference(), plus an output path; returns a size_t
/// instead of the factors (declaration only).
/// @param reference_seq  Reference sequence; read-only.
/// @param target_seq     Target sequence; read-only.
/// @param out_path       Path of the output file.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t factorize_w_reference_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path);

// Parallel factorization functions

// Parallel entry points. All take an output path and an optional num_threads
// (default 0). Only signatures are visible here; every NOTE(review) is an
// assumption to confirm against the definitions.
// NOTE(review): num_threads == 0 presumably means "choose automatically" — confirm.

/// Parallel variant for in-memory text, writing to a file.
/// @param text         Non-owning view of the input; the viewed buffer must outlive the call.
/// @param output_path  Path of the output file.
/// @param num_threads  Optional; defaults to 0.
/// @param start_pos    Optional; defaults to 0.
///                     NOTE(review): presumably the input position at which factorization begins — confirm.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t parallel_factorize_to_file(std::string_view text, const std::string& output_path, size_t num_threads = 0, size_t start_pos = 0);

/// Parallel variant for file input, writing to a file.
/// @param input_path   Path of the input file.
/// @param output_path  Path of the output file.
/// @param num_threads  Optional; defaults to 0.
/// @param start_pos    Optional; defaults to 0. NOTE(review): see start_pos above — confirm.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t parallel_factorize_file_to_file(const std::string& input_path, const std::string& output_path, size_t num_threads = 0, size_t start_pos = 0);

/// Parallel DNA (`_w_rc`) variant for in-memory text, writing to a file.
/// @param text         Non-owning view of the input; the viewed buffer must outlive the call.
/// @param output_path  Path of the output file.
/// @param num_threads  Optional; defaults to 0.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t parallel_factorize_dna_w_rc_to_file(std::string_view text, const std::string& output_path, size_t num_threads = 0);

/// Parallel DNA (`_w_rc`) variant for file input, writing to a file.
/// @param input_path   Path of the input file.
/// @param output_path  Path of the output file.
/// @param num_threads  Optional; defaults to 0.
/// @return NOTE(review): presumably the number of factors written — confirm.
size_t parallel_factorize_file_dna_w_rc_to_file(const std::string& input_path, const std::string& output_path, size_t num_threads = 0);

} // namespace noLZSS