// Program listing for file factorizer.hpp
// Source file: src/cpp/factorizer.hpp (listing taken from the generated documentation page)
#pragma once
#include <vector>
#include <utility>
#include <string_view>
#include <string>
#include <cstdint>
namespace noLZSS {
// Sink-based template declarations, followed by constants and utility helpers for reverse complement handling
// Declaration only (the definition is not part of this listing).
// Takes a file path, a forwarding-reference `sink`, and an optional `start_pos`
// (defaults to 0); yields a size_t.
// NOTE(review): presumably `sink` receives each produced factor and the return
// value is the number of factors -- confirm against the definition.
template<typename Sink>
auto factorize_file_stream(const std::string& path, Sink&& sink, size_t start_pos = 0) -> size_t;

// Same parameter shape as factorize_file_stream, except the input is passed as
// a non-owning `text` view rather than a path. Declaration only.
template<typename Sink>
auto factorize_stream(std::string_view text, Sink&& sink, size_t start_pos = 0) -> size_t;
// Mask selecting the most significant bit (bit 63) of a 64-bit reference value.
// is_rc() tests this bit and rc_end() clears it.
// Declared `inline` so every translation unit including this header shares a
// single definition (C++17 inline variable) instead of a per-TU internal copy.
inline constexpr uint64_t RC_MASK = (1ULL << 63);
// Result type returned by the prepare_multiple_dna_sequences_* helpers declared
// in this header.
struct PreparedSequenceResult {
    // NOTE(review): presumably the combined text that is handed to the
    // factorizer -- confirm against the helper definitions.
    std::string prepared_string;
    // Defaulted to 0 so a default-constructed result never exposes an
    // indeterminate value. The struct remains an aggregate.
    size_t original_length = 0;
    // NOTE(review): presumably offsets of separator/sentinel characters inside
    // prepared_string -- confirm.
    std::vector<size_t> sentinel_positions;
};
// Footer record: an 8-byte magic tag followed by five 64-bit fields.
struct FactorFileFooter {
    char magic[8];           // tag bytes "noLZSSv2", stored without a NUL terminator
    uint64_t num_factors;
    uint64_t num_sequences;
    uint64_t num_sentinels;
    uint64_t footer_size;
    uint64_t total_length;

    // Default construction stamps the magic tag and zeroes every numeric field.
    FactorFileFooter()
        : magic{'n', 'o', 'L', 'Z', 'S', 'S', 'v', '2'},
          num_factors(0),
          num_sequences(0),
          num_sentinels(0),
          footer_size(0),
          total_length(0) {}
};
// Returns true when the RC_MASK bit of `ref` is set, false otherwise.
// constexpr (implicitly inline) and noexcept: a pure bit test that can also be
// evaluated at compile time.
constexpr bool is_rc(uint64_t ref) noexcept { return (ref & RC_MASK) != 0; }
// Returns `ref` with the RC_MASK bit cleared; all other bits are preserved.
// NOTE(review): per the name, the remaining value is presumably the end
// position of a reverse-complement match -- confirm against the callers.
// constexpr (implicitly inline) and noexcept: pure bit manipulation.
constexpr uint64_t rc_end(uint64_t ref) noexcept { return (ref & ~RC_MASK); }
// DNA sequence preparation helpers (declarations only; the definitions are not
// part of this listing). Each takes a list of sequences and returns a
// PreparedSequenceResult.
// NOTE(review): the _w_rc / _no_rc suffixes presumably select whether reverse
// complements are added to the prepared text -- confirm against the definitions.
auto prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences) -> PreparedSequenceResult;
auto prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences) -> PreparedSequenceResult;
// One factor record: three 64-bit unsigned fields, kept as a plain aggregate
// with no default initializers.
struct Factor {
    std::uint64_t start;   // NOTE(review): presumably the factor's start position in the text -- confirm
    std::uint64_t length;  // NOTE(review): presumably the factor's length -- confirm
    std::uint64_t ref;     // NOTE(review): presumably the reference position, possibly tagged with RC_MASK -- confirm
};
// Core factorization entry points (declarations only). Both return the factors
// as a std::vector<Factor>; `start_pos` defaults to 0.
auto factorize(std::string_view text, size_t start_pos = 0) -> std::vector<Factor>;
// File-path variant. `reserve_hint` defaults to 0.
// NOTE(review): presumably a capacity hint for the returned vector -- confirm.
auto factorize_file(const std::string& path, size_t reserve_hint = 0, size_t start_pos = 0) -> std::vector<Factor>;
// Counting entry points (declarations only). Each returns a size_t and accepts
// an optional `start_pos` that defaults to 0.
// NOTE(review): presumably the return value is the number of factors, computed
// without materialising them -- confirm against the definitions.
auto count_factors(std::string_view text, size_t start_pos = 0) -> size_t;
auto count_factors_file(const std::string& path, size_t start_pos = 0) -> size_t;
// Binary output (declaration only): takes an input path and an output path and
// returns a size_t.
// NOTE(review): presumably the number of factors written to `out_path` -- confirm.
auto write_factors_binary_file(const std::string& in_path, const std::string& out_path) -> size_t;
// DNA-aware factorization with reverse complement support (declarations only).
// Both return the factors as a std::vector<Factor>.
auto factorize_dna_w_rc(std::string_view text) -> std::vector<Factor>;
// File-path variant; `reserve_hint` defaults to 0.
auto factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint = 0) -> std::vector<Factor>;
// size_t-returning members of the DNA reverse-complement family (declarations only):
// in-memory count, file count, and binary writer (input path -> output path).
auto count_factors_dna_w_rc(std::string_view text) -> size_t;
auto count_factors_file_dna_w_rc(const std::string& path) -> size_t;
auto write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path) -> size_t;
// Template entry points for advanced usage (declarations only). `sink` is a
// forwarding reference; both functions return a size_t.
// NOTE(review): presumably `sink` is called with each factor as it is produced
// -- confirm against the template definitions.
template<typename Sink>
auto factorize_stream_dna_w_rc(std::string_view text, Sink&& sink) -> size_t;

template<typename Sink>
auto factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink) -> size_t;
// Multiple-DNA-sequence factorization with reverse complement support
// (declarations only). Both return a std::vector<Factor>; `start_pos` defaults to 0.
auto factorize_multiple_dna_w_rc(std::string_view text, size_t start_pos = 0) -> std::vector<Factor>;
// File-path variant; `reserve_hint` also defaults to 0.
auto factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint = 0, size_t start_pos = 0) -> std::vector<Factor>;
// size_t-returning members of the multiple-DNA-sequence family (declarations only).
// Every function accepts an optional trailing `start_pos` that defaults to 0.
auto count_factors_multiple_dna_w_rc(std::string_view text, size_t start_pos = 0) -> size_t;
auto count_factors_file_multiple_dna_w_rc(const std::string& path, size_t start_pos = 0) -> size_t;
auto write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path, size_t start_pos = 0) -> size_t;
// Template entry points for advanced usage with multiple sequences
// (declarations only). `sink` is a forwarding reference, `start_pos` defaults
// to 0, and both functions return a size_t.
template<typename Sink>
auto factorize_stream_multiple_dna_w_rc(std::string_view text, Sink&& sink, size_t start_pos = 0) -> size_t;

template<typename Sink>
auto factorize_file_stream_multiple_dna_w_rc(const std::string& path, Sink&& sink, size_t start_pos = 0) -> size_t;
// Reference sequence factorization (declaration only): takes a reference
// sequence and a target sequence and returns a std::vector<Factor>.
// NOTE(review): presumably the target is factorized against the reference with
// reverse complement matching -- confirm against the definition.
auto factorize_dna_w_reference_seq(const std::string& reference_seq, const std::string& target_seq) -> std::vector<Factor>;
// Variant of the DNA reference-sequence factorization that additionally takes
// an output path and returns a size_t (declaration only).
// NOTE(review): presumably writes the factors to `out_path` and returns their count -- confirm.
auto factorize_dna_w_reference_seq_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path) -> size_t;
// General reference sequence factorization, no reverse complement
// (declaration only): takes a reference and a target sequence and returns a
// std::vector<Factor>.
auto factorize_w_reference(const std::string& reference_seq, const std::string& target_seq) -> std::vector<Factor>;
// Variant of the general reference-sequence factorization that additionally
// takes an output path and returns a size_t (declaration only).
// NOTE(review): presumably writes the factors to `out_path` and returns their count -- confirm.
auto factorize_w_reference_file(const std::string& reference_seq, const std::string& target_seq, const std::string& out_path) -> size_t;
// Parallel factorization entry points (declarations only). Each takes an input
// (text view or input path), an output path, and an optional `num_threads`
// that defaults to 0; all return a size_t.
// NOTE(review): presumably num_threads == 0 lets the implementation choose the
// thread count, and the return value is the factor count -- confirm.

// General variants: also accept an optional `start_pos` (defaults to 0).
auto parallel_factorize_to_file(std::string_view text, const std::string& output_path, size_t num_threads = 0, size_t start_pos = 0) -> size_t;
auto parallel_factorize_file_to_file(const std::string& input_path, const std::string& output_path, size_t num_threads = 0, size_t start_pos = 0) -> size_t;

// DNA reverse-complement variants: no `start_pos` parameter.
auto parallel_factorize_dna_w_rc_to_file(std::string_view text, const std::string& output_path, size_t num_threads = 0) -> size_t;
auto parallel_factorize_file_dna_w_rc_to_file(const std::string& input_path, const std::string& output_path, size_t num_threads = 0) -> size_t;
} // namespace noLZSS