Program Listing for File factorizer.hpp
↰ Return to documentation for file (`src/cpp/factorizer.hpp`)
#pragma once
#include <vector>
#include <utility>
#include <string_view>
#include <string>
#include <cstdint>
namespace noLZSS {
// Constants and utility functions for reverse complement handling
/// Bit mask selecting the most significant bit (bit 63) of a 64-bit value.
/// is_rc() tests this bit and rc_end() clears it.
/// Declared `inline constexpr` so that every translation unit including this
/// header shares a single definition instead of a per-TU internal-linkage copy.
inline constexpr uint64_t RC_MASK = (1ULL << 63);
/// Result bundle returned by the prepare_multiple_dna_sequences_* functions.
/// Remains an aggregate, so brace initialization with three values still works.
struct PreparedSequenceResult {
    /// The prepared text.
    /// NOTE(review): presumably the input sequences joined with sentinel
    /// characters — confirm against the implementation.
    std::string prepared_string;
    /// Length value reported alongside the prepared text. Zero-initialized so a
    /// default-constructed result never exposes an indeterminate value.
    /// NOTE(review): presumably the length before any reverse-complement part
    /// is appended — confirm.
    size_t original_length = 0;
    /// Positions associated with sentinels.
    /// NOTE(review): presumably indices into prepared_string — confirm.
    std::vector<size_t> sentinel_positions;
};
/// Fixed-size header record: an 8-byte magic tag followed by four 64-bit fields.
/// All numeric fields are zero-initialized so that a default-constructed header
/// never contains indeterminate values. The type remains an aggregate and stays
/// trivially copyable and standard-layout.
/// NOTE(review): presumably placed at the start of the binary factor files
/// produced by the write_factors_binary_file* functions — confirm.
struct FactorFileHeader {
    /// Format tag; defaults to the eight characters "noLZSSv1" (no terminator).
    char magic[8] = {'n', 'o', 'L', 'Z', 'S', 'S', 'v', '1'};
    /// Number of factors. NOTE(review): presumably those stored after the header.
    uint64_t num_factors = 0;
    /// Number of sequences. NOTE(review): meaning inferred from the name only.
    uint64_t num_sequences = 0;
    /// Number of sentinels. NOTE(review): meaning inferred from the name only.
    uint64_t num_sentinels = 0;
    /// Header size. NOTE(review): presumably in bytes, possibly including
    /// variable-length data that follows this struct — confirm.
    uint64_t header_size = 0;
};
/// @brief Tests whether the RC_MASK bit is set in a reference value.
/// @param ref Reference value to inspect.
/// @return true if the bit selected by RC_MASK is set in @p ref, false otherwise.
/// constexpr and noexcept: a pure bit test, usable in constant expressions.
inline constexpr bool is_rc(uint64_t ref) noexcept { return (ref & RC_MASK) != 0; }
/// @brief Returns a reference value with the RC_MASK bit cleared.
/// @param ref Reference value, with or without the RC_MASK bit set.
/// @return @p ref with the bit selected by RC_MASK forced to zero; all other
///         bits are unchanged.
/// NOTE(review): by its name the result is presumably the end position of a
/// reverse-complement match — confirm against the implementation.
/// constexpr and noexcept: a pure bit operation, usable in constant expressions.
inline constexpr uint64_t rc_end(uint64_t ref) noexcept { return (ref & ~RC_MASK); }
// Utility functions for DNA sequence preparation.
// Both take the input sequences by const reference and return a
// PreparedSequenceResult by value. Only the declarations appear here.
// NOTE(review): judging by the names, the "_w_rc" variant presumably includes
// reverse complements in the prepared string and the "_no_rc" variant does
// not — confirm against the implementation.
PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences);
PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences);
/// A single factor record: three unsigned 64-bit fields with no default
/// initializers, so the type stays a trivial aggregate.
struct Factor {
    /// Start value of the factor.
    /// NOTE(review): presumably an offset into the factorized text — confirm.
    std::uint64_t start;
    /// Length of the factor.
    std::uint64_t length;
    /// Reference value.
    /// NOTE(review): presumably the position of the earlier occurrence, with
    /// the top bit flagging a reverse-complement match — confirm.
    std::uint64_t ref;
};
// Core factorization functions.
// factorize: takes the input as a non-owning std::string_view and returns the
//   factors by value as a std::vector<Factor>.
// factorize_file: takes a path string instead of in-memory text; reserve_hint
//   is optional and defaults to 0.
//   NOTE(review): reserve_hint is presumably a capacity hint for the returned
//   vector, and path presumably names the file to read — confirm.
std::vector<Factor> factorize(std::string_view text);
std::vector<Factor> factorize_file(const std::string& path, size_t reserve_hint = 0);
// Counting functions.
// Same inputs as factorize / factorize_file (in-memory text or a path), but
// the result is a single size_t rather than a vector of Factor.
// NOTE(review): the returned value is presumably the number of factors the
// corresponding factorize call would produce — confirm.
size_t count_factors(std::string_view text);
size_t count_factors_file(const std::string& path);
// Binary output.
// Takes an input path and an output path and returns a size_t.
// NOTE(review): presumably factorizes the file at in_path, writes the factors
// to out_path in a binary format, and returns the number of factors written.
// Error reporting is not visible from this declaration — confirm against the
// implementation.
size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path);
// DNA-aware factorization functions with reverse complement support.
// These mirror the signatures of the generic factorize / factorize_file /
// count_factors / count_factors_file / write_factors_binary_file functions,
// with a "_dna_w_rc" suffix.
// NOTE(review): factors that refer to a reverse-complement match are presumably
// marked via RC_MASK in Factor::ref (see is_rc / rc_end) — confirm.
std::vector<Factor> factorize_dna_w_rc(std::string_view text);
std::vector<Factor> factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint = 0);
size_t count_factors_dna_w_rc(std::string_view text);
size_t count_factors_file_dna_w_rc(const std::string& path);
size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path);
// Template functions for advanced usage.
// Each takes a caller-supplied sink as a forwarding reference (Sink&&) and
// returns a size_t; input is either in-memory text or a file path.
// Only the declarations are visible in this part of the header.
// NOTE(review): the sink is presumably invoked once per Factor and the return
// value is presumably the number of factors emitted — confirm. Also confirm
// where the definitions live, since templates used from other translation
// units need a visible definition or an explicit instantiation.
template<class Sink>
size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink);
template<class Sink>
size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink);
// Multiple DNA sequences factorization functions with reverse complement support.
// Same signature shapes as the single-sequence "_dna_w_rc" functions, with a
// "_multiple_dna_w_rc" suffix.
// NOTE(review): the expected input layout for multiple sequences (for example
// a FASTA file for the path-based variants, or a sentinel-separated string for
// the string_view variants) is not visible from these declarations — confirm.
std::vector<Factor> factorize_multiple_dna_w_rc(std::string_view text);
std::vector<Factor> factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint = 0);
size_t count_factors_multiple_dna_w_rc(std::string_view text);
size_t count_factors_file_multiple_dna_w_rc(const std::string& path);
size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path);
// Template functions for advanced usage with multiple sequences.
// Each takes a caller-supplied sink as a forwarding reference (Sink&&) and
// returns a size_t; input is either in-memory text or a file path.
// Only the declarations are visible in this part of the header.
// NOTE(review): the sink is presumably invoked once per Factor and the return
// value is presumably the number of factors emitted — confirm. Also confirm
// where the definitions live, since templates used from other translation
// units need a visible definition or an explicit instantiation.
template<class Sink>
size_t factorize_stream_multiple_dna_w_rc(std::string_view text, Sink&& sink);
template<class Sink>
size_t factorize_file_stream_multiple_dna_w_rc(const std::string& path, Sink&& sink);
} // namespace noLZSS