Program Listing for File factorizer.hpp

Return to documentation for file (src/cpp/factorizer.hpp)

#pragma once
#include <vector>
#include <utility>
#include <string_view>
#include <string>
#include <cstdint>

namespace noLZSS {

// Constants and utility functions for reverse complement handling
// Mask selecting bit 63 (the most significant bit) of a 64-bit reference value.
// is_rc() tests this bit and rc_end() clears it.
// Declared `inline` so every translation unit that includes this header shares
// one definition instead of getting its own internal-linkage copy (C++17).
inline constexpr uint64_t RC_MASK = (1ULL << 63);

// Value type returned by the prepare_multiple_dna_sequences_* functions
// declared in this header. Remains an aggregate, so brace initialization
// `{string, length, positions}` keeps working.
struct PreparedSequenceResult {
    // The prepared text, owned by this result.
    std::string prepared_string;
    // Zero-initialized so a default-constructed result never exposes an
    // indeterminate value.
    // NOTE(review): presumably the length of the input before preparation — confirm.
    size_t original_length = 0;
    // NOTE(review): presumably indices of sentinel characters inside
    // prepared_string — confirm against the implementation.
    std::vector<size_t> sentinel_positions;
};

// Header record carrying an 8-byte magic tag and four 64-bit counters.
// NOTE(review): presumably stored in the files produced by the
// write_factors_binary_file* functions — confirm against the implementation.
//
// All counters are zero-initialized so a default-constructed header never
// contains indeterminate values. The struct stays an aggregate and stays
// trivially copyable; its size and member order are unchanged.
struct FactorFileHeader {
    // Format tag "noLZSSv1"; exactly 8 bytes, no terminating NUL.
    char magic[8] = {'n', 'o', 'L', 'Z', 'S', 'S', 'v', '1'};
    uint64_t num_factors = 0;
    uint64_t num_sequences = 0;
    uint64_t num_sentinels = 0;
    // NOTE(review): presumably the total header size in bytes — confirm.
    uint64_t header_size = 0;
};

// Reports whether the RC_MASK bit (bit 63) is set in `ref`.
// Returns true when the bit is set, false otherwise.
inline bool is_rc(uint64_t ref) {
    const uint64_t flag_bit = ref & RC_MASK;
    return flag_bit != 0;
}

// Returns `ref` with the RC_MASK bit (bit 63) cleared; the low 63 bits are
// passed through unchanged.
// NOTE(review): the name suggests the remaining value is an end position of a
// reverse-complement match — confirm against the implementation.
inline uint64_t rc_end(uint64_t ref) {
    const uint64_t payload_bits = ~RC_MASK;
    return ref & payload_bits;
}

// Utility functions for DNA sequence preparation
//
// Declarations only; the definitions are not part of this header. Both
// functions take the input sequences by const reference and return a
// PreparedSequenceResult by value.

// "With reverse complement" variant.
// NOTE(review): presumably combines the sequences into one string, separated by
// sentinels and followed by their reverse complements — confirm against the
// implementation, including how invalid input is reported.
PreparedSequenceResult prepare_multiple_dna_sequences_w_rc(const std::vector<std::string>& sequences);

// "No reverse complement" variant.
// NOTE(review): presumably the same preparation without appending reverse
// complements — confirm against the implementation.
PreparedSequenceResult prepare_multiple_dna_sequences_no_rc(const std::vector<std::string>& sequences);

// A single factor record made of three unsigned 64-bit fields. This is the
// element type of the vectors returned by the factorize* functions declared in
// this header.
struct Factor {
    // NOTE(review): presumably the position in the text where the factor begins — confirm.
    std::uint64_t start;
    // NOTE(review): presumably the number of symbols the factor covers — confirm.
    std::uint64_t length;
    // Reference value. NOTE(review): for the reverse-complement-aware functions
    // this presumably carries the RC_MASK flag in bit 63 (see is_rc / rc_end) — confirm.
    std::uint64_t ref;
};

// Core factorization functions
//
// Declarations only; the definitions are not part of this header.

// Factorizes an in-memory text and returns the factors by value.
// `text` is a non-owning view; it only needs to stay valid for the call
// (the result is returned by value and holds no view).
std::vector<Factor> factorize(std::string_view text);

// File-based counterpart: takes a filesystem path instead of in-memory text.
// `reserve_hint` defaults to 0.
// NOTE(review): presumably a capacity hint for the returned vector, with 0
// meaning "no hint" — confirm. Error behavior for unreadable files is not
// visible here.
std::vector<Factor> factorize_file(const std::string& path, size_t reserve_hint = 0);

// Counting functions
//
// Declarations only; the definitions are not part of this header.
// NOTE(review): presumably these return the number of factors that factorize /
// factorize_file would produce, without materializing them — confirm.

// Counts factors of an in-memory text passed as a non-owning view.
size_t count_factors(std::string_view text);

// Counts factors of the file at `path`.
size_t count_factors_file(const std::string& path);

// Binary output
//
// Declaration only; the definition is not part of this header.
// Takes an input path and an output path and returns a size_t.
// NOTE(review): presumably factorizes `in_path`, writes the factors to
// `out_path` in binary form, and returns the number of factors written —
// confirm, along with the exact file layout and error behavior.
size_t write_factors_binary_file(const std::string& in_path, const std::string& out_path);

// DNA-aware factorization functions with reverse complement support
//
// Declarations only; the definitions are not part of this header. The
// signatures mirror factorize, factorize_file, count_factors,
// count_factors_file and write_factors_binary_file one-to-one.
// NOTE(review): presumably the `ref` of a returned Factor carries the RC_MASK
// flag when the match is on the reverse complement (decode with is_rc /
// rc_end) — confirm against the implementation.

// In-memory text in, factors out (by value).
std::vector<Factor> factorize_dna_w_rc(std::string_view text);

// File-based variant; `reserve_hint` defaults to 0.
// NOTE(review): presumably a capacity hint for the result — confirm.
std::vector<Factor> factorize_file_dna_w_rc(const std::string& path, size_t reserve_hint = 0);

// Returns a count for an in-memory text.
size_t count_factors_dna_w_rc(std::string_view text);

// Returns a count for the file at `path`.
size_t count_factors_file_dna_w_rc(const std::string& path);

// Takes an input path and an output path.
// NOTE(review): presumably returns the number of factors written — confirm.
size_t write_factors_binary_file_dna_w_rc(const std::string& in_path, const std::string& out_path);

// Template functions for advanced usage
//
// Only the declarations appear here; the template definitions are not visible
// in this listing. `Sink` is deduced and `sink` is taken by forwarding
// reference, so both lvalue and rvalue sinks are accepted.
// NOTE(review): presumably `sink` is invoked once per Factor and the return
// value is the number of factors emitted — confirm against the definition.

// Streams the factorization of an in-memory text into `sink`.
template<class Sink>
size_t factorize_stream_dna_w_rc(std::string_view text, Sink&& sink);

// Same, reading the input from the file at `path`.
template<class Sink>
size_t factorize_file_stream_dna_w_rc(const std::string& path, Sink&& sink);

// Multiple DNA sequences factorization functions with reverse complement support
//
// Declarations only; the definitions are not part of this header. The
// signatures parallel the single-sequence *_dna_w_rc functions.
// NOTE(review): presumably `text` is expected in the form produced by
// prepare_multiple_dna_sequences_w_rc (its prepared_string) — confirm the
// required input format against the implementation.

// In-memory text in, factors out (by value).
std::vector<Factor> factorize_multiple_dna_w_rc(std::string_view text);

// File-based variant; `reserve_hint` defaults to 0.
// NOTE(review): presumably a capacity hint for the result — confirm.
std::vector<Factor> factorize_file_multiple_dna_w_rc(const std::string& path, size_t reserve_hint = 0);

// Returns a count for an in-memory text.
size_t count_factors_multiple_dna_w_rc(std::string_view text);

// Returns a count for the file at `path`.
size_t count_factors_file_multiple_dna_w_rc(const std::string& path);

// Takes an input path and an output path.
// NOTE(review): presumably returns the number of factors written — confirm.
size_t write_factors_binary_file_multiple_dna_w_rc(const std::string& in_path, const std::string& out_path);

// Template functions for advanced usage with multiple sequences
//
// Only the declarations appear here; the template definitions are not visible
// in this listing. `Sink` is deduced and `sink` is taken by forwarding
// reference, so both lvalue and rvalue sinks are accepted.
// NOTE(review): presumably `sink` is invoked once per Factor and the return
// value is the number of factors emitted — confirm against the definition.

// Streams the multi-sequence factorization of an in-memory text into `sink`.
template<class Sink>
size_t factorize_stream_multiple_dna_w_rc(std::string_view text, Sink&& sink);

// Same, reading the input from the file at `path`.
template<class Sink>
size_t factorize_file_stream_multiple_dna_w_rc(const std::string& path, Sink&& sink);


} // namespace noLZSS