# Source code for noLZSS.core

"""
Core Python wrappers for noLZSS C++ functionality.

This module provides enhanced Python wrappers around the C++ factorization functions,
adding input validation, error handling, and convenience features.
"""

from typing import List, Tuple, Union, Optional
import os
from pathlib import Path

from ._noLZSS import (
    factorize as _factorize,
    factorize_file as _factorize_file,
    count_factors as _count_factors,
    count_factors_file as _count_factors_file,
    write_factors_binary_file as _write_factors_binary_file,
)

from .utils import validate_input, analyze_alphabet



[docs]
def factorize(data: Union[str, bytes], validate: bool = True) -> List[Tuple[int, int, int]]:
    """
    Compute the LZ factorization of an in-memory string or bytes object.

    Args:
        data: Text or bytes to factorize.
        validate: When True (the default), ``data`` is first passed through
            ``validate_input`` and the value it returns is what gets
            factorized. When False, ``data`` is handed to the native
            routine unchanged.

    Returns:
        The factor list produced by the native ``factorize`` binding,
        as (position, length, ref) tuples.

    Raises:
        ValueError: If validation rejects the input (empty, etc.).
        TypeError: If the input type is not supported.
    """
    # Only replace the caller's object when validation was requested.
    payload = validate_input(data) if validate else data
    return _factorize(payload)




[docs]
def factorize_file(filepath: Union[str, Path], reserve_hint: int = 0) -> List[Tuple[int, int, int]]:
    """
    Compute the LZ factorization of a file's contents.

    Args:
        filepath: Location of the input file, as a string or ``Path``.
        reserve_hint: Forwarded unchanged to the native routine as a hint
            for reserving space in its output vector (0 = no hint).

    Returns:
        The factor list produced by the native ``factorize_file`` binding,
        as (position, length, ref) tuples.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    source = Path(filepath)
    if source.exists():
        # The native binding is called with a plain string path.
        return _factorize_file(str(source), reserve_hint)

    raise FileNotFoundError(f"File not found: {source}")




[docs]
def count_factors(data: Union[str, bytes], validate: bool = True) -> int:
    """
    Return how many LZ factors an in-memory input has, without building the
    factor list on the Python side.

    Args:
        data: Text or bytes to analyze.
        validate: When True (the default), ``data`` is first passed through
            ``validate_input`` and the value it returns is what gets
            counted. When False, ``data`` is handed to the native routine
            unchanged.

    Returns:
        The factor count reported by the native ``count_factors`` binding.

    Raises:
        ValueError: If validation rejects the input.
        TypeError: If the input type is not supported.
    """
    # Only replace the caller's object when validation was requested.
    payload = validate_input(data) if validate else data
    return _count_factors(payload)




[docs]
def count_factors_file(filepath: Union[str, Path], validate: bool = True) -> int:
    """
    Return how many LZ factors a file's contents have, without building the
    factor list on the Python side.

    Args:
        filepath: Location of the input file, as a string or ``Path``.
        validate: Currently unused by this wrapper: no Python-side
            validation of the file contents is performed, whatever its
            value.

    Returns:
        The factor count reported by the native ``count_factors_file``
        binding.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.

    Any error raised by the native routine propagates unchanged.
    """
    source = Path(filepath)
    if source.exists():
        # The native binding is called with a plain string path.
        return _count_factors_file(str(source))

    raise FileNotFoundError(f"File not found: {source}")




[docs]
def write_factors_binary_file(
    data: Union[str, bytes],
    output_filepath: Union[str, Path]
) -> None:
    """
    Factorize an in-memory input and store the factors in a binary file.

    The input is always passed through ``validate_input`` first. Missing
    parent directories of the output path are created before the native
    writer is invoked.

    Args:
        data: Text or bytes to factorize.
        output_filepath: Destination of the binary factor file.

    Raises:
        ValueError: If validation rejects the input.
        TypeError: If the input type is not supported.
        OSError: If unable to write to the output file.
    """
    # Validate before touching the filesystem so rejected input creates
    # no directories.
    checked = validate_input(data)

    target = Path(output_filepath)
    target.parent.mkdir(parents=True, exist_ok=True)

    _write_factors_binary_file(checked, str(target))




[docs]
def factorize_with_info(data: Union[str, bytes], validate: bool = True) -> dict:
    """
    Factorize an input and bundle the factors with summary information.

    Args:
        data: Text or bytes to factorize.
        validate: When True (the default), ``data`` is first passed through
            ``validate_input``; the value it returns is then used for the
            factorization, the alphabet analysis and the size. When False,
            ``data`` is used unchanged.

    Returns:
        Dictionary containing:
        - 'factors': Result of the native ``factorize`` binding
        - 'alphabet_info': Result of ``analyze_alphabet`` on the input
        - 'input_size': ``len()`` of the (possibly validated) input
        - 'num_factors': Number of entries in 'factors'
    """
    payload = validate_input(data) if validate else data

    # Factorize first; the remaining entries are computed while the
    # result dictionary is built, in the order listed.
    factor_list = _factorize(payload)

    return {
        'factors': factor_list,
        'alphabet_info': analyze_alphabet(payload),
        'input_size': len(payload),
        'num_factors': len(factor_list),
    }