Examples and Usage

This page provides comprehensive examples demonstrating the noLZSS library’s capabilities, from basic string factorization to advanced genomics applications.

Basic Usage

String Factorization

import noLZSS

# Simple string factorization
text = "abcabcabc"
factors = noLZSS.factorize(text)
print(factors)
# Output: [(0, 1, 0), (1, 1, 1), (2, 1, 2), (3, 3, 0), (6, 3, 0)]

# Factorize bytes (recommended for non-ASCII text)
data = b"hello world hello"
factors = noLZSS.factorize(data)
print(f"Found {len(factors)} factors")

# Understanding factor format: (position, length, reference)
text = "abracadabra"
factors = noLZSS.factorize(text)
for i, (pos, length, ref) in enumerate(factors):
    if ref == 0:
        print(f"Factor {i}: New character '{text[pos]}' at position {pos}")
    else:
        substring = text[pos:pos+length]
        ref_substring = text[ref:ref+length]
        print(f"Factor {i}: '{substring}' at pos {pos}, references pos {ref} ('{ref_substring}')")

Enhanced Factorization with Metadata

# Get detailed analysis with factorization
result = noLZSS.factorize_with_info("the quick brown fox jumps over the lazy dog")
factors = result['factors']

print(f"Input text: '{result['input_text']}'")
print(f"Number of factors: {result['num_factors']}")
print(f"Input size: {result['input_size']} characters")
print(f"Compression ratio: {result['num_factors'] / result['input_size']:.3f}")

# Alphabet analysis
alphabet_info = result['alphabet_info']
print(f"\nAlphabet analysis:")
print(f"  Size: {alphabet_info['size']} unique characters")
print(f"  Characters: {alphabet_info['characters']}")
print(f"  Entropy: {alphabet_info['entropy']:.3f} bits")
print(f"  Most common: {alphabet_info['most_common'][:3]}")  # Top 3

File Processing

# Create a sample file for demonstration
sample_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 100
with open("sample.txt", "w") as f:
    f.write(sample_text)

# Process files efficiently
factors = noLZSS.factorize_file("sample.txt")
print(f"File factorization: {len(factors)} factors")

# Count factors without storing them (memory efficient for large files)
count = noLZSS.count_factors_file("sample.txt")
print(f"Factor count: {count}")

# Performance optimization with reserve hint
# If you know approximately how many factors to expect:
factors = noLZSS.factorize_file("sample.txt", reserve_hint=1000)
print(f"Optimized factorization: {len(factors)} factors")

# Clean up
import os
os.remove("sample.txt")

Input Validation and Error Handling

# Input validation examples
try:
    # Empty input
    factors = noLZSS.factorize("")
except ValueError as e:
    print(f"Empty input error: {e}")

try:
    # Invalid file path
    factors = noLZSS.factorize_file("nonexistent.txt")
except FileNotFoundError as e:
    print(f"File not found: {e}")

# Disable validation for performance (use with caution)
text = "valid input"
factors = noLZSS.factorize(text, validate=False)
print(f"Fast factorization: {len(factors)} factors")

Genomics Applications

DNA Sequence Analysis

import noLZSS.genomics
import os
from pathlib import Path

# Create sample DNA FASTA file
fasta_content = """>sequence1
ATCGATCGATCGATCG
>sequence2  
GCTAGCTAGCTAGCTA
>sequence3
AAATTTCCCGGG
"""

with open("sample_dna.fasta", "w") as f:
    f.write(fasta_content)

# Read and factorize nucleotide sequences
try:
    results = noLZSS.genomics.read_nucleotide_fasta("sample_dna.fasta")
    
    for seq_id, factors in results:
        print(f"\nSequence: {seq_id}")
        print(f"  Factors: {len(factors)}")
        print(f"  First few factors: {factors[:3]}")
        
except Exception as e:
    print(f"Error processing FASTA: {e}")

# Automatic sequence type detection
results = noLZSS.genomics.read_fasta_auto("sample_dna.fasta")
print(f"Auto-detected {len(results)} DNA sequences")

# Clean up
os.remove("sample_dna.fasta")

Protein Sequence Analysis

# Create sample protein FASTA file
protein_fasta = """>protein1
MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG
>protein2
ARNDCQEGHILKMFPSTWYV
"""

with open("sample_proteins.fasta", "w") as f:
    f.write(protein_fasta)

# Read protein sequences (returns sequences, not factors)
try:
    results = noLZSS.genomics.read_protein_fasta("sample_proteins.fasta")
    
    for seq_id, sequence in results:
        print(f"Protein: {seq_id}")
        print(f"  Length: {len(sequence)} amino acids")
        print(f"  First 20 AA: {sequence[:20]}")
        
        # Factorize the protein sequence
        factors = noLZSS.factorize(sequence)
        print(f"  Factors: {len(factors)}")
        
except Exception as e:
    print(f"Error processing protein FASTA: {e}")

# Clean up
os.remove("sample_proteins.fasta")

Performance Optimization

Advanced Features

Binary Factor Storage

# Create test data
test_text = "the quick brown fox jumps over the lazy dog" * 50
with open("input.txt", "w") as f:
    f.write(test_text)

# Write factors directly to binary file (memory efficient)
num_factors = noLZSS.write_factors_binary_file("input.txt", "factors.bin")
print(f"Wrote {num_factors} factors to binary file")

# Read factors back from binary file
factors = noLZSS.read_factors_binary_file("factors.bin")
print(f"Read {len(factors)} factors from binary file")

# Verify integrity
factors_direct = noLZSS.factorize_file("input.txt")
assert factors == factors_direct, "Binary storage integrity check failed"
print("Binary storage integrity verified!")

# Check file sizes
import os
text_size = os.path.getsize("input.txt")
binary_size = os.path.getsize("factors.bin")
print(f"Original text: {text_size} bytes")
print(f"Binary factors: {binary_size} bytes")
print(f"Storage ratio: {binary_size / text_size:.3f}")

# Clean up
os.remove("input.txt")
os.remove("factors.bin")

Benchmarking and Analysis

Plotting and Visualization

Performance Comparison

Advanced Genomics Example

This examples documentation provides comprehensive, working code samples that demonstrate all major features of the noLZSS library, from basic usage to advanced genomics applications and performance optimization techniques.