Suffix Tree
A suffix tree is a compressed trie containing all the suffixes of a given text. It provides efficient solutions for many string problems including pattern matching, longest common substring detection, and repeated substring analysis. This data structure is fundamental in bioinformatics, text processing, and string algorithms.
Features
- Compressed trie structure - space-efficient representation of all suffixes
- Linear time pattern matching - find patterns in O(m) time where m is pattern length
- Multiple occurrence detection - find all positions where a pattern occurs
- Longest common substring - efficiently find shared substrings between texts
- Repeated substring analysis - identify all repeated patterns in text
- Suffix enumeration - access all suffixes of the original text
- Space efficient - linear space complexity O(n) where n is text length
- Versatile queries - supports various string analysis operations
Time Complexities
| Operation | Time Complexity | Description |
|---|---|---|
| Construction | O(n²) naive, O(n) Ukkonen's | Build tree for text of length n |
| Pattern Search | O(m) | Search for pattern of length m |
| Find Occurrences | O(m + k) | m = pattern length, k = number of occurrences |
| Longest Common Substring | O(n + m) | Between texts of length n and m |
| Get Suffixes | O(n²) | List all n suffixes (total output is O(n²) characters) |
| Repeated Substrings | O(n + z) | Traverse the tree in O(n); z = total size of the output |
| Text Length | O(1) | Get length of original text |
| Contains Check | O(m) | Check if pattern exists |
Space Complexity: O(n) where n is the length of the input text
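To see where the naive O(n²) bound comes from, note that the n suffixes of a text have Θ(n²) characters in total, and the naive approach inserts each suffix character by character. A minimal sketch of that naive build, using a plain nested-dict trie (illustrative only — not this library's compressed node representation):

```python
def naive_suffix_trie(text: str) -> dict:
    """Build an uncompressed suffix trie as nested dicts.

    Inserting all n suffixes touches O(n^2) characters in total,
    which is why naive construction is O(n^2).
    """
    root: dict = {}
    for i in range(len(text)):   # one insertion per suffix
        node = root
        for ch in text[i:]:      # O(n - i) characters for this suffix
            node = node.setdefault(ch, {})
    return root

def trie_search(root: dict, pattern: str) -> bool:
    """O(m) descent: follow one edge per pattern character."""
    node = root
    for ch in pattern:
        if ch not in node:
            return False
        node = node[ch]
    return True

root = naive_suffix_trie("banana")
print(trie_search(root, "ana"))  # True
print(trie_search(root, "cab"))  # False
```

A compressed suffix tree stores the same information but merges single-child chains into labeled edges, which is what brings the node count down to O(n).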
Basic Usage
Creating and Building
from algo.data_structs.strings.suffix_tree import SuffixTree, SuffixTreeNode
# Create empty suffix tree
tree = SuffixTree()
print(tree.is_empty()) # True
print(tree.size()) # 0
print(len(tree)) # 0
# Build tree with text
tree.build("banana")
print(tree.is_empty()) # False
print(tree.size()) # 10 (number of nodes)
print(len(tree)) # 6 (length of original text)
print(tree.get_text()) # "banana"
# String representations
print(str(tree)) # SuffixTree(text='banana', nodes=10)
print(repr(tree)) # SuffixTree(text_length=6, nodes=10)
Creating Nodes
# Create suffix tree nodes
root = SuffixTreeNode()
print(root.start) # -1 (root has no substring)
print(root.end) # None
print(root.is_leaf()) # True (no children initially)
print(root.has_children()) # False
# Internal node (substring from index 2 to 4)
internal = SuffixTreeNode(2, 4)
print(internal.start) # 2
print(internal.end) # 4
print(str(internal)) # SuffixTreeNode(start=2, end=4, leaf=True)
# Leaf node (extends to end of text)
leaf = SuffixTreeNode(3, None)
print(leaf.start) # 3
print(leaf.end) # None (extends to global end)
# Node with edge substring
text = "banana"
edge_text = internal.get_edge_substring(text + "$")
edge_length = internal.edge_length(text + "$")
print(f"Edge substring: '{edge_text}'") # 'nan'
print(f"Edge length: {edge_length}") # 3
Basic Text Operations
# Build tree and check basic properties
tree = SuffixTree()
tree.build("abcabc")
print(f"Text: '{tree.get_text()}'") # 'abcabc'
print(f"Empty: {tree.is_empty()}") # False
print(f"Size: {tree.size()}") # Number of nodes
print(f"Length: {len(tree)}") # 6
# Check if patterns exist
print(f"Contains 'abc': {'abc' in tree}") # True
print(f"Contains 'cab': {'cab' in tree}") # False
print(f"Contains 'bc': {'bc' in tree}") # True
Text Construction
Single Text Building
tree = SuffixTree()
# Build with simple text
tree.build("hello")
print(f"Text: '{tree.get_text()}'") # 'hello'
print(f"Nodes: {tree.size()}") # Number of internal nodes + leaves
# Build with repeated patterns
tree.build("abcabc")
print(f"Text: '{tree.get_text()}'") # 'abcabc'
# Repeated patterns make the tree more compact because suffixes share edges
# Build with single character
tree.build("a")
print(f"Text: '{tree.get_text()}'") # 'a'
print(f"Nodes: {tree.size()}") # Minimal tree
Handling Different Text Types
# Empty text
empty_tree = SuffixTree()
empty_tree.build("")
print(f"Empty: {empty_tree.is_empty()}") # True
print(f"Text: '{empty_tree.get_text()}'") # ''
# Single repeated character
repeat_tree = SuffixTree()
repeat_tree.build("aaaa")
print(f"Text: '{repeat_tree.get_text()}'") # 'aaaa'
# Structure is compacted due to the repetition
# Complex text with various patterns
complex_tree = SuffixTree()
complex_tree.build("abracadabra")
print(f"Text: '{complex_tree.get_text()}'") # 'abracadabra'
print(f"Nodes: {complex_tree.size()}") # More complex structure
Rebuilding Trees
tree = SuffixTree()
# Build with first text
tree.build("abc")
print(f"First text: '{tree.get_text()}'") # 'abc'
# Rebuild with different text (replaces previous)
tree.build("xyz")
print(f"New text: '{tree.get_text()}'") # 'xyz'
# The previous tree structure is discarded and rebuilt
# Build with longer text
tree.build("abcdefghijk")
print(f"Longer text: '{tree.get_text()}'") # 'abcdefghijk'
# Tree size scales with text complexity
Pattern Matching
Basic Pattern Search
tree = SuffixTree()
tree.build("banana")
# Search for various patterns
print(f"Search 'an': {tree.search('an')}") # True
print(f"Search 'ana': {tree.search('ana')}") # True
print(f"Search 'ban': {tree.search('ban')}") # True
print(f"Search 'xyz': {tree.search('xyz')}") # False
# Using 'in' operator
print(f"'ana' in tree: {'ana' in tree}") # True
print(f"'cab' in tree: {'cab' in tree}") # False
# Edge cases
print(f"Search '': {tree.search('')}") # False
print(f"Search 'banana': {tree.search('banana')}") # True (full text)
print(f"Search 'bananas': {tree.search('bananas')}") # False (longer than text)
Finding All Occurrences
tree = SuffixTree()
tree.build("abracadabra")
# Find all occurrences of patterns
a_positions = tree.find_occurrences("a")
print(f"'a' occurs at positions: {a_positions}")
# [0, 3, 5, 7, 10] (all positions where 'a' appears)
abr_positions = tree.find_occurrences("abr")
print(f"'abr' occurs at positions: {abr_positions}")
# [0, 7] ('abr' appears at the start and again at index 7)
br_positions = tree.find_occurrences("br")
print(f"'br' occurs at positions: {br_positions}")
# [1, 8] ('br' in 'abr' patterns)
# Non-existent pattern
xyz_positions = tree.find_occurrences("xyz")
print(f"'xyz' occurs at positions: {xyz_positions}")
# [] (empty list)
# Single character patterns
b_positions = tree.find_occurrences("b")
print(f"'b' occurs at positions: {b_positions}")
# [1, 8] (positions of 'b')
Pattern Matching Edge Cases
tree = SuffixTree()
tree.build("aaaaaa")
# Overlapping patterns
aa_positions = tree.find_occurrences("aa")
print(f"'aa' in 'aaaaaa' at: {aa_positions}")
# [0, 1, 2, 3, 4] (overlapping occurrences)
aaa_positions = tree.find_occurrences("aaa")
print(f"'aaa' in 'aaaaaa' at: {aaa_positions}")
# [0, 1, 2, 3] (overlapping triple-a's)
# Full text pattern
full_positions = tree.find_occurrences("aaaaaa")
print(f"Full text occurs at: {full_positions}")
# [0] (only at the beginning)
# Pattern longer than text
long_positions = tree.find_occurrences("aaaaaaa")
print(f"Longer pattern occurs at: {long_positions}")
# [] (impossible)
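These overlap-inclusive results can be cross-checked against a simple `str.find` loop — a brute-force oracle sketch, not part of the library:

```python
def brute_force_occurrences(text: str, pattern: str) -> list:
    """All 0-based start positions of pattern in text, overlaps included."""
    if not pattern:
        return []
    positions, start = [], 0
    while (idx := text.find(pattern, start)) != -1:
        positions.append(idx)
        start = idx + 1  # advance by one so overlapping matches are kept
    return positions

print(brute_force_occurrences("aaaaaa", "aa"))        # [0, 1, 2, 3, 4]
print(brute_force_occurrences("abracadabra", "abr"))  # [0, 7]
```

The oracle runs in O(n·m) per pattern, versus the tree's O(m + k), but it is handy for validating outputs on small inputs.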
Suffix Operations
Getting All Suffixes
tree = SuffixTree()
tree.build("abc")
# Get all suffixes
suffixes = tree.get_suffixes()
print(f"Suffixes of 'abc': {suffixes}")
# ['abc', 'bc', 'c']
# More complex example
tree.build("banana")
suffixes = tree.get_suffixes()
print(f"Suffixes of 'banana': {suffixes}")
# ['banana', 'anana', 'nana', 'ana', 'na', 'a']
print(f"Number of suffixes: {len(suffixes)}") # 6 (same as text length)
# Verify each suffix
for i, suffix in enumerate(suffixes):
    print(f"Suffix {i}: '{suffix}'")
Suffix Properties
tree = SuffixTree()
tree.build("abcdef")
suffixes = tree.get_suffixes()
# Properties of suffixes
print(f"Longest suffix: '{max(suffixes, key=len)}'") # 'abcdef'
print(f"Shortest suffix: '{min(suffixes, key=len)}'") # 'f'
# Verify suffix property: each suffix is a tail of the original text
original_text = tree.get_text()
for suffix in suffixes:
    assert original_text.endswith(suffix), f"'{suffix}' is not a suffix of '{original_text}'"
print("All suffix properties verified!")
# Suffix relationships
for i, suffix in enumerate(suffixes):
    # Each suffix (except the last) ends with the next, shorter suffix
    if i < len(suffixes) - 1:
        next_suffix = suffixes[i + 1]
        assert suffix.endswith(next_suffix), "Suffix relationship broken"
print("Suffix containment relationships verified!")
Advanced String Analysis
Longest Common Substring
tree = SuffixTree()
# Build tree with first text
tree.build("abcdef")
# Find longest common substring with various texts
lcs1 = tree.longest_common_substring("defghi")
print(f"LCS of 'abcdef' and 'defghi': '{lcs1}'") # 'def'
lcs2 = tree.longest_common_substring("xyzabc")
print(f"LCS of 'abcdef' and 'xyzabc': '{lcs2}'") # 'abc'
lcs3 = tree.longest_common_substring("123456")
print(f"LCS of 'abcdef' and '123456': '{lcs3}'") # '' (no common substring)
# More complex example
tree.build("banana")
lcs4 = tree.longest_common_substring("ananas")
print(f"LCS of 'banana' and 'ananas': '{lcs4}'") # 'anana'
# Identical texts
lcs5 = tree.longest_common_substring("banana")
print(f"LCS of 'banana' and 'banana': '{lcs5}'") # 'banana'
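As a cross-check, the longest common substring can also be computed with classic dynamic programming in O(n·m) time. A sketch independent of the suffix tree (the helper name `lcs_brute` is illustrative), so the two results can be compared:

```python
def lcs_brute(s1: str, s2: str) -> str:
    """Longest common substring via dynamic programming (O(n*m) time)."""
    best_len, best_end = 0, 0
    # prev[j] = length of the common suffix of s1[:i-1] and s2[:j]
    prev = [0] * (len(s2) + 1)
    for i in range(1, len(s1) + 1):
        curr = [0] * (len(s2) + 1)
        for j in range(1, len(s2) + 1):
            if s1[i - 1] == s2[j - 1]:
                curr[j] = prev[j - 1] + 1
                if curr[j] > best_len:
                    best_len, best_end = curr[j], i
        prev = curr
    return s1[best_end - best_len:best_end]

print(lcs_brute("abcdef", "defghi"))  # 'def'
print(lcs_brute("banana", "ananas"))  # 'anana'
```

The suffix-tree version reaches O(n + m) by matching the second text against the first text's tree, which matters once the inputs grow beyond toy sizes.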
Finding Repeated Substrings
tree = SuffixTree()
tree.build("abracadabra")
# Find repeated substrings of different minimum lengths
repeated_2 = tree.get_repeated_substrings(min_length=2)
print(f"Repeated substrings (≥2 chars): {sorted(repeated_2)}")
# ['ab', 'abr', 'abra', 'br', 'bra', 'ra'] (every substring that occurs at least twice)
repeated_3 = tree.get_repeated_substrings(min_length=3)
print(f"Repeated substrings (≥3 chars): {sorted(repeated_3)}")
# ['abr', 'abra', 'bra'] (longer repeated patterns)
repeated_4 = tree.get_repeated_substrings(min_length=4)
print(f"Repeated substrings (≥4 chars): {sorted(repeated_4)}")
# ['abra'] ('abra' occurs at positions 0 and 7)
# Example with highly repetitive text
tree.build("ababab")
repeated_ab = tree.get_repeated_substrings(min_length=2)
print(f"Repeated in 'ababab': {sorted(repeated_ab)}")
# ['ab', 'aba', 'abab', 'ba', 'bab'] ('baba' occurs only once, so it is excluded)
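The expected outputs above can be verified against a brute-force oracle that simply counts every substring — a quadratic sketch, fine for short test strings:

```python
from collections import Counter

def repeated_substrings_brute(text: str, min_length: int = 2) -> set:
    """All distinct substrings of length >= min_length occurring >= 2 times."""
    counts = Counter(
        text[i:j]
        for i in range(len(text))
        for j in range(i + min_length, len(text) + 1)
    )
    return {s for s, count in counts.items() if count >= 2}

print(sorted(repeated_substrings_brute("ababab")))
# ['ab', 'aba', 'abab', 'ba', 'bab']
```

Note that occurrences are counted by start position, so overlapping repeats (like 'aba' at 0 and 2) count as repeated.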
Pattern Analysis
tree = SuffixTree()
tree.build("mississippi")
# Analyze pattern frequencies
patterns = ["i", "s", "is", "si", "iss", "ssi"]
for pattern in patterns:
    occurrences = tree.find_occurrences(pattern)
    print(f"'{pattern}' appears {len(occurrences)} times at positions {occurrences}")
# Find all unique substrings of a certain length
def get_substrings_of_length(tree, length):
    """Get all unique substrings of the specified length."""
    text = tree.get_text()
    substrings = set()
    for i in range(len(text) - length + 1):
        substring = text[i:i + length]
        if tree.search(substring):  # Verify it exists (should always be true)
            substrings.add(substring)
    return sorted(substrings)
# Get all unique 3-character substrings
substrings_3 = get_substrings_of_length(tree, 3)
print(f"All 3-char substrings: {substrings_3}")
# Verify using suffix tree
for substring in substrings_3:
    positions = tree.find_occurrences(substring)
    print(f" '{substring}': {len(positions)} occurrences")
Integration with Other Data Structures
Converting to Trie
tree = SuffixTree()
tree.build("abc")
# Convert suffix tree to trie structure
suffix_trie = tree.to_trie_structure()
print(f"Suffix trie size: {suffix_trie.size()}") # 3 (number of suffixes)
# The trie contains all suffixes as words
print(f"Trie words: {suffix_trie.get_all_words()}") # ['abc', 'bc', 'c']
# Verify each suffix is in the trie
suffixes = tree.get_suffixes()
for suffix in suffixes:
    assert suffix_trie.search(suffix), f"'{suffix}' not found in trie"
print("All suffixes successfully converted to trie!")
# Use trie operations on suffixes
print(f"Suffixes starting with 'a': {suffix_trie.get_words_with_prefix('a')}")
print(f"Suffixes starting with 'b': {suffix_trie.get_words_with_prefix('b')}")
Suffix-based Autocomplete
tree = SuffixTree()
tree.build("programming")
# Use suffix tree for autocomplete (finds suffixes that start with prefix)
suggestions_p = tree.autocomplete("p", max_suggestions=5)
print(f"Autocomplete for 'p': {suggestions_p}")
# Suggestions based on suffixes starting with 'p'
suggestions_pr = tree.autocomplete("pr", max_suggestions=3)
print(f"Autocomplete for 'pr': {suggestions_pr}")
# More specific suggestions
suggestions_xyz = tree.autocomplete("xyz", max_suggestions=5)
print(f"Autocomplete for 'xyz': {suggestions_xyz}")
# [] (no suffixes start with 'xyz')
# This is different from traditional autocomplete as it works with suffixes
# rather than a dictionary of complete words
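For reference, the suffix-based suggestions above can be reproduced without a tree at all. A plain-Python sketch (the name `suffix_autocomplete` and the sorted ordering are illustrative assumptions, not the library's implementation):

```python
def suffix_autocomplete(text: str, prefix: str, max_suggestions: int = 10) -> list:
    """Return up to max_suggestions suffixes of text that start with prefix."""
    suffixes = [text[i:] for i in range(len(text))]
    matches = [s for s in suffixes if s.startswith(prefix)]
    return sorted(matches)[:max_suggestions]

print(suffix_autocomplete("programming", "r"))
# ['ramming', 'rogramming'] (suffixes starting at the two 'r' positions)
```

The suffix tree achieves the same result without materializing all O(n²) suffix characters: it descends the prefix in O(m) and enumerates the subtree below.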
Working with Different Text Types
DNA Sequence Analysis
def analyze_dna_sequence():
    """Analyze DNA sequence using suffix tree."""
    tree = SuffixTree()
    # Build tree with DNA sequence
    dna_sequence = "ATCGATCGATCG"
    tree.build(dna_sequence)
    print(f"DNA Sequence: {dna_sequence}")
    print(f"Sequence length: {len(tree)}")
    # Find repeated sequences (important in genomics)
    repeated = tree.get_repeated_substrings(min_length=3)
    print(f"Repeated sequences (≥3 bp): {sorted(repeated)}")
    # Look for specific patterns
    patterns = ["ATG", "TAA", "TGA", "TAG"]  # Start and stop codons
    for pattern in patterns:
        occurrences = tree.find_occurrences(pattern)
        if occurrences:
            print(f"Found {pattern} at positions: {occurrences}")
    # Find palindromic sequences by checking reverse complements
    def reverse_complement(seq):
        complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
        return ''.join(complement.get(base, base) for base in reversed(seq))
    # Check for palindromic sequences
    for length in range(4, 8):  # Check 4-7 bp palindromes
        for i in range(len(dna_sequence) - length + 1):
            subseq = dna_sequence[i:i + length]
            rev_comp = reverse_complement(subseq)
            if tree.search(rev_comp) and subseq != rev_comp:
                print(f"Found palindromic sequence: {subseq} (reverse complement: {rev_comp})")
# Run DNA analysis
analyze_dna_sequence()
Text Processing Applications
def text_analysis_toolkit():
    """Demonstrate text analysis using suffix trees."""
    # Analyze a sentence
    text = "the quick brown fox jumps over the lazy dog the fox is quick"
    # Remove spaces and convert to lowercase for analysis
    clean_text = text.replace(" ", "").lower()
    tree = SuffixTree()
    tree.build(clean_text)
    print(f"Analyzing text: '{text}'")
    print(f"Clean text: '{clean_text}'")
    print(f"Text length: {len(tree)} characters")
    # Find the most common letters
    letter_counts = {}
    for letter in "abcdefghijklmnopqrstuvwxyz":
        occurrences = tree.find_occurrences(letter)
        if occurrences:
            letter_counts[letter] = len(occurrences)
    print("\nLetter frequencies:")
    for letter, count in sorted(letter_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f" '{letter}': {count} times")
    # Find repeated patterns
    repeated = tree.get_repeated_substrings(min_length=3)
    print(f"\nRepeated patterns (≥3 chars): {sorted(repeated)[:10]}")
    # Analyze specific patterns (already space-free, so no cleaning needed)
    common_patterns = ["the", "qu", "ck", "ox", "og"]
    print("\nPattern analysis:")
    for pattern in common_patterns:
        occurrences = tree.find_occurrences(pattern)
        print(f" '{pattern}': {len(occurrences)} occurrences")
# Run text analysis
text_analysis_toolkit()
Bioinformatics Applications
def protein_sequence_analysis():
    """Analyze protein sequences using suffix trees."""
    # Example protein sequence (amino acid single-letter codes)
    protein = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"
    tree = SuffixTree()
    tree.build(protein)
    print(f"Protein sequence: {protein}")
    print(f"Length: {len(tree)} amino acids")
    # Find repeated domains (important for protein function)
    repeated_domains = tree.get_repeated_substrings(min_length=4)
    print(f"Repeated domains (≥4 AA): {sorted(repeated_domains)}")
    # Look for common motifs
    motifs = ["RGD", "PEST", "KDEL"]  # Common protein motifs
    for motif in motifs:
        occurrences = tree.find_occurrences(motif)
        if occurrences:
            print(f"Found motif {motif} at positions: {occurrences}")
    # Analyze amino acid composition
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    composition = {}
    for aa in amino_acids:
        occurrences = tree.find_occurrences(aa)
        if occurrences:
            composition[aa] = len(occurrences)
    print("\nAmino acid composition (top 5):")
    for aa, count in sorted(composition.items(), key=lambda x: x[1], reverse=True)[:5]:
        percentage = (count / len(protein)) * 100
        print(f" {aa}: {count} ({percentage:.1f}%)")
# Run protein analysis
protein_sequence_analysis()
Error Handling and Edge Cases
Empty and Invalid Inputs
# Test with empty inputs
empty_tree = SuffixTree()
# Operations on empty tree
print(f"Empty tree search: {empty_tree.search('any')}") # False
print(f"Empty tree occurrences: {empty_tree.find_occurrences('a')}") # []
print(f"Empty tree suffixes: {empty_tree.get_suffixes()}") # []
print(f"Empty tree size: {empty_tree.size()}") # 0
print(f"Empty tree is empty: {empty_tree.is_empty()}") # True
# Build with empty string
empty_tree.build("")
print(f"After empty build - is empty: {empty_tree.is_empty()}") # True
print(f"After empty build - text: '{empty_tree.get_text()}'") # ''
# Test edge cases
edge_tree = SuffixTree()
edge_tree.build("a")
print(f"Single char - search 'a': {edge_tree.search('a')}") # True
print(f"Single char - search 'b': {edge_tree.search('b')}") # False
print(f"Single char - suffixes: {edge_tree.get_suffixes()}") # ['a']
Large Text Handling
def test_large_text_performance():
    """Test suffix tree with larger texts."""
    import time
    # Generate test text with patterns
    base_pattern = "abcde"
    large_text = base_pattern * 100  # 500 characters
    print(f"Testing with text of length: {len(large_text)}")
    # Build tree
    start_time = time.time()
    tree = SuffixTree()
    tree.build(large_text)
    build_time = time.time() - start_time
    print(f"Tree built in {build_time:.4f} seconds")
    print(f"Tree has {tree.size()} nodes")
    # Test pattern search
    start_time = time.time()
    result = tree.search("abcde")
    search_time = time.time() - start_time
    print(f"Pattern search took {search_time:.6f} seconds")
    print(f"Pattern found: {result}")
    # Test finding occurrences
    start_time = time.time()
    occurrences = tree.find_occurrences("abc")
    occurrence_time = time.time() - start_time
    print(f"Finding {len(occurrences)} occurrences took {occurrence_time:.6f} seconds")
    # Test repeated substring detection
    start_time = time.time()
    repeated = tree.get_repeated_substrings(min_length=3)
    repeated_time = time.time() - start_time
    print(f"Finding repeated substrings took {repeated_time:.6f} seconds")
    print(f"Found {len(repeated)} repeated patterns")
# Uncomment to run performance test
# test_large_text_performance()
Special Characters and Unicode
# Test with special characters
special_tree = SuffixTree()
special_text = "hello, world! 123 @#$%"
special_tree.build(special_text)
print(f"Special text: '{special_tree.get_text()}'")
print(f"Contains 'hello': {special_tree.search('hello')}") # True
print(f"Contains '!': {special_tree.search('!')}") # True
print(f"Contains '123': {special_tree.search('123')}") # True
print(f"Contains '@#$': {special_tree.search('@#$')}") # True
# Test with Unicode characters
unicode_tree = SuffixTree()
unicode_text = "café naïve résumé"
unicode_tree.build(unicode_text)
print(f"Unicode text: '{unicode_tree.get_text()}'")
print(f"Contains 'café': {unicode_tree.search('café')}") # True
print(f"Contains 'ñ': {unicode_tree.search('ñ')}") # False ('ñ' does not appear in the text)
print(f"Contains 'é': {unicode_tree.search('é')}") # True
# Get Unicode suffixes
unicode_suffixes = unicode_tree.get_suffixes()
print(f"Unicode suffixes: {unicode_suffixes}")
Performance Considerations and Best Practices
Memory Usage Optimization
def analyze_memory_usage():
    """Analyze memory usage patterns."""
    # Compare trees with different characteristics
    # Tree with no repetition
    unique_tree = SuffixTree()
    unique_tree.build("abcdefghijk")
    # Tree with high repetition
    repeat_tree = SuffixTree()
    repeat_tree.build("ababababab")
    print("Memory usage comparison:")
    print(f"Unique text tree: {unique_tree.size()} nodes for {len(unique_tree)} chars")
    print(f"Repetitive text tree: {repeat_tree.size()} nodes for {len(repeat_tree)} chars")
    # Show compression benefits
    ratio_unique = unique_tree.size() / len(unique_tree)
    ratio_repeat = repeat_tree.size() / len(repeat_tree)
    print("Node-to-character ratio:")
    print(f" Unique text: {ratio_unique:.2f}")
    print(f" Repetitive text: {ratio_repeat:.2f}")
    print(f" Compression benefit: {ratio_unique / ratio_repeat:.2f}x")
# analyze_memory_usage()
Usage Guidelines
def suffix_tree_best_practices():
    """Demonstrate best practices for suffix tree usage."""
    print("Suffix Tree Best Practices:")
    print("=" * 50)
    # 1. Use for appropriate problems
    print("1. Use suffix trees when you need:")
    print(" - Fast pattern matching in fixed text")
    print(" - Multiple pattern searches in same text")
    print(" - Longest common substring problems")
    print(" - Repeated substring analysis")
    # 2. Consider text characteristics
    print("\n2. Text characteristics matter:")
    tree_short = SuffixTree()
    tree_short.build("abc")
    tree_repetitive = SuffixTree()
    tree_repetitive.build("abcabcabc")
    print(f" Short text: {tree_short.size()} nodes for '{tree_short.get_text()}'")
    print(f" Repetitive text: {tree_repetitive.size()} nodes for '{tree_repetitive.get_text()}'")
    # 3. Preprocessing vs query trade-offs
    print("\n3. Preprocessing vs Query trade-offs:")
    print(" - Build once, query many times: Excellent")
    print(" - Single query on text: Consider simpler algorithms")
    print(" - Frequently changing text: Not ideal")
    # 4. Memory considerations
    print("\n4. Memory considerations:")
    print(" - Linear space O(n) but with significant constants")
    print(" - Best for texts with repetitive patterns")
    print(" - Consider alternatives for very large texts without patterns")
    return {
        'short_tree_efficiency': tree_short.size() / len(tree_short),
        'repetitive_tree_efficiency': tree_repetitive.size() / len(tree_repetitive)
    }
# Example usage
practices_results = suffix_tree_best_practices()
print(f"\nEfficiency ratios: {practices_results}")
Practical Applications
Document Similarity Detection
class DocumentSimilarity:
    """Compare document similarity using suffix trees."""

    def __init__(self):
        self.documents = {}
        self.trees = {}

    def add_document(self, doc_id: str, text: str):
        """Add a document for comparison."""
        # Clean text (remove spaces, lowercase)
        clean_text = ''.join(text.lower().split())
        self.documents[doc_id] = clean_text
        # Build suffix tree
        tree = SuffixTree()
        tree.build(clean_text)
        self.trees[doc_id] = tree

    def find_similarity(self, doc1_id: str, doc2_id: str) -> dict:
        """Find similarity between two documents."""
        if doc1_id not in self.trees or doc2_id not in self.trees:
            return {"error": "Document not found"}
        tree1 = self.trees[doc1_id]
        text2 = self.documents[doc2_id]
        # Find longest common substring
        lcs = tree1.longest_common_substring(text2)
        # Calculate similarity metrics
        len1 = len(self.documents[doc1_id])
        len2 = len(text2)
        lcs_len = len(lcs)
        similarity_ratio = (2 * lcs_len) / (len1 + len2) if (len1 + len2) > 0 else 0
        return {
            "longest_common_substring": lcs,
            "lcs_length": lcs_len,
            "similarity_ratio": similarity_ratio,
            "doc1_length": len1,
            "doc2_length": len2
        }

    def find_common_patterns(self, doc1_id: str, doc2_id: str, min_length: int = 3) -> list:
        """Find all common patterns between documents."""
        if doc1_id not in self.trees or doc2_id not in self.trees:
            return []
        tree1 = self.trees[doc1_id]
        text2 = self.documents[doc2_id]
        common_patterns = []
        # Check all substrings of doc2 against doc1's suffix tree
        for i in range(len(text2)):
            for j in range(i + min_length, len(text2) + 1):
                pattern = text2[i:j]
                if tree1.search(pattern):
                    common_patterns.append(pattern)
        # Remove duplicates and sort by length
        unique_patterns = list(set(common_patterns))
        return sorted(unique_patterns, key=len, reverse=True)
# Example usage
similarity_detector = DocumentSimilarity()
# Add documents
doc1 = "The quick brown fox jumps over the lazy dog"
doc2 = "A quick brown fox leaps over a sleeping dog"
doc3 = "The cat sat on the mat"
similarity_detector.add_document("doc1", doc1)
similarity_detector.add_document("doc2", doc2)
similarity_detector.add_document("doc3", doc3)
# Compare documents
sim_1_2 = similarity_detector.find_similarity("doc1", "doc2")
print(f"Similarity between doc1 and doc2:")
print(f" LCS: '{sim_1_2['longest_common_substring']}'")
print(f" Similarity ratio: {sim_1_2['similarity_ratio']:.3f}")
sim_1_3 = similarity_detector.find_similarity("doc1", "doc3")
print(f"\nSimilarity between doc1 and doc3:")
print(f" LCS: '{sim_1_3['longest_common_substring']}'")
print(f" Similarity ratio: {sim_1_3['similarity_ratio']:.3f}")
# Find common patterns
patterns_1_2 = similarity_detector.find_common_patterns("doc1", "doc2", min_length=4)
print(f"\nCommon patterns (doc1, doc2): {patterns_1_2[:5]}") # Top 5
Plagiarism Detection System
class PlagiarismDetector:
    """Simple plagiarism detection using suffix trees."""

    def __init__(self, min_match_length: int = 5):
        self.min_match_length = min_match_length
        self.reference_tree = None
        self.reference_text = ""

    def set_reference_document(self, text: str):
        """Set the reference document to check against."""
        # Clean and prepare text
        self.reference_text = ''.join(text.lower().split())
        # Build suffix tree
        self.reference_tree = SuffixTree()
        self.reference_tree.build(self.reference_text)

    def check_document(self, suspect_text: str) -> dict:
        """Check a document for potential plagiarism."""
        if not self.reference_tree:
            return {"error": "No reference document set"}
        # Clean suspect text
        clean_suspect = ''.join(suspect_text.lower().split())
        matches = []
        total_matched_chars = 0
        # Look for matching substrings
        i = 0
        while i < len(clean_suspect):
            # Try to find the longest match starting at position i
            longest_match = ""
            for j in range(i + self.min_match_length, len(clean_suspect) + 1):
                substring = clean_suspect[i:j]
                if self.reference_tree.search(substring):
                    longest_match = substring
                else:
                    break
            if longest_match:
                # Find where this match occurs in the reference
                positions = self.reference_tree.find_occurrences(longest_match)
                matches.append({
                    "text": longest_match,
                    "suspect_start": i,
                    "suspect_end": i + len(longest_match),
                    "reference_positions": positions,
                    "length": len(longest_match)
                })
                total_matched_chars += len(longest_match)
                i += len(longest_match)  # Skip past this match
            else:
                i += 1
        # Calculate plagiarism metrics
        plagiarism_ratio = total_matched_chars / len(clean_suspect) if clean_suspect else 0
        return {
            "matches": matches,
            "total_matched_chars": total_matched_chars,
            "suspect_length": len(clean_suspect),
            "plagiarism_ratio": plagiarism_ratio,
            "is_suspicious": plagiarism_ratio > 0.3  # 30% threshold
        }

    def generate_report(self, result: dict) -> str:
        """Generate a human-readable plagiarism report."""
        if "error" in result:
            return f"Error: {result['error']}"
        report = []
        report.append("PLAGIARISM DETECTION REPORT")
        report.append("=" * 30)
        report.append(f"Document length: {result['suspect_length']} characters")
        report.append(f"Matched characters: {result['total_matched_chars']}")
        report.append(f"Plagiarism ratio: {result['plagiarism_ratio']:.1%}")
        report.append(f"Status: {'SUSPICIOUS' if result['is_suspicious'] else 'CLEAR'}")
        if result['matches']:
            report.append(f"\nFound {len(result['matches'])} potential matches:")
            for i, match in enumerate(result['matches'][:5]):  # Show top 5
                report.append(f"{i+1}. '{match['text']}' (length: {match['length']})")
                report.append(f" At positions: {match['reference_positions']}")
        return "\n".join(report)
# Example usage
detector = PlagiarismDetector(min_match_length=4)
# Set reference document
reference = """
Artificial intelligence is transforming the world in unprecedented ways.
Machine learning algorithms are becoming more sophisticated each day.
Deep learning neural networks can now process complex patterns in data.
"""
detector.set_reference_document(reference)
# Check suspect documents
suspect1 = """
AI is changing our world significantly.
Machine learning algorithms are becoming more sophisticated each day.
This represents a major technological advancement.
"""
suspect2 = """
The weather today is quite nice.
I enjoy reading books in the afternoon.
Technology continues to evolve rapidly.
"""
# Run plagiarism detection
result1 = detector.check_document(suspect1)
result2 = detector.check_document(suspect2)
print("SUSPECT DOCUMENT 1:")
print(detector.generate_report(result1))
print("\n" + "="*50 + "\n")
print("SUSPECT DOCUMENT 2:")
print(detector.generate_report(result2))
Bioinformatics Sequence Alignment
class SequenceAligner:
    """Sequence alignment using suffix trees for bioinformatics."""

    def __init__(self):
        self.sequences = {}
        self.trees = {}

    def add_sequence(self, seq_id: str, sequence: str):
        """Add a biological sequence."""
        # Convert to uppercase for consistency
        clean_seq = sequence.upper().strip()
        self.sequences[seq_id] = clean_seq
        # Build suffix tree
        tree = SuffixTree()
        tree.build(clean_seq)
        self.trees[seq_id] = tree

    def find_common_motifs(self, seq1_id: str, seq2_id: str, min_length: int = 3) -> list:
        """Find common motifs between two sequences."""
        if seq1_id not in self.trees or seq2_id not in self.trees:
            return []
        tree1 = self.trees[seq1_id]
        seq2 = self.sequences[seq2_id]
        common_motifs = []
        # Find all common substrings
        for i in range(len(seq2)):
            for j in range(i + min_length, len(seq2) + 1):
                motif = seq2[i:j]
                if tree1.search(motif):
                    positions1 = tree1.find_occurrences(motif)
                    positions2 = [i]  # Position in seq2
                    common_motifs.append({
                        "motif": motif,
                        "length": len(motif),
                        "positions_seq1": positions1,
                        "positions_seq2": positions2
                    })
        # Remove duplicates and sort by length
        unique_motifs = {}
        for motif_info in common_motifs:
            motif = motif_info["motif"]
            if motif not in unique_motifs or len(motif_info["positions_seq1"]) > len(unique_motifs[motif]["positions_seq1"]):
                unique_motifs[motif] = motif_info
        return sorted(unique_motifs.values(), key=lambda x: x["length"], reverse=True)

    def analyze_repeats(self, seq_id: str, min_length: int = 3) -> dict:
        """Analyze repeated elements in a sequence."""
        if seq_id not in self.trees:
            return {"error": "Sequence not found"}
        tree = self.trees[seq_id]
        sequence = self.sequences[seq_id]
        # Find repeated substrings
        repeated = tree.get_repeated_substrings(min_length)
        # Analyze each repeat
        repeat_analysis = []
        for repeat in repeated:
            positions = tree.find_occurrences(repeat)
            repeat_analysis.append({
                "sequence": repeat,
                "length": len(repeat),
                "occurrences": len(positions),
                "positions": positions,
                "total_coverage": len(repeat) * len(positions)
            })
        # Sort by total coverage (length × occurrences)
        repeat_analysis.sort(key=lambda x: x["total_coverage"], reverse=True)
        # Calculate statistics
        total_sequence_length = len(sequence)
        total_repeat_coverage = sum(r["total_coverage"] for r in repeat_analysis)
        repeat_density = total_repeat_coverage / total_sequence_length if total_sequence_length > 0 else 0
        return {
            "sequence_id": seq_id,
            "sequence_length": total_sequence_length,
            "total_repeats": len(repeat_analysis),
            "repeat_density": repeat_density,
            "repeats": repeat_analysis
        }
# Example usage with DNA sequences
aligner = SequenceAligner()
# Add DNA sequences
seq1 = "ATCGATCGATCGAAATTTCGATCGATCG"
seq2 = "GCGATCGATCGCCGATTTAATCGATCGC"
seq3 = "ATCGATCGATCGATCGATCGATCG" # Highly repetitive
aligner.add_sequence("seq1", seq1)
aligner.add_sequence("seq2", seq2)
aligner.add_sequence("seq3", seq3)
# Find common motifs
print("Common motifs between seq1 and seq2:")
motifs = aligner.find_common_motifs("seq1", "seq2", min_length=4)
for motif in motifs[:5]:  # Show top 5
    print(f" {motif['motif']} (length: {motif['length']})")
    print(f" Seq1 positions: {motif['positions_seq1']}")
    print(f" Seq2 positions: {motif['positions_seq2']}")
# Analyze repeats
print(f"\nRepeat analysis for seq3:")
repeat_analysis = aligner.analyze_repeats("seq3", min_length=3)
print(f"Sequence length: {repeat_analysis['sequence_length']}")
print(f"Repeat density: {repeat_analysis['repeat_density']:.2%}")
print(f"Top repeated elements:")
for repeat in repeat_analysis['repeats'][:5]:
    print(f" '{repeat['sequence']}': {repeat['occurrences']} times, coverage: {repeat['total_coverage']}")
API Reference
Suffix Tree Implementation
A suffix tree is a compressed trie containing all the suffixes of a given text. It provides efficient solutions for many string problems including pattern matching, longest common substring, and repeated substring detection.
This implementation uses Ukkonen's algorithm for linear time construction.
Key Features:
- Pattern searching in O(m) time where m is pattern length
- Finding all occurrences of a pattern
- Longest common substring detection
- Space-efficient edge compression
- Suffix links for efficient construction
Time Complexities:
- Construction: O(n) where n is text length
- Pattern search: O(m) where m is pattern length
- Find occurrences: O(m + k) where k is number of occurrences
- Longest common substring: O(n + m) where n, m are text lengths
Space Complexity: O(n) where n is text length
SuffixTree()
Bases: SuffixTreeInterface
A suffix tree implementation using Ukkonen's algorithm.
This implementation provides efficient string matching and analysis operations by building a compressed trie of all suffixes of the input text.
Initialize an empty suffix tree.
__contains__(pattern)
Check if a pattern is in the text using 'in' operator.
__len__()
Return the length of the text.
__repr__()
Return detailed representation for debugging.
__str__()
Return string representation of the suffix tree.
autocomplete(prefix, max_suggestions=10)
Get autocomplete suggestions by treating the tree's suffixes as trie entries.
This demonstrates how suffix tree data can support trie-like operations.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| prefix | str | Prefix to get suggestions for | required |
| max_suggestions | int | Maximum number of suggestions | 10 |
Returns:
| Type | Description |
|---|---|
| List[str] | List of suffix-based suggestions |
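The suffix-based suggestion idea can be sketched without the tree itself: every suffix that begins with the prefix is a candidate. The helper below is a naive stand-in for illustration (its name is made up; the real method walks the compressed trie instead of materializing all suffixes).

```python
def autocomplete_naive(text: str, prefix: str, max_suggestions: int = 10) -> list[str]:
    # Every suffix that starts with the prefix is a candidate suggestion.
    suffixes = [text[i:] for i in range(len(text))]
    matches = sorted(s for s in suffixes if s.startswith(prefix))
    return matches[:max_suggestions]

print(autocomplete_naive("banana", "an"))  # ['ana', 'anana']
```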
build(text)
Build the suffix tree for the given text using Ukkonen's algorithm.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| text | str | Text to build suffix tree for | required |
find_occurrences(pattern)
Find all occurrences of a pattern in the text.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| pattern | str | Pattern to search for | required |
Returns:
| Type | Description |
|---|---|
| List[int] | List of starting positions where pattern occurs |
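The semantics are easiest to see against a brute-force equivalent, sketched below. Overlapping matches are reported; the tree achieves the same result in O(m + k) rather than O(n·m).

```python
def find_occurrences_naive(text: str, pattern: str) -> list[int]:
    # Check every starting position; the suffix tree avoids this full scan.
    if not pattern:
        return []
    return [i for i in range(len(text) - len(pattern) + 1)
            if text[i:i + len(pattern)] == pattern]

print(find_occurrences_naive("banana", "ana"))  # [1, 3] — overlapping matches count
```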
get_repeated_substrings(min_length=2)
Find all repeated substrings of at least the given length.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| min_length | int | Minimum length of substrings to find | 2 |
Returns:
| Type | Description |
|---|---|
| List[str] | List of repeated substrings |
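A repeated substring is any substring of at least `min_length` characters that occurs two or more times (overlaps included). A naive reference, for illustration only (the tree finds these by visiting internal nodes instead of counting all substrings):

```python
from collections import Counter

def repeated_substrings_naive(text: str, min_length: int = 2) -> list[str]:
    # Count every substring of length >= min_length; keep those seen twice or more.
    counts = Counter(
        text[i:j]
        for i in range(len(text))
        for j in range(i + min_length, len(text) + 1)
    )
    return sorted(s for s, c in counts.items() if c >= 2)

print(repeated_substrings_naive("banana"))  # ['an', 'ana', 'na']
```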
get_suffixes()
Get all suffixes of the text.
Returns:
| Type | Description |
|---|---|
| List[str] | List of all suffixes |
get_text()
Get the original text.
Returns:
| Type | Description |
|---|---|
| str | Original text used to build the tree |
is_empty()
Check if the suffix tree is empty.
Returns:
| Type | Description |
|---|---|
| bool | True if tree is empty, False otherwise |
longest_common_substring(other_text)
Find the longest common substring between the tree's text and another text.
This simplified implementation approximates the generalized suffix tree approach.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| other_text | str | Text to compare with | required |
Returns:
| Type | Description |
|---|---|
| str | Longest common substring |
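The expected result can be checked against the classic O(n·m) dynamic-programming formulation, sketched below; a true generalized suffix tree answers the same query in O(n + m). When several common substrings tie for the longest, this sketch returns the one whose match completes first.

```python
def longest_common_substring_naive(a: str, b: str) -> str:
    # dp[j] = length of the common suffix ending at a[i-1] and b[j-1].
    best_len, best_end = 0, 0
    prev = [0] * (len(b) + 1)
    for i in range(1, len(a) + 1):
        curr = [0] * (len(b) + 1)
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                curr[j] = prev[j - 1] + 1
                if curr[j] > best_len:
                    best_len, best_end = curr[j], i
        prev = curr
    return a[best_end - best_len:best_end]

print(longest_common_substring_naive("banana", "bandana"))  # "ban"
```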
search(pattern)
Search for a pattern in the text.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| pattern | str | Pattern to search for | required |
Returns:
| Type | Description |
|---|---|
| bool | True if pattern exists, False otherwise |
size()
Get the number of nodes in the suffix tree.
Returns:
| Type | Description |
|---|---|
| int | Number of nodes in the tree |
to_trie_structure()
Convert the suffix tree's suffixes into a trie structure.
This demonstrates the relationship between suffix trees and tries, where a suffix tree's suffixes can be viewed as words in a trie.
Returns:
| Type | Description |
|---|---|
| Trie | Trie containing all suffixes as words |
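The suffixes-as-trie relationship can be illustrated with a plain nested dict instead of the library's Trie class (the function and the "$" end-of-word marker are assumptions made for this sketch):

```python
def suffixes_to_trie(text: str) -> dict:
    # Insert every suffix into a nested-dict trie; "$" marks the end of a suffix.
    root: dict = {}
    for i in range(len(text)):
        node = root
        for ch in text[i:]:
            node = node.setdefault(ch, {})
        node["$"] = {}
    return root

trie = suffixes_to_trie("aba")
print(sorted(trie))  # ['a', 'b'] — suffixes "aba", "ba", "a" share first letters
```

Unlike the suffix tree, this trie is uncompressed (one node per character), which is exactly the space cost that edge compression avoids.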
SuffixTreeNode(start=-1, end=None)
Bases: SuffixTreeNodeInterface
A node in the suffix tree structure.
Each node represents a substring (edge) and maintains references to child nodes and suffix links for efficient traversal.
Initialize a suffix tree node.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| start | int | Starting index of the substring in the original text | -1 |
| end | Optional[int] | Ending index of the substring (None for leaf nodes with global end) | None |
__repr__()
Return detailed string representation for debugging.
__str__()
Return string representation of the node.
add_child(char, child)
Add a child node for the given character.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| char | str | First character of the edge leading to the child | required |
| child | SuffixTreeNode | The child node to add | required |
edge_length(text)
Get the length of the edge represented by this node.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| text | str | Original text to calculate length from | required |
Returns:
| Type | Description |
|---|---|
| int | Length of the substring this node represents |
get_child(char)
Get the child node for the given character.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| char | str | Character to look for | required |
Returns:
| Type | Description |
|---|---|
| Optional[SuffixTreeNode] | Child node if exists, None otherwise |
get_children_chars()
Get list of characters for all child edges.
Returns:
| Type | Description |
|---|---|
| List[str] | List of first characters of edges to child nodes |
get_edge_substring(text)
Get the substring represented by this node's edge.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| text | str | Original text | required |
Returns:
| Type | Description |
|---|---|
| str | Substring represented by this edge |
has_child(char)
Check if a child exists for the given character.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| char | str | Character to check | required |
Returns:
| Type | Description |
|---|---|
| bool | True if child exists, False otherwise |
has_children()
Check if this node has any children.
Returns:
| Type | Description |
|---|---|
| bool | True if node has children, False otherwise |
is_leaf()
Check if this is a leaf node.
Returns:
| Type | Description |
|---|---|
| bool | True if leaf node, False otherwise |
remove_child(char)
Remove the child node for the given character.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| char | str | Character of child to remove | required |
Returns:
| Type | Description |
|---|---|
| bool | True if child was removed, False if not found |