Tokenization -- Converting Text to Numbers for Neural Networks

Introduction: Why Tokenization Matters

Rick Hightower

Originally published on Medium.

Introduction: Why Tokenization Matters

Tokenization -- Converting Text to Numbers for Neural Networks

  • Understand how tokenization converts text into numerical representations
  • Compare three major tokenization algorithms: BPE, WordPiece, and Unigram
  • Implement tokenization using Hugging Face’s transformers library
  • Handle common edge cases in production systems
  • Debug tokenization issues effectively
  • Build custom tokenizers for specialized domains
  • This is very hands-on article series, so be sure to clone the github repo, run the examples, and load the notebooks.
  1. Customer Support: A chatbot needs to distinguish between “can’t login” and “cannot log in”
  2. Financial Analysis: A system must recognize “Q4 2023” as one unit, not three
  3. Medical Records: “Myocardial infarction” must stay together to preserve meaning
  • Misunderstood user intent
  • Incorrect data extraction
  • Higher computational costs
  • Reduced model accuracy

Tokenization -- Converting Text to Numbers for Neural Networks

  • Whole words: “cat” → [“cat”]
  • Subwords: “unhappy” → [“un”, “happy”]
  • Characters: “hi” → [“h”, “i”]

Tokenization -- Converting Text to Numbers for Neural Networks

from
 transformers 
import
 AutoTokenizer
import
 logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def
 
demonstrate_basic_tokenization
():
    
"""
    Shows how tokenization converts text to numbers.
    This example uses BERT's tokenizer to process a simple sentence.
    """
    
# Load BERT tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    
# Sample text
    text = 
"Tokenization converts text to numbers."
    
# Tokenize
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.encode(text)
    
# Display results
    logger.info(
f"Original text: 
{text}
"
)
    logger.info(
f"Tokens: 
{tokens}
"
)
    logger.info(
f"Token IDs: 
{token_ids}
"
)
    
# Show token-to-ID mapping
    
for
 token, token_id 
in
 
zip
(tokens, token_ids[
1
:-
1
]):  
# Skip special tokens
        logger.info(
f"  '
{token}
' → 
{token_id}
"
)
    
return
 tokens, token_ids
# Run the demonstration
tokens, ids = demonstrate_basic_tokenization()
  • Used by: GPT, RoBERTa

  • Approach: Frequency-based merging

  • Vocabulary Size: 30k-50k

  • Best For: General text processing

  • Used by: BERT

  • Approach: Likelihood maximization

  • Vocabulary Size: 30k

  • Best For: Multilingual applications

  • Used by: T5, mBART

  • Approach: Probabilistic model

  • Vocabulary Size: 32k-250k

  • Best For: Flexibility in token selection

Tokenization -- Converting Text to Numbers for Neural Networks

def
 
demonstrate_bpe_tokenization
():
    
"""
    Demonstrates BPE tokenization using RoBERTa.
    BPE handles unknown words by breaking them into known subwords.
    """
    tokenizer = AutoTokenizer.from_pretrained(
'roberta-base'
)
    
# Test words showing BPE behavior
    test_words = [
        
"tokenization"
,      
# Common word
        
"pretokenization"
,   
# Compound word
        
"cryptocurrency"
,    
# Technical term
        
"antidisestablish"
   
# Rare word
    ]
    logger.info(
"=== BPE Tokenization (RoBERTa) ==="
)
    
for
 word 
in
 test_words:
        tokens = tokenizer.tokenize(word)
        ids = tokenizer.encode(word, add_special_tokens=
False
)
        logger.info(
f"\\n'
{word}
':"
)
        logger.info(
f"  Tokens: 
{tokens}
"
)
        logger.info(
f"  Count: 
{
len
(tokens)}
"
)
        
# Show how BPE splits the word
        
if
 
len
(tokens) > 
1
:
            logger.info(
f"  Split pattern: 
{
' + '
.join(tokens)}
"
)
    
return
 tokenizer
def
 
demonstrate_wordpiece_tokenization
():
    
"""
    Shows WordPiece tokenization used by BERT.
    Note the ## prefix marking word continuations.
    """
    tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    
# Same test words for comparison
    test_words = [
        
"tokenization"
,
        
"pretokenization"
,
        
"cryptocurrency"
,
        
"antidisestablish"
    ]
    logger.info(
"\\n=== WordPiece Tokenization (BERT) ==="
)
    
for
 word 
in
 test_words:
        tokens = tokenizer.tokenize(word)
        logger.info(
f"\\n'
{word}
':"
)
        logger.info(
f"  Tokens: 
{tokens}
"
)
        
# Explain ## notation
        
if
 
any
(t.startswith(
'##'
) 
for
 t 
in
 tokens):
            logger.info(
"  Note: ## indicates continuation of previous token"
)
            
# Reconstruct word from pieces
            reconstructed = tokens[
0
]
            
for
 token 
in
 tokens[
1
:]:
                reconstructed += token.replace(
'##'
, 
''
)
            logger.info(
f"  Reconstructed: 
{reconstructed}
"
)
    
return
 tokenizer

Tokenization -- Converting Text to Numbers for Neural Networks

# requirements.txt
transformers
==
4.36
.
0
torch
==
2.1
.
0
tokenizers
==
0.15
.
0
datasets
==
2.16
.
0
class
 
TokenizationPipeline
:
    
"""
    Production-ready tokenization pipeline with error handling.
    Supports batch processing and various output formats.
    """
    
def
 
__init__
(
self, model_name=
'bert-base-uncased'
, max_length=
512
):
        
"""
        Initialize tokenizer with specified model.
        Parameters:
        -----------
        model_name : str
            Hugging Face model identifier
        max_length : int
            Maximum sequence length
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length
        logger.info(
f"Initialized tokenizer: 
{model_name}
"
)
    
def
 
tokenize_single
(
self, text, return_offsets=
False
):
        
"""
        Tokenize a single text string.
        Parameters:
        -----------
        text : str
            Input text to tokenize
        return_offsets : bool
            Whether to return character offset mappings
        Returns:
        --------
        dict : Tokenization results including input_ids, attention_mask
        """
        
if
 
not
 text:
            logger.warning(
"Empty text provided"
)
            text = 
""
        
try
:
            encoding = self.tokenizer(
                text,
                truncation=
True
,
                max_length=self.max_length,
                padding=
'max_length'
,
                return_offsets_mapping=return_offsets,
                return_tensors=
'pt'
            )
            logger.info(
f"Tokenized 
{
len
(text)}
 chars into 
{encoding[
'input_ids'
].shape[
1
]}
 tokens"
)
            
return
 encoding
        
except
 Exception 
as
 e:
            logger.error(
f"Tokenization failed: 
{
str
(e)}
"
)
            
raise
def
 
tokenize_batch
(
self, texts, show_progress=
True
):
    
"""
    Efficiently tokenize multiple texts.
    Parameters:
    -----------
    texts : list of str
        Input texts to process
    show_progress : bool
        Display progress information
    Returns:
    --------
    dict : Batched tokenization results
    """
    
if
 
not
 texts:
        logger.warning(
"Empty text list provided"
)
        
return
 
None
    
# Clean texts
    texts = [text 
if
 text 
else
 
""
 
for
 text 
in
 texts]
    
# Process in batches for memory efficiency
    batch_size = 
32
    all_encodings = []
    
for
 i 
in
 
range
(
0
, 
len
(texts), batch_size):
        batch = texts[i:i + batch_size]
        
if
 show_progress:
            logger.info(
f"Processing batch 
{i//batch_size + 
1
}
/
{(
len
(texts)-
1
)//batch_size + 
1
}
"
)
        encoding = self.tokenizer(
            batch,
            truncation=
True
,
            max_length=self.max_length,
            padding=
True
,
            return_tensors=
'pt'
        )
        all_encodings.append(encoding)
    
# Combine batches
    combined = {
        key: torch.cat([e[key] 
for
 e 
in
 all_encodings], dim=
0
)
        
for
 key 
in
 all_encodings[
0
].keys()
    }
    logger.info(
f"Tokenized 
{
len
(texts)}
 texts"
)
    
return
 combined
def
 
demonstrate_special_tokens
():
    
"""
    Shows how special tokens frame and separate sequences.
    Essential for tasks like question-answering and classification.
    """
    tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    
# Single sequence
    text1 = 
"What is tokenization?"
    encoding1 = tokenizer(text1)
    tokens1 = tokenizer.convert_ids_to_tokens(encoding1[
'input_ids'
])
    logger.info(
"=== Special Tokens in Single Sequence ==="
)
    logger.info(
f"Text: 
{text1}
"
)
    logger.info(
f"Tokens: 
{tokens1}
"
)
    logger.info(
f"[CLS] at position 0: Marks sequence start"
)
    logger.info(
f"[SEP] at position 
{
len
(tokens1)-
1
}
: Marks sequence end"
)
    
# Sequence pair (for QA tasks)
    question = 
"What is tokenization?"
    context = 
"Tokenization converts text into tokens."
    encoding2 = tokenizer(question, context)
    tokens2 = tokenizer.convert_ids_to_tokens(encoding2[
'input_ids'
])
    type_ids = encoding2[
'token_type_ids'
]
    logger.info(
"\\n=== Special Tokens in Sequence Pair ==="
)
    logger.info(
f"Question: 
{question}
"
)
    logger.info(
f"Context: 
{context}
"
)
    
# Find separator positions
    sep_positions = [i 
for
 i, token 
in
 
enumerate
(tokens2) 
if
 token == 
'[SEP]'
]
    logger.info(
f"[SEP] positions: 
{sep_positions}
"
)
    logger.info(
f"Question tokens: positions 1 to 
{sep_positions[
0
]-
1
}
"
)
    logger.info(
f"Context tokens: positions 
{sep_positions[
0
]+
1
}
 to 
{sep_positions[
1
]-
1
}
"
)
    
return
 tokens1, tokens2
  • [CLS]: Classification token - Aggregates sequence meaning
  • [SEP]: Separator token - Marks boundaries between sequences
  • [PAD]: Padding token - Fills shorter sequences to match batch length
  • [UNK]: Unknown token - Replaces out-of-vocabulary words
  • [MASK]: Masking token - Used in masked language modeling
def
 
demonstrate_offset_mapping
():
    
"""
    Shows how offset mapping links tokens back to source text.
    Critical for named entity recognition and text highlighting.
    """
    tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    text = 
"Apple Inc. was founded by Steve Jobs in Cupertino."
    encoding = tokenizer(
        text,
        return_offsets_mapping=
True
,
        add_special_tokens=
True
    )
    tokens = tokenizer.convert_ids_to_tokens(encoding[
'input_ids'
])
    offsets = encoding[
'offset_mapping'
]
    logger.info(
"=== Token to Character Mapping ==="
)
    logger.info(
f"Original: 
{text}
\\n"
)
    
# Create visual alignment
    logger.info(
"Token → Original Text [Start:End]"
)
    logger.info(
"-"
 * 
40
)
    
for
 token, (start, end) 
in
 
zip
(tokens, offsets):
        
if
 start == end:  
# Special token
            logger.info(
f"
{token:
12
}
 → [SPECIAL]"
)
        
else
:
            original = text[start:end]
            logger.info(
f"
{token:
12
}
 → '
{original}
' [
{start}
:
{end}
]"
)
    
# Demonstrate entity extraction
    entity_tokens = [
2
, 
3
]  
# "apple inc"
    logger.info(
f"\\nExtracting entity from tokens 
{entity_tokens}
:"
)
    start_char = offsets[entity_tokens[
0
]][
0
]
    end_char = offsets[entity_tokens[-
1
]][
1
]
    entity = text[start_char:end_char]
    logger.info(
f"Extracted: '
{entity}
'"
)
    
return
 encoding
  1. Preserves exact character positions
  2. Enables highlighting in source text
  3. Supports entity extraction
  4. Maintains alignment through tokenization
def
 
benchmark_tokenization_methods
():
    
"""
    Compares performance of different tokenization approaches.
    Shows impact of batching and fast tokenizers.
    """
    
import
 time
    
# Create test corpus
    texts = [
"This is a sample sentence for benchmarking."
] * 
1000
    
# Method 1: Individual tokenization
    tokenizer_slow = AutoTokenizer.from_pretrained(
'bert-base-uncased'
, use_fast=
False
)
    start = time.time()
    
for
 text 
in
 texts:
        _ = tokenizer_slow(text)
    individual_time = time.time() - start
    
# Method 2: Batch tokenization
    start = time.time()
    _ = tokenizer_slow(texts, padding=
True
, truncation=
True
)
    batch_time = time.time() - start
    
# Method 3: Fast tokenizer
    tokenizer_fast = AutoTokenizer.from_pretrained(
'bert-base-uncased'
, use_fast=
True
)
    start = time.time()
    _ = tokenizer_fast(texts, padding=
True
, truncation=
True
)
    fast_time = time.time() - start
    logger.info(
"=== Performance Comparison ==="
)
    logger.info(
f"Individual processing: 
{individual_time:
.2
f}
s"
)
    logger.info(
f"Batch processing: 
{batch_time:
.2
f}
s (
{individual_time/batch_time:
.1
f}
x faster)"
)
    logger.info(
f"Fast tokenizer: 
{fast_time:
.2
f}
s (
{batch_time/fast_time:
.1
f}
x faster than batch)"
)
    
return
 {
        
'individual'
: individual_time,
        
'batch'
: batch_time,
        
'fast'
: fast_time
    }
  1. Use Fast Tokenizers: Rust-based implementation offers 5–10x speedup.
  2. Batch Processing: Reduces overhead significantly.
  3. Precompute When Possible: Cache tokenized results.
  4. Optimize Padding: Use dynamic padding to reduce wasted computation.
def
 
detect_tokenizer_mismatch
():
    
"""
    Demonstrates problems from using wrong tokenizer with model.
    Shows how to verify compatibility.
    """
    
from
 transformers 
import
 AutoModel
    
# Intentional mismatch
    tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    model = AutoModel.from_pretrained(
'roberta-base'
)
    text = 
"This demonstrates tokenizer mismatch."
    
try
:
        inputs = tokenizer(text, return_tensors=
'pt'
)
        outputs = model(**inputs)
        logger.warning(
"Model processed mismatched inputs - results unreliable!"
)
    
except
 Exception 
as
 e:
        logger.error(
f"Mismatch error: 
{e}
"
)
    
# Correct approach
    logger.info(
"\\n=== Correct Matching ==="
)
    model_name = 
'roberta-base'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors=
'pt'
)
    outputs = model(**inputs)
    logger.info(
f"Success! Output shape: 
{outputs.last_hidden_state.shape}
"
)
def
 
handle_long_documents
():
    
"""
    Strategies for documents exceeding token limits.
    Shows truncation and sliding window approaches.
    """
    tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    max_length = 
512
    
# Create long document
    long_doc = 
" "
.join([
"This is a sentence."
] * 
200
)
    
# Strategy 1: Simple truncation
    truncated = tokenizer(
        long_doc,
        max_length=max_length,
        truncation=
True
,
        return_tensors=
'pt'
    )
    logger.info(
f"Document length: 
{
len
(long_doc)}
 chars"
)
    logger.info(
f"Truncated to: 
{truncated[
'input_ids'
].shape[
1
]}
 tokens"
)
    
# Strategy 2: Sliding window
    stride = 
256
    chunks = []
    tokens = tokenizer.tokenize(long_doc)
    
for
 i 
in
 
range
(
0
, 
len
(tokens), stride):
        chunk = tokens[i:i + max_length - 
2
]  
# Reserve space for special tokens
        chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
        chunk_ids = [tokenizer.cls_token_id] + chunk_ids + [tokenizer.sep_token_id]
        chunks.append(chunk_ids)
    logger.info(
f"\\nSliding window created 
{
len
(chunks)}
 chunks"
)
    logger.info(
f"Overlap: 
{max_length - stride}
 tokens between chunks"
)
    
return
 chunks
  1. Truncation: Fast but loses information
  2. Sliding Window: Preserves all content with overlap
  3. Hierarchical: Process sections separately then combine
  4. Summarization: Reduce content before tokenization
class
 
TokenizationDebugger
:
    
"""
    Comprehensive debugging tools for tokenization issues.
    """
    
def
 
__init__
(
self, tokenizer
):
        self.tokenizer = tokenizer
    
def
 
analyze_text
(
self, text
):
        
"""
        Detailed analysis of tokenization results.
        Parameters:
        -----------
        text : str
            Text to analyze
        """
        logger.info(
f"\\n=== Analyzing: '
{text}
' ==="
)
        
# Basic tokenization
        tokens = self.tokenizer.tokenize(text)
        token_ids = self.tokenizer.encode(text)
        
# Get special token info
        special_tokens = {
            
'PAD'
: self.tokenizer.pad_token_id,
            
'UNK'
: self.tokenizer.unk_token_id,
            
'CLS'
: self.tokenizer.cls_token_id,
            
'SEP'
: self.tokenizer.sep_token_id
        }
        
# Analysis results
        logger.info(
f"Character count: 
{
len
(text)}
"
)
        logger.info(
f"Token count: 
{
len
(tokens)}
"
)
        logger.info(
f"Compression ratio: 
{
len
(text)/
len
(tokens):
.2
f}
 chars/token"
)
        
# Check for unknown tokens
        unk_count = tokens.count(self.tokenizer.unk_token)
        
if
 unk_count > 
0
:
            logger.warning(
f"Found 
{unk_count}
 unknown tokens!"
)
            unk_positions = [i 
for
 i, t 
in
 
enumerate
(tokens) 
if
 t == self.tokenizer.unk_token]
            logger.warning(
f"Unknown token positions: 
{unk_positions}
"
)
        
# Display token breakdown
        logger.info(
"\\nToken Breakdown:"
)
        
for
 i, (token, token_id) 
in
 
enumerate
(
zip
(tokens, token_ids[
1
:-
1
])):
            special = 
""
            
for
 name, special_id 
in
 special_tokens.items():
                
if
 token_id == special_id:
                    special = 
f" [
{name}
]"
            logger.info(
f"  
{i}
: '
{token}
' → 
{token_id}
{special}
"
)
        
return
 {
            
'tokens'
: tokens,
            
'token_ids'
: token_ids,
            
'char_count'
: 
len
(text),
            
'token_count'
: 
len
(tokens),
            
'unk_count'
: unk_count
        }
    
def
 
compare_tokenizers
(
self, text, tokenizer_names
):
        
"""
        Compare how different tokenizers handle the same text.
        """
        results = {}
        logger.info(
f"\\n=== Comparing Tokenizers on: '
{text}
' ==="
)
        
for
 name 
in
 tokenizer_names:
            tokenizer = AutoTokenizer.from_pretrained(name)
            tokens = tokenizer.tokenize(text)
            results[name] = {
                
'tokens'
: tokens,
                
'count'
: 
len
(tokens)
            }
            logger.info(
f"\\n
{name}
:"
)
            logger.info(
f"  Tokens: 
{tokens}
"
)
            logger.info(
f"  Count: 
{
len
(tokens)}
"
)
        
return
 results
  • [ ] Verify tokenizer matches model
  • [ ] Check for excessive unknown tokens
  • [ ] Monitor sequence lengths
  • [ ] Validate special token handling
  • [ ] Test edge cases (empty strings, special characters)
  • [ ] Compare against expected output
def
 
train_custom_medical_tokenizer
():
    
"""
    Trains a tokenizer optimized for medical text.
    Reduces fragmentation of medical terms.
    """
    
from
 tokenizers 
import
 Tokenizer, models, trainers, pre_tokenizers
    
# Medical corpus (in practice, use larger dataset)
    medical_texts = [
        
"Patient presents with acute myocardial infarction."
,
        
"Diagnosis: Type 2 diabetes mellitus with neuropathy."
,
        
"Prescribed metformin 500mg twice daily."
,
        
"MRI shows L4-L5 disc herniation with radiculopathy."
,
        
"Post-operative recovery following cholecystectomy."
,
        
"Chronic obstructive pulmonary disease exacerbation."
,
        
"Administered epinephrine for anaphylactic reaction."
,
        
"ECG reveals atrial fibrillation with rapid ventricular response."
    ]
    
# Initialize BPE tokenizer
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    
# Configure trainer
    trainer = trainers.BpeTrainer(
        vocab_size=
10000
,
        special_tokens=[
"[PAD]"
, 
"[UNK]"
, 
"[CLS]"
, 
"[SEP]"
, 
"[MASK]"
],
        min_frequency=
2
    )
    
# Train on medical corpus
    tokenizer.train_from_iterator(medical_texts, trainer=trainer)
    
# Test on medical terms
    test_terms = [
        
"myocardial infarction"
,
        
"cholecystectomy"
,
        
"pneumonia"
,
        
"diabetes mellitus"
    ]
    logger.info(
"=== Custom Medical Tokenizer Results ==="
)
    
for
 term 
in
 test_terms:
        encoding = tokenizer.encode(term)
        logger.info(
f"\\n'
{term}
':"
)
        logger.info(
f"  Tokens: 
{encoding.tokens}
"
)
        logger.info(
f"  IDs: 
{encoding.ids}
"
)
    
return
 tokenizer
  1. Better Coverage: Keeps domain terms intact
  2. Smaller Vocabulary: Focused on relevant terms
  3. Improved Accuracy: Better representation of domain language
  4. Reduced Tokens: More efficient processing
def
 
compare_medical_tokenization
():
    
"""
    Shows advantage of domain-specific tokenization.
    """
    
# Generic tokenizer
    generic = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    
# Medical terms that generic tokenizers fragment
    medical_terms = [
        
"pneumonoultramicroscopicsilicovolcanoconiosis"
,
        
"electroencephalography"
,
        
"thrombocytopenia"
,
        
"gastroesophageal"
    ]
    logger.info(
"=== Generic vs Domain Tokenization ==="
)
    
for
 term 
in
 medical_terms:
        generic_tokens = generic.tokenize(term)
        logger.info(
f"\\n'
{term}
':"
)
        logger.info(
f"  Generic: 
{generic_tokens}
 (
{
len
(generic_tokens)}
 tokens)"
)
        
# Custom tokenizer would show fewer tokens
        
# Calculate efficiency loss
        
if
 
len
(generic_tokens) > 
3
:
            logger.warning(
f"  ⚠️ Excessive fragmentation: 
{
len
(generic_tokens)}
 pieces"
)
def
 
handle_edge_cases
():
    
"""
    Demonstrates handling of problematic text inputs.
    """
    tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    edge_cases = {
        
"Empty string"
: 
""
,
        
"Only spaces"
: 
"     "
,
        
"Mixed languages"
: 
"Hello 世界 Bonjour"
,
        
"Emojis"
: 
"Great job! 👍🎉"
,
        
"Code"
: 
"def func(x): return x**2"
,
        
"URLs"
: 
"Visit <https://example.com/page>"
,
        
"Special chars"
: 
"Price: $99.99 (↑15%)"
,
        
"Long word"
: 
"a"
 * 
100
    }
    logger.info(
"=== Edge Case Handling ==="
)
    
for
 case_name, text 
in
 edge_cases.items():
        logger.info(
f"\\n
{case_name}
: '
{text[:
50
]}
{
'...'
 
if
 
len
(text) > 
50
 
else
 
''
}
'"
)
        
try
:
            tokens = tokenizer.tokenize(text)
            encoding = tokenizer(text, add_special_tokens=
True
)
            logger.info(
f"  Success: 
{
len
(tokens)}
 tokens"
)
            
# Check for issues
            
if
 
not
 tokens 
and
 text:
                logger.warning(
"  ⚠️ No tokens produced from non-empty text"
)
            
if
 tokenizer.unk_token 
in
 tokens:
                unk_count = tokens.count(tokenizer.unk_token)
                logger.warning(
f"  ⚠️ Contains 
{unk_count}
 unknown tokens"
)
        
except
 Exception 
as
 e:
            logger.error(
f"  ❌ Error: 
{
str
(e)}
"
)
  1. Empty/Whitespace: Return empty token list or pad token

  2. Mixed Scripts: May produce unknown tokens

  3. Emojis: Handled differently by each tokenizer

  4. URLs/Emails: Often split incorrectly

  5. Very Long Words: May exceed token limits

  6. Tokenization bridges text and neural networks — It’s the critical first step that determines model performance

  7. Algorithm choice matters — BPE, WordPiece, and Unigram each have strengths for different applications

  8. Always match tokenizer and model — Mismatches cause silent failures and poor results

  9. Special tokens provide structure — [CLS], [SEP], and others help models understand sequences

  10. Production requires optimization — Use fast tokenizers and batch processing for efficiency

  • [ ] Use the same tokenizer for training and inference
  • [ ] Handle edge cases gracefully (empty strings, special characters)
  • [ ] Implement proper error handling and logging
  • [ ] Optimize for your production constraints (speed vs accuracy)
  • [ ] Test with real-world data, including edge cases
  • [ ] Monitor tokenization metrics (unknown token rate, sequence lengths)
  • [ ] Consider domain-specific tokenizers for specialized applications
# Standard setup
from
 transformers 
import
 AutoTokenizer
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
# Basic usage
tokens = tokenizer.tokenize(
"Hello world"
)
encoding = tokenizer(
"Hello world"
, return_tensors=
'pt'
)
# Production usage
encoding = tokenizer(
    texts,                    
# List of strings
    padding=
True
,            
# Pad to same length
    truncation=
True
,         
# Truncate to max_length
    max_length=
512
,         
# Maximum sequence length
    return_tensors=
'pt'
,    
# Return PyTorch tensors
    return_attention_mask=
True
,  
# Return attention masks
    return_offsets_mapping=
True
  
# For NER tasks
)
# Access results
input_ids = encoding[
'input_ids'
]
attention_mask = encoding[
'attention_mask'
]
  1. Experiment with different tokenizers on your data

  2. Measure tokenization metrics for your use case

  3. Build custom tokenizers if needed

  4. Integrate with your model pipeline

  5. Monitor production performance

  6. Hugging Faces Transformers and the AI Revolution (Article 1)

  7. Hugging Faces: Why Language is Hard for AI? How Transformers Changed that (Article 2)

  8. Hands-On with Hugging Face: Building Your AI Workspace (Article 3)

  9. Inside the Transformer: Architecture and Attention Demystified (Article 4)

  • Python 3.12 (managed via pyenv).
  • Poetry for dependency management.
  • Go Task for build automation.
  • API keys for any required services (see .env.example).
  1. Clone this repository:
git 
clone
 [email protected]:RichardHightower/art_hug_05.git 
cd
 art_hug_05
task setup
.
├── src/
│   ├── __init__.py
│   ├── config.py              
# Configuration and utilities
│   ├── main.py                
# Entry point with all examples
│   ├── tokenization_examples.py       
# Basic tokenization examples
│   ├── tokenization_algorithms.py     
# BPE, WordPiece, and Unigram comparison
│   ├── custom_tokenization.py         
# Training custom tokenizers
│   ├── tokenization_debugging.py      
# Debugging and visualization tools
│   ├── multimodal_tokenization.py     
# Image and CLIP tokenization
│   ├── advanced_tokenization.py       
# Advanced tokenization techniques
│   ├── model_loading.py               
# Model loading examples
│   └── utils.py               
# Utility functions
├── tests/
│   └── test_examples.py       
# Unit tests
├── .env.example               
# Environment template
├── Taskfile.yml               
# Task automation
└── pyproject.toml             
# Poetry configuration
task run
task run-tokenization          
# Run basic tokenization examples
task run-algorithms            
# Run tokenization algorithms comparison
task run-custom                
# Run custom tokenizer training
task run-debugging             
# Run tokenization debugging tools
task run-multimodal            
# Run multimodal tokenization
task run-advanced              
# Run advanced tokenization techniques
task run-model-loading         
# Run model loading examples
task notebook
  • Create interactive notebooks for experimentation

  • Run code cells step by step

  • Visualize tokenization results

  • Test different tokenizers interactively

  • task setup - Set up Python environment and install dependencies

  • task run - Run all examples

  • task run-tokenization - Run basic tokenization examples

  • task run-algorithms - Run algorithm comparison examples

  • task run-custom - Run custom tokenizer training

  • task run-debugging - Run debugging and visualization tools

  • task run-multimodal - Run multimodal tokenization examples

  • task run-advanced - Run advanced tokenization techniques

  • task run-model-loading - Run model loading examples

  • task notebook - Launch Jupyter notebook server

  • task test - Run unit tests

  • task format - Format code with Black and Ruff

  • task lint - Run linting checks (Black, Ruff, mypy)

  • task clean - Clean up generated files

  1. Using Homebrew (Recommended):
brew install pyenv
pyenv install 
3.12
.0
 pyenv 
global
 
3.12
.0
python 
--version
  1. Download the installer from Python.org
  2. Run the installer and ensure you check “Add Python to PATH”
  3. Open Command Prompt and verify installation:
python 
--version
pip install pyenv-win
  1. Install using the official installer:
curl -sSL https://install.python-poetry.org | python3 -
echo
 
'export PATH="$HOME/.poetry/bin:$PATH"'
 >> ~/.zshrc 
source
 ~/.zshrc
  1. Install using PowerShell:
(Invoke-WebRequest -
Uri
 https:
//install.python-poetry.org -UseBasicParsing).Content | python -
poetry 
--version
  1. Using Homebrew:
brew install 
go
-task/tap/
go
-task
task 
--version
  1. Using Scoop:
scoop install 
go
-task
choco install 
go
-task
task 
--version
  • Python not found: Ensure Python is correctly added to your PATH variable
  • Poetry commands not working: Restart your terminal or add the Poetry bin directory to your PATH
  • Task not found: Verify Task installation and PATH settings
  • Dependency errors: Run poetry update to resolve dependency conflicts
% task run-medical
task:
 [run-medical] poetry run python src/medical_tokenization_demo.py
INFO
:__main__
:
🏥 Medical Tokenization Examples
INFO
:__main__
:==================================================
INFO
:__main__
:
=== Generic vs Domain Tokenization ===
INFO
:__main__
:
'pneumonoultramicroscopicsilicovolcanoconiosis'
:
INFO
:__main__
:  
Generic:
 [
'p'
, 
'##ne'
, 
'##um'
, 
'##ono'
, 
'##ult'
, 
'##ram'
, 
'##ic'
, 
'##ros'
, 
'##copic'
, 
'##sil'
, 
'##ico'
, 
'##vo'
, 
'##lc'
, 
'##ano'
, 
'##con'
, 
'##ios'
, 
'##is'
] (
17
 tokens)
WARNING
:__main__
:  ⚠️ Excessive 
fragmentation:
 
17
 pieces
INFO
:__main__
:
'electroencephalography'
:
INFO
:__main__
:  
Generic:
 [
'electro'
, 
'##ence'
, 
'##pha'
, 
'##log'
, 
'##raphy'
] (
5
 tokens)
WARNING
:__main__
:  ⚠️ Excessive 
fragmentation:
 
5
 pieces
INFO
:__main__
:
'thrombocytopenia'
:
INFO
:__main__
:  
Generic:
 [
'th'
, 
'##rom'
, 
'##bo'
, 
'##cy'
, 
'##top'
, 
'##enia'
] (
6
 tokens)
WARNING
:__main__
:  ⚠️ Excessive 
fragmentation:
 
6
 pieces
INFO
:__main__
:
'gastroesophageal'
:
INFO
:__main__
:  
Generic:
 [
'gas'
, 
'##tro'
, 
'##es'
, 
'##op'
, 
'##ha'
, 
'##ge'
, 
'##al'
] (
7
 tokens)
WARNING
:__main__
:  ⚠️ Excessive 
fragmentation:
 
7
 pieces
INFO
:__main__
:
=== MedCPT Biomedical Text Encoder Example ===
INFO
:__main__
:Loading
 MedCPT Article Encoder...
INFO
:__main__
:
Embedding 
shape:
 torch.Size([
3
, 
768
])
INFO
:__main__
:Embedding
 
dimension:
 
768
INFO
:__main__
:
=== MedCPT Tokenization of Medical Terms ===
INFO
:__main__
:
'diabetes insipidus'
:
INFO
:__main__
:  
Tokens:
 [
'diabetes'
, 
'ins'
, 
'##ip'
, 
'##idus'
] (
4
 tokens)
INFO
:__main__
:
'vasopressinergic neurons'
:
INFO
:__main__
:  
Tokens:
 [
'vasopressin'
, 
'##ergic'
, 
'neurons'
] (
3
 tokens)
INFO
:__main__
:
'hypothalamic destruction'
:
INFO
:__main__
:  
Tokens:
 [
'hypothalamic'
, 
'destruction'
] (
2
 tokens)
INFO
:__main__
:
'polyuria and polydipsia'
:
INFO
:__main__
:  
Tokens:
 [
'poly'
, 
'##uria'
, 
'and'
, 
'polyd'
, 
'##ips'
, 
'##ia'
] (
6
 tokens)
INFO
:__main__
:
=== Comparison with Generic 
BERT
 ===
INFO
:__main__
:
'diabetes insipidus'
:
INFO
:__main__
:  
MedCPT:
 
4
 tokens
INFO
:__main__
:  Generic 
BERT
: 
5
 tokens
INFO
:__main__
:  ✅ MedCPT is 
1
 tokens more efficient
INFO
:__main__
:
'vasopressinergic neurons'
:
INFO
:__main__
:  
MedCPT:
 
3
 tokens
INFO
:__main__
:  Generic 
BERT
: 
6
 tokens
INFO
:__main__
:  ✅ MedCPT is 
3
 tokens more efficient
INFO
:__main__
:
'hypothalamic destruction'
:
INFO
:__main__
:  
MedCPT:
 
2
 tokens
INFO
:__main__
:  Generic 
BERT
: 
6
 tokens
INFO
:__main__
:  ✅ MedCPT is 
4
 tokens more efficient
INFO
:__main__
:
'polyuria and polydipsia'
:
INFO
:__main__
:  
MedCPT:
 
6
 tokens
INFO
:__main__
:  Generic 
BERT
: 
7
 tokens
INFO
:__main__
:  ✅ MedCPT is 
1
 tokens more efficient
INFO
:__main__
:
✅ Medical tokenization examples completed!
"""
Medical Tokenization Demo
Standalone script to run medical tokenization examples
"""
from
 transformers 
import
 AutoTokenizer, AutoModel
import
 torch
import
 logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def
 
compare_medical_tokenization
():
    
"""Shows advantage of domain-specific tokenization."""
    
# Generic tokenizer
    generic = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
    
# Medical terms that generic tokenizers fragment
    medical_terms = [
        
"pneumonoultramicroscopicsilicovolcanoconiosis"
,
        
"electroencephalography"
,
        
"thrombocytopenia"
,
        
"gastroesophageal"
    ]
    logger.info(
"\n=== Generic vs Domain Tokenization ==="
)
    
for
 term 
in
 medical_terms:
        generic_tokens = generic.tokenize(term)
        logger.info(
f"\n'
{term}
':"
)
        logger.info(
f"  Generic: 
{generic_tokens}
 (
{
len
(generic_tokens)}
 tokens)"
)
        
# Custom tokenizer would show fewer tokens
        
# Calculate efficiency loss
        
if
 
len
(generic_tokens) > 
3
:
            logger.warning(
f"  ⚠️ Excessive fragmentation: 
{
len
(generic_tokens)}
 pieces"
)
def
 
medcpt_encoder_example
():
    
"""Demonstrates MedCPT encoder for biomedical text embeddings."""
    logger.info(
"\n=== MedCPT Biomedical Text Encoder Example ==="
)
    
    
try
:
        
# Load MedCPT Article Encoder
        logger.info(
"Loading MedCPT Article Encoder..."
)
        model = AutoModel.from_pretrained(
"ncbi/MedCPT-Article-Encoder"
)
        tokenizer = AutoTokenizer.from_pretrained(
"ncbi/MedCPT-Article-Encoder"
)
        
        
# Example medical articles
        articles = [
            [
                
"Diagnosis and Management of Central Diabetes Insipidus in Adults"
,
                
"Central diabetes insipidus (CDI) is a clinical syndrome which results from loss or impaired function of vasopressinergic neurons in the hypothalamus/posterior pituitary, resulting in impaired synthesis and/or secretion of arginine vasopressin (AVP)."
,
            ],
            [
                
"Adipsic diabetes insipidus"
,
                
"Adipsic diabetes insipidus (ADI) is a rare but devastating disorder of water balance with significant associated morbidity and mortality. Most patients develop the disease as a result of hypothalamic destruction from a variety of underlying etiologies."
,
            ],
            [
                
"Nephrogenic diabetes insipidus: a comprehensive overview"
,
                
"Nephrogenic diabetes insipidus (NDI) is characterized by the inability to concentrate urine that results in polyuria and polydipsia, despite having normal or elevated plasma concentrations of arginine vasopressin (AVP)."
,
            ],
        ]
        
        
# Format articles for the model
        formatted_articles = [
f"
{title}
. 
{abstract}
"
 
for
 title, abstract 
in
 articles]
        
        
with
 torch.no_grad():
            
# Tokenize the articles
            encoded = tokenizer(
                formatted_articles, 
                truncation=
True
, 
                padding=
True
, 
                return_tensors=
'pt'
, 
                max_length=
512
,
            )
            
            
# Encode the articles
            embeds = model(**encoded).last_hidden_state[:, 
0
, :]
            
            logger.info(
f"\nEmbedding shape: 
{embeds.shape}
"
)
            logger.info(
f"Embedding dimension: 
{embeds.shape[
1
]}
"
)
            
            
# Show tokenization comparison for medical terms
            logger.info(
"\n=== MedCPT Tokenization of Medical Terms ==="
)
            
            medical_terms = [
                
"diabetes insipidus"
,
                
"vasopressinergic neurons"
,
                
"hypothalamic destruction"
,
                
"polyuria and polydipsia"
            ]
            
            
for
 term 
in
 medical_terms:
                tokens = tokenizer.tokenize(term)
                logger.info(
f"\n'
{term}
':"
)
                logger.info(
f"  Tokens: 
{tokens}
 (
{
len
(tokens)}
 tokens)"
)
            
            
# Compare with generic BERT tokenizer
            generic = AutoTokenizer.from_pretrained(
'bert-base-uncased'
)
            logger.info(
"\n=== Comparison with Generic BERT ==="
)
            
            
for
 term 
in
 medical_terms:
                medcpt_tokens = tokenizer.tokenize(term)
                generic_tokens = generic.tokenize(term)
                
                logger.info(
f"\n'
{term}
':"
)
                logger.info(
f"  MedCPT: 
{
len
(medcpt_tokens)}
 tokens"
)
                logger.info(
f"  Generic BERT: 
{
len
(generic_tokens)}
 tokens"
)
                
                
if
 
len
(generic_tokens) > 
len
(medcpt_tokens):
                    logger.info(
f"  ✅ MedCPT is 
{
len
(generic_tokens) - 
len
(medcpt_tokens)}
 tokens more efficient"
)
                    
    
except
 Exception 
as
 e:
        logger.error(
f"Error loading MedCPT model: 
{e}
"
)
        logger.info(
"Install with: pip install transformers torch"
)
        logger.info(
"Note: MedCPT model requires downloading ~440MB"
)
def
 
main
():
    
"""Run medical tokenization examples."""
    logger.info(
"🏥 Medical Tokenization Examples"
)
    logger.info(
"="
 * 
50
)
    
    
# Run generic vs domain comparison
    compare_medical_tokenization()
    
    
# Run MedCPT encoder example
    medcpt_encoder_example()
    
    logger.info(
"\n✅ Medical tokenization examples completed!"
)
if
 __name__ == 
"__main__"
:
    main()
  • Generic vs. Domain Tokenization Comparison: Shows how a standard tokenizer breaks down complex medical terms into many small pieces (tokens)

  • MedCPT Encoder Example: Demonstrates a specialized medical text encoder model that better understands medical terminology

  • Comparison Between Tokenizers: Directly compares how many tokens are needed for the same medical phrases using both tokenizers

  • Fewer tokens means more efficient processing (saves time and computing resources).

  • Better tokenization leads to better understanding of the text’s meaning.

  • Specialized models can handle longer medical texts within token limits.

  • A general-purpose one called “bert-base-uncased” that works for everyday language.

  • A specialized medical one called “MedCPT-Article-Encoder” trained specifically on medical texts.

Tokenization -- Converting Text to Numbers for Neural Networks

Tokenization -- Converting Text to Numbers for Neural Networks

#Tokenization #Converting #Text #Numbers #Neural #Networks