<!--
  Provenance note (from repository listing): Move 6 guide pages from Guides/
  to ToolsnToys/ root; fix back-links. Add edu-toys.html (museum-style iframe
  exhibit for 4 legacy edu toy pages). Add 4 edu toy artifacts, dendritic
  curio, docker-cheatsheet-enhanced. Wire foss-tools, guides, edu-toys, and
  dendritic hrefs in toolsntoys.html.
  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
  (listing chrome: 261 lines, 8.8 KiB, HTML, executable file)
-->
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Tokenization Demo</title>
  <style>
    /* ---- Page scaffold ---- */
    body {
      font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
      max-width: 900px;
      margin: 40px auto;
      padding: 20px;
      background: #f5f5f5;
    }
    .container {
      background: white;
      padding: 30px;
      border-radius: 12px;
      box-shadow: 0 2px 10px rgba(0,0,0,0.1);
    }

    /* ---- Headings ---- */
    h1 {
      color: #333;
      margin-bottom: 10px;
    }
    .subtitle {
      color: #666;
      margin-bottom: 30px;
      font-style: italic;
    }

    /* ---- Input area ---- */
    textarea {
      width: 100%;
      min-height: 120px;
      padding: 15px;
      font-size: 16px;
      border: 2px solid #ddd;
      border-radius: 8px;
      font-family: inherit;
      resize: vertical;
      box-sizing: border-box;
    }
    /* Outline is replaced by a visible border-color change on focus. */
    textarea:focus {
      outline: none;
      border-color: #4CAF50;
    }

    /* ---- Token output ---- */
    .output {
      margin-top: 30px;
      padding: 20px;
      background: #f9f9f9;
      border-radius: 8px;
      min-height: 100px;
    }
    .token {
      display: inline-block;
      padding: 6px 12px;
      margin: 4px;
      background: #4CAF50;
      color: white;
      border-radius: 6px;
      font-family: 'Courier New', monospace;
      font-size: 14px;
    }
    /* Whitespace/punctuation tokens are highlighted in orange. */
    .token.special {
      background: #FF9800;
    }

    /* ---- Statistics panel ---- */
    .stats {
      margin-top: 20px;
      padding: 15px;
      background: #e3f2fd;
      border-radius: 8px;
      display: grid;
      grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
      gap: 15px;
    }
    .stat {
      text-align: center;
    }
    .stat-number {
      font-size: 32px;
      font-weight: bold;
      color: #1976D2;
    }
    .stat-label {
      color: #666;
      font-size: 14px;
    }

    /* ---- Explanation callout and example buttons ---- */
    .explanation {
      margin-top: 20px;
      padding: 15px;
      background: #fff3cd;
      border-left: 4px solid #ffc107;
      border-radius: 4px;
    }
    .examples {
      margin-top: 20px;
    }
    .example-btn {
      padding: 10px 20px;
      margin: 5px;
      background: #2196F3;
      color: white;
      border: none;
      border-radius: 6px;
      cursor: pointer;
      font-size: 14px;
    }
    .example-btn:hover {
      background: #1976D2;
    }
  </style>
</head>
<body>
  <div class="container">
    <h1>🔤 How AI "Reads" Text: Tokenization</h1>
    <p class="subtitle">Watch how AI breaks your words into tokens—the building blocks it actually processes</p>

    <!-- a11y fix: a placeholder is not an accessible name, so the textarea
         gets an aria-label (no layout change). -->
    <textarea id="input" aria-label="Text to tokenize"
              placeholder="Type or paste text here to see how AI tokenizes it...">The quick brown fox jumps over the lazy dog.</textarea>

    <div class="examples">
      <p><strong>Try these examples:</strong></p>
      <!-- type="button" makes the non-submitting role explicit; onclick is
           kept as-is so the buttons keep working with the page's global
           setExample() function. -->
      <button class="example-btn" type="button" onclick="setExample('Unbelievable!')">Unbelievable!</button>
      <button class="example-btn" type="button" onclick="setExample('ChatGPT is amazing')">ChatGPT is amazing</button>
      <button class="example-btn" type="button" onclick="setExample('I can\'t believe it\'s 2025!')">Contractions &amp; numbers</button>
      <button class="example-btn" type="button" onclick="setExample('🎉 Emoji test! 🚀')">Emoji test</button>
      <button class="example-btn" type="button" onclick="setExample('supercalifragilisticexpialidocious')">Long word</button>
    </div>

    <!-- Token spans are rendered into this region by the page script. -->
    <div class="output" id="output">
      <em>Tokens will appear here...</em>
    </div>

    <!-- Live counters, updated on every input event. -->
    <div class="stats">
      <div class="stat">
        <div class="stat-number" id="charCount">0</div>
        <div class="stat-label">Characters</div>
      </div>
      <div class="stat">
        <div class="stat-number" id="tokenCount">0</div>
        <div class="stat-label">Tokens</div>
      </div>
      <div class="stat">
        <div class="stat-number" id="ratio">0</div>
        <div class="stat-label">Chars per Token</div>
      </div>
    </div>

    <div class="explanation">
      <strong>💡 What's happening here?</strong><br>
      AI doesn't read "words" like we do. It breaks text into <strong>tokens</strong>—chunks that might be whole words, parts of words, or even single characters. This is why AI sometimes:
      <ul>
        <li>Cuts long words in weird places</li>
        <li>Handles common words easily but struggles with rare ones</li>
        <li>Has limits like "8K tokens" (not words!)</li>
        <li>Treats "can't" differently than "cannot"</li>
      </ul>
      <strong>Orange tokens</strong> = special characters (spaces, punctuation, emoji)
    </div>
  </div>
<script>
|
|
const input = document.getElementById('input');
|
|
const output = document.getElementById('output');
|
|
const charCount = document.getElementById('charCount');
|
|
const tokenCount = document.getElementById('tokenCount');
|
|
const ratio = document.getElementById('ratio');
|
|
|
|
// Simple tokenization approximation (simulates BPE-style tokenization)
|
|
// Approximate BPE-style tokenization for demo purposes.
// Real tokenizers learn merges from data; this uses simple length/frequency
// heuristics that produce plausible-looking splits.
//
// Fix over the original: the loop iterates Unicode code points (for...of)
// instead of UTF-16 code units (text[i]), so emoji and other astral-plane
// characters are no longer split into two broken surrogate-half tokens —
// the page's own "Emoji test" button exercised exactly that bug.
//
// Returns an array of { text, type } objects, type 'word' or 'special'.
function tokenize(text) {
  if (!text.trim()) return []; // empty / whitespace-only input

  const tokens = [];
  let current = ''; // word characters accumulated since the last separator

  // for...of walks code points, keeping surrogate pairs (emoji) intact.
  for (const char of text) {
    // Whitespace and punctuation terminate the current word and become
    // their own single-character 'special' tokens.
    if (char.match(/[\s\n\r\t,;:.!?'"()\[\]{}]/)) {
      if (current) {
        tokens.push(...breakWord(current)); // simulate subword merges
        current = '';
      }
      if (char.trim()) {
        // Visible punctuation keeps its own glyph.
        tokens.push({ text: char, type: 'special' });
      } else {
        // Whitespace is rendered as a middle dot so it stays visible.
        tokens.push({ text: '·', type: 'special' });
      }
    } else {
      current += char;
    }
  }

  // Flush the trailing word (input that doesn't end in a separator).
  if (current) {
    tokens.push(...breakWord(current));
  }

  return tokens;
}

// Split one word into subword tokens, mimicking how BPE keeps frequent or
// short strings whole and chops rare/long ones into pieces.
// NOTE(review): slicing below is by UTF-16 index, so a *long* word containing
// emoji could still split a surrogate pair mid-chunk — acceptable for a demo.
function breakWord(word) {
  // Very common English words get a dedicated whole-word token.
  const commonWords = ['the', 'is', 'at', 'it', 'in', 'on', 'and', 'or', 'to', 'a', 'an', 'of', 'for', 'with'];
  if (commonWords.includes(word.toLowerCase())) {
    return [{ text: word, type: 'word' }];
  }

  // Short words (<= 4 chars) usually stay whole.
  if (word.length <= 4) {
    return [{ text: word, type: 'word' }];
  }

  // Medium words (5-8 chars) split once at the midpoint.
  if (word.length <= 8) {
    const mid = Math.floor(word.length / 2);
    return [
      { text: word.slice(0, mid), type: 'word' },
      { text: word.slice(mid), type: 'word' }
    ];
  }

  // Long words split into ~4-character chunks.
  const tokens = [];
  let pos = 0;
  while (pos < word.length) {
    const chunkSize = Math.min(4, word.length - pos);
    tokens.push({ text: word.slice(pos, pos + chunkSize), type: 'word' });
    pos += chunkSize;
  }
  return tokens;
}
// Re-tokenize the textarea contents, render the token spans, and refresh
// the character/token statistics.
//
// Fix over the original: token text is HTML-escaped before being injected
// via innerHTML, so typed markup like <b> (or a stray <script> tag) is
// displayed literally instead of being parsed into the page.
function updateVisualization() {
  const text = input.value;
  const tokens = tokenize(text);

  // Escape the characters innerHTML would otherwise interpret as markup.
  const escapeHtml = (s) => s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  // One colored <span> per token; 'special' tokens pick up the orange style.
  output.innerHTML = tokens.map(token =>
    `<span class="token ${token.type}">${escapeHtml(token.text)}</span>`
  ).join('');

  // Counters: raw character count, token count, and their ratio.
  charCount.textContent = text.length;
  tokenCount.textContent = tokens.length;
  ratio.textContent = tokens.length > 0 ? (text.length / tokens.length).toFixed(1) : '0';
}
function setExample(text) {
|
|
input.value = text;
|
|
updateVisualization();
|
|
}
|
|
|
|
input.addEventListener('input', updateVisualization);
|
|
|
|
// Initial visualization
|
|
updateVisualization();
|
|
</script>
|
|
</body>
|
|
</html> |