Flatten ToolsnToys structure; add edu toys, dendritic, legacy artifacts
Move 6 guide pages from Guides/ to ToolsnToys/ root; fix back-links. Add edu-toys.html (museum-style iframe exhibit for 4 legacy edu toy pages). Add 4 edu toy artifacts, dendritic curio, docker-cheatsheet-enhanced. Wire foss-tools, guides, edu-toys, and dendritic hrefs in toolsntoys.html. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
261
ToolsnToys/tokenization_demo.html
Executable file
261
ToolsnToys/tokenization_demo.html
Executable file
@@ -0,0 +1,261 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Tokenization Demo</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
||||
max-width: 900px;
|
||||
margin: 40px auto;
|
||||
padding: 20px;
|
||||
background: #f5f5f5;
|
||||
}
|
||||
.container {
|
||||
background: white;
|
||||
padding: 30px;
|
||||
border-radius: 12px;
|
||||
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
||||
}
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 30px;
|
||||
font-style: italic;
|
||||
}
|
||||
textarea {
|
||||
width: 100%;
|
||||
min-height: 120px;
|
||||
padding: 15px;
|
||||
font-size: 16px;
|
||||
border: 2px solid #ddd;
|
||||
border-radius: 8px;
|
||||
font-family: inherit;
|
||||
resize: vertical;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
textarea:focus {
|
||||
outline: none;
|
||||
border-color: #4CAF50;
|
||||
}
|
||||
.output {
|
||||
margin-top: 30px;
|
||||
padding: 20px;
|
||||
background: #f9f9f9;
|
||||
border-radius: 8px;
|
||||
min-height: 100px;
|
||||
}
|
||||
.token {
|
||||
display: inline-block;
|
||||
padding: 6px 12px;
|
||||
margin: 4px;
|
||||
background: #4CAF50;
|
||||
color: white;
|
||||
border-radius: 6px;
|
||||
font-family: 'Courier New', monospace;
|
||||
font-size: 14px;
|
||||
}
|
||||
.token.special {
|
||||
background: #FF9800;
|
||||
}
|
||||
.stats {
|
||||
margin-top: 20px;
|
||||
padding: 15px;
|
||||
background: #e3f2fd;
|
||||
border-radius: 8px;
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 15px;
|
||||
}
|
||||
.stat {
|
||||
text-align: center;
|
||||
}
|
||||
.stat-number {
|
||||
font-size: 32px;
|
||||
font-weight: bold;
|
||||
color: #1976D2;
|
||||
}
|
||||
.stat-label {
|
||||
color: #666;
|
||||
font-size: 14px;
|
||||
}
|
||||
.explanation {
|
||||
margin-top: 20px;
|
||||
padding: 15px;
|
||||
background: #fff3cd;
|
||||
border-left: 4px solid #ffc107;
|
||||
border-radius: 4px;
|
||||
}
|
||||
.examples {
|
||||
margin-top: 20px;
|
||||
}
|
||||
.example-btn {
|
||||
padding: 10px 20px;
|
||||
margin: 5px;
|
||||
background: #2196F3;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 6px;
|
||||
cursor: pointer;
|
||||
font-size: 14px;
|
||||
}
|
||||
.example-btn:hover {
|
||||
background: #1976D2;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>🔤 How AI "Reads" Text: Tokenization</h1>
|
||||
<p class="subtitle">Watch how AI breaks your words into tokens—the building blocks it actually processes</p>
|
||||
|
||||
<textarea id="input" placeholder="Type or paste text here to see how AI tokenizes it...">The quick brown fox jumps over the lazy dog.</textarea>
|
||||
|
||||
<div class="examples">
|
||||
<strong>Try these examples:</strong><br>
|
||||
<button class="example-btn" onclick="setExample('Unbelievable!')">Unbelievable!</button>
|
||||
<button class="example-btn" onclick="setExample('ChatGPT is amazing')">ChatGPT is amazing</button>
|
||||
<button class="example-btn" onclick="setExample('I can\'t believe it\'s 2025!')">Contractions & numbers</button>
|
||||
<button class="example-btn" onclick="setExample('🎉 Emoji test! 🚀')">Emoji test</button>
|
||||
<button class="example-btn" onclick="setExample('supercalifragilisticexpialidocious')">Long word</button>
|
||||
</div>
|
||||
|
||||
<div class="output" id="output">
|
||||
<em>Tokens will appear here...</em>
|
||||
</div>
|
||||
|
||||
<div class="stats">
|
||||
<div class="stat">
|
||||
<div class="stat-number" id="charCount">0</div>
|
||||
<div class="stat-label">Characters</div>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<div class="stat-number" id="tokenCount">0</div>
|
||||
<div class="stat-label">Tokens</div>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<div class="stat-number" id="ratio">0</div>
|
||||
<div class="stat-label">Chars per Token</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="explanation">
|
||||
<strong>💡 What's happening here?</strong><br>
|
||||
AI doesn't read "words" like we do. It breaks text into <strong>tokens</strong>—chunks that might be whole words, parts of words, or even single characters. This is why AI sometimes:
|
||||
<ul>
|
||||
<li>Cuts long words in weird places</li>
|
||||
<li>Handles common words easily but struggles with rare ones</li>
|
||||
<li>Has limits like "8K tokens" (not words!)</li>
|
||||
<li>Treats "can't" differently than "cannot"</li>
|
||||
</ul>
|
||||
<strong>Orange tokens</strong> = special characters (spaces, punctuation, emoji)
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Cache the interactive elements once at load time.
const $ = (id) => document.getElementById(id);
const input = $('input');
const output = $('output');
const charCount = $('charCount');
const tokenCount = $('tokenCount');
const ratio = $('ratio');
|
||||
|
||||
// Simple tokenization approximation (simulates BPE-style tokenization).
//
// Iterates by Unicode code point (for...of) rather than by UTF-16 unit
// (text[i]); the original per-index loop split surrogate-pair characters
// such as emoji into two broken halves. Emoji/pictographs are emitted as
// their own 'special' tokens, matching the page legend ("Orange tokens =
// special characters (spaces, punctuation, emoji)").
//
// @param {string} text - raw user input from the textarea.
// @returns {Array<{text: string, type: string}>} tokens; type is 'word'
//   or 'special'. Whitespace is visualized as '·'.
function tokenize(text) {
    if (!text.trim()) return [];

    const tokens = [];
    let current = '';

    // Flush any pending word through the subword splitter (simulate BPE).
    const flush = () => {
        if (current) {
            tokens.push(...breakWord(current));
            current = '';
        }
    };

    for (const char of text) { // code-point iteration: emoji stay intact
        // Special characters get their own tokens
        if (char.match(/[\s\n\r\t,;:.!?'"()\[\]{}]/)) {
            flush();
            if (char.trim()) { // Only add non-whitespace special chars
                tokens.push({ text: char, type: 'special' });
            } else {
                tokens.push({ text: '·', type: 'special' }); // Visualize spaces
            }
        } else if (/\p{Extended_Pictographic}/u.test(char)) {
            // Emoji and pictographs count as special characters too.
            flush();
            tokens.push({ text: char, type: 'special' });
        } else {
            current += char;
        }
    }

    flush();

    return tokens;
}
|
||||
|
||||
// Split a single word into simulated BPE subword tokens.
//
// Rules: very common words and words of <= 4 characters stay whole;
// 5-8 character words split once at the midpoint; longer words are cut
// into 4-character chunks (the final chunk may be shorter).
//
// @param {string} word - a run of non-special characters.
// @returns {Array<{text: string, type: string}>} tokens of type 'word'.
function breakWord(word) {
    // Whitelist of very frequent words that tokenizers keep whole.
    const commonWords = ['the', 'is', 'at', 'it', 'in', 'on', 'and', 'or', 'to', 'a', 'an', 'of', 'for', 'with'];

    // Common or short words (<= 4 chars) stay as a single token.
    if (commonWords.includes(word.toLowerCase()) || word.length <= 4) {
        return [{ text: word, type: 'word' }];
    }

    // Medium words (5-8 chars) split once at the midpoint.
    if (word.length <= 8) {
        const half = Math.floor(word.length / 2);
        return [word.slice(0, half), word.slice(half)]
            .map((part) => ({ text: part, type: 'word' }));
    }

    // Long words: consume ~4-char chunks left to right (slice clamps the
    // final chunk to whatever remains).
    const pieces = [];
    for (let start = 0; start < word.length; start += 4) {
        pieces.push({ text: word.slice(start, start + 4), type: 'word' });
    }
    return pieces;
}
|
||||
|
||||
// Re-tokenize the current input and refresh the token chips plus the
// character/token/ratio statistics.
//
// Token text is HTML-escaped before interpolation into innerHTML; the
// original inserted raw user text, so typing markup such as
// "<img src=x onerror=alert(1)>" into the textarea was injected as live
// HTML (XSS / broken rendering).
function updateVisualization() {
    const text = input.value;
    const tokens = tokenize(text);

    // Minimal HTML escaper for untrusted token text.
    const escapeHtml = (s) => s
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    // Update display
    output.innerHTML = tokens.map(token =>
        `<span class="token ${token.type}">${escapeHtml(token.text)}</span>`
    ).join('');

    // Update stats
    charCount.textContent = text.length;
    tokenCount.textContent = tokens.length;
    ratio.textContent = tokens.length > 0 ? (text.length / tokens.length).toFixed(1) : '0';
}
|
||||
|
||||
// Load one of the canned example strings into the textarea and
// immediately re-run the visualization. Invoked by the example buttons'
// onclick handlers.
function setExample(sample) {
    input.value = sample;
    updateVisualization();
}
|
||||
|
||||
// Re-tokenize on every keystroke in the textarea.
input.addEventListener('input', updateVisualization);

// Initial visualization
// (render the default sample text that ships in the textarea on load).
updateVisualization();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user