singular-particular-space/ToolsnToys/tokenization_demo.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Tokenization Demo</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
            max-width: 900px;
            margin: 40px auto;
            padding: 20px;
            background: #f5f5f5;
        }
        .container {
            background: white;
            padding: 30px;
            border-radius: 12px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        h1 {
            color: #333;
            margin-bottom: 10px;
        }
        .subtitle {
            color: #666;
            margin-bottom: 30px;
            font-style: italic;
        }
        textarea {
            width: 100%;
            min-height: 120px;
            padding: 15px;
            font-size: 16px;
            border: 2px solid #ddd;
            border-radius: 8px;
            font-family: inherit;
            resize: vertical;
            box-sizing: border-box;
        }
        textarea:focus {
            outline: none;
            border-color: #4CAF50;
        }
        .output {
            margin-top: 30px;
            padding: 20px;
            background: #f9f9f9;
            border-radius: 8px;
            min-height: 100px;
        }
        .token {
            display: inline-block;
            padding: 6px 12px;
            margin: 4px;
            background: #4CAF50;
            color: white;
            border-radius: 6px;
            font-family: 'Courier New', monospace;
            font-size: 14px;
        }
        .token.special {
            background: #FF9800;
        }
        .stats {
            margin-top: 20px;
            padding: 15px;
            background: #e3f2fd;
            border-radius: 8px;
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 15px;
        }
        .stat {
            text-align: center;
        }
        .stat-number {
            font-size: 32px;
            font-weight: bold;
            color: #1976D2;
        }
        .stat-label {
            color: #666;
            font-size: 14px;
        }
        .explanation {
            margin-top: 20px;
            padding: 15px;
            background: #fff3cd;
            border-left: 4px solid #ffc107;
            border-radius: 4px;
        }
        .examples {
            margin-top: 20px;
        }
        .example-btn {
            padding: 10px 20px;
            margin: 5px;
            background: #2196F3;
            color: white;
            border: none;
            border-radius: 6px;
            cursor: pointer;
            font-size: 14px;
        }
        .example-btn:hover {
            background: #1976D2;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>🔤 How AI "Reads" Text: Tokenization</h1>
        <p class="subtitle">Watch how AI breaks your words into tokens—the building blocks it actually processes</p>

        <textarea id="input" placeholder="Type or paste text here to see how AI tokenizes it...">The quick brown fox jumps over the lazy dog.</textarea>

        <div class="examples">
            <strong>Try these examples:</strong><br>
            <button class="example-btn" onclick="setExample('Unbelievable!')">Unbelievable!</button>
            <button class="example-btn" onclick="setExample('ChatGPT is amazing')">ChatGPT is amazing</button>
            <button class="example-btn" onclick="setExample('I can\'t believe it\'s 2025!')">Contractions & numbers</button>
            <button class="example-btn" onclick="setExample('🎉 Emoji test! 🚀')">Emoji test</button>
            <button class="example-btn" onclick="setExample('supercalifragilisticexpialidocious')">Long word</button>
        </div>

        <div class="output" id="output">
            <em>Tokens will appear here...</em>
        </div>

        <div class="stats">
            <div class="stat">
                <div class="stat-number" id="charCount">0</div>
                <div class="stat-label">Characters</div>
            </div>
            <div class="stat">
                <div class="stat-number" id="tokenCount">0</div>
                <div class="stat-label">Tokens</div>
            </div>
            <div class="stat">
                <div class="stat-number" id="ratio">0</div>
                <div class="stat-label">Chars per Token</div>
            </div>
        </div>

        <div class="explanation">
            <strong>💡 What's happening here?</strong><br>
            AI doesn't read "words" like we do. It breaks text into <strong>tokens</strong>—chunks that might be whole words, parts of words, or even single characters. This is why AI sometimes:
            <ul>
                <li>Cuts long words in weird places</li>
                <li>Handles common words easily but struggles with rare ones</li>
                <li>Has limits like "8K tokens" (not words!)</li>
                <li>Treats "can't" differently than "cannot"</li>
            </ul>
            <strong>Orange tokens</strong> = special characters (spaces, punctuation, emoji)
        </div>
    </div>

    <script>
        const input = document.getElementById('input');
        const output = document.getElementById('output');
        const charCount = document.getElementById('charCount');
        const tokenCount = document.getElementById('tokenCount');
        const ratio = document.getElementById('ratio');

        // Simple tokenization approximation (simulates BPE-style tokenization)
        function tokenize(text) {
            if (!text.trim()) return [];

            const tokens = [];
            let current = '';

            for (let i = 0; i < text.length; i++) {
                const char = text[i];

                // Special characters get their own tokens
                if (char.match(/[\s\n\r\t,;:.!?'"()\[\]{}]/)) {
                    if (current) {
                        // Break word into subword tokens (simulate BPE)
                        tokens.push(...breakWord(current));
                        current = '';
                    }
                    if (char.trim()) { // Only add non-whitespace special chars
                        tokens.push({ text: char, type: 'special' });
                    } else {
                        tokens.push({ text: '·', type: 'special' }); // Visualize spaces
                    }
                } else {
                    current += char;
                }
            }

            if (current) {
                tokens.push(...breakWord(current));
            }

            return tokens;
        }

        function breakWord(word) {
            // Simulate subword tokenization
            const tokens = [];

            // Very common words stay whole
            const commonWords = ['the', 'is', 'at', 'it', 'in', 'on', 'and', 'or', 'to', 'a', 'an', 'of', 'for', 'with'];
            if (commonWords.includes(word.toLowerCase())) {
                return [{ text: word, type: 'word' }];
            }

            // Short words (<=4 chars) usually stay whole
            if (word.length <= 4) {
                return [{ text: word, type: 'word' }];
            }

            // Medium words (5-8 chars) might split once
            if (word.length <= 8) {
                const mid = Math.floor(word.length / 2);
                return [
                    { text: word.slice(0, mid), type: 'word' },
                    { text: word.slice(mid), type: 'word' }
                ];
            }

            // Long words split into ~4 char chunks
            let pos = 0;
            while (pos < word.length) {
                const chunkSize = Math.min(4, word.length - pos);
                tokens.push({ text: word.slice(pos, pos + chunkSize), type: 'word' });
                pos += chunkSize;
            }

            return tokens;
        }

        function updateVisualization() {
            const text = input.value;
            const tokens = tokenize(text);

            // Update display
            output.innerHTML = tokens.map(token =>
                `<span class="token ${token.type}">${token.text}</span>`
            ).join('');

            // Update stats
            charCount.textContent = text.length;
            tokenCount.textContent = tokens.length;
            ratio.textContent = tokens.length > 0 ? (text.length / tokens.length).toFixed(1) : '0';
        }

        function setExample(text) {
            input.value = text;
            updateVisualization();
        }

        input.addEventListener('input', updateVisualization);

        // Initial visualization
        updateVisualization();
    </script>
</body>
</html>