Initial commit — Singular Particular Space v1

Homepage (site/index.html): integration-v14 promoted, Writings section integrated with 33 pieces clustered by type (stories/essays/miscellany), Writings welcome lightbox, content frame at 98% opacity. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 12:09:22 +02:00
commit 5422131782
359 changed files with 117437 additions and 0 deletions
--- a/DumperCan/tokenization_demo.html
+++ b/DumperCan/tokenization_demo.html
@@ -0,0 +1,261 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Tokenization Demo</title>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+            max-width: 900px;
+            margin: 40px auto;
+            padding: 20px;
+            background: #f5f5f5;
+        }
+        .container {
+            background: white;
+            padding: 30px;
+            border-radius: 12px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }
+        h1 {
+            color: #333;
+            margin-bottom: 10px;
+        }
+        .subtitle {
+            color: #666;
+            margin-bottom: 30px;
+            font-style: italic;
+        }
+        textarea {
+            width: 100%;
+            min-height: 120px;
+            padding: 15px;
+            font-size: 16px;
+            border: 2px solid #ddd;
+            border-radius: 8px;
+            font-family: inherit;
+            resize: vertical;
+            box-sizing: border-box;
+        }
+        textarea:focus {
+            outline: none;
+            border-color: #4CAF50;
+        }
+        .output {
+            margin-top: 30px;
+            padding: 20px;
+            background: #f9f9f9;
+            border-radius: 8px;
+            min-height: 100px;
+        }
+        .token {
+            display: inline-block;
+            padding: 6px 12px;
+            margin: 4px;
+            background: #4CAF50;
+            color: white;
+            border-radius: 6px;
+            font-family: 'Courier New', monospace;
+            font-size: 14px;
+        }
+        .token.special {
+            background: #FF9800;
+        }
+        .stats {
+            margin-top: 20px;
+            padding: 15px;
+            background: #e3f2fd;
+            border-radius: 8px;
+            display: grid;
+            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+            gap: 15px;
+        }
+        .stat {
+            text-align: center;
+        }
+        .stat-number {
+            font-size: 32px;
+            font-weight: bold;
+            color: #1976D2;
+        }
+        .stat-label {
+            color: #666;
+            font-size: 14px;
+        }
+        .explanation {
+            margin-top: 20px;
+            padding: 15px;
+            background: #fff3cd;
+            border-left: 4px solid #ffc107;
+            border-radius: 4px;
+        }
+        .examples {
+            margin-top: 20px;
+        }
+        .example-btn {
+            padding: 10px 20px;
+            margin: 5px;
+            background: #2196F3;
+            color: white;
+            border: none;
+            border-radius: 6px;
+            cursor: pointer;
+            font-size: 14px;
+        }
+        .example-btn:hover {
+            background: #1976D2;
+        }
+    </style>
+</head>
+<body>
+    <!-- Demo UI: text input, example buttons, token output, statistics, and an explanation panel. -->
+    <div class="container">
+        <h1>🔤 How AI "Reads" Text: Tokenization</h1>
+        <p class="subtitle">Watch how AI breaks your words into tokens—the building blocks it actually processes</p>
+
+        <!-- A placeholder is not a label; aria-label gives the field an accessible name without changing the layout. -->
+        <textarea id="input" placeholder="Type or paste text here to see how AI tokenizes it..." aria-label="Text to tokenize">The quick brown fox jumps over the lazy dog.</textarea>
+
+        <!-- Each button loads a canned example into the textarea via setExample(). -->
+        <div class="examples">
+            <strong>Try these examples:</strong><br>
+            <button class="example-btn" type="button" onclick="setExample('Unbelievable!')">Unbelievable!</button>
+            <button class="example-btn" type="button" onclick="setExample('ChatGPT is amazing')">ChatGPT is amazing</button>
+            <button class="example-btn" type="button" onclick="setExample('I can\'t believe it\'s 2025!')">Contractions &amp; numbers</button>
+            <button class="example-btn" type="button" onclick="setExample('🎉 Emoji test! 🚀')">Emoji test</button>
+            <button class="example-btn" type="button" onclick="setExample('supercalifragilisticexpialidocious')">Long word</button>
+        </div>
+
+        <!-- Token spans are rendered into this element by the script. -->
+        <div class="output" id="output">
+            <em>Tokens will appear here...</em>
+        </div>
+
+        <!-- Counters filled in by the script on every input change. -->
+        <div class="stats">
+            <div class="stat">
+                <div class="stat-number" id="charCount">0</div>
+                <div class="stat-label">Characters</div>
+            </div>
+            <div class="stat">
+                <div class="stat-number" id="tokenCount">0</div>
+                <div class="stat-label">Tokens</div>
+            </div>
+            <div class="stat">
+                <div class="stat-number" id="ratio">0</div>
+                <div class="stat-label">Chars per Token</div>
+            </div>
+        </div>
+
+        <div class="explanation">
+            <strong>💡 What's happening here?</strong><br>
+            AI doesn't read "words" like we do. It breaks text into <strong>tokens</strong>—chunks that might be whole words, parts of words, or even single characters. This is why AI sometimes:
+            <ul>
+                <li>Cuts long words in weird places</li>
+                <li>Handles common words easily but struggles with rare ones</li>
+                <li>Has limits like "8K tokens" (not words!)</li>
+                <li>Treats "can't" differently than "cannot"</li>
+            </ul>
+            <!-- Only whitespace and punctuation get the "special" class in the script; emoji are rendered as ordinary word tokens. -->
+            <strong>Orange tokens</strong> = special characters (spaces, punctuation)
+        </div>
+    </div>
+
+    <script>
+        const input = document.getElementById('input');
+        const output = document.getElementById('output');
+        const charCount = document.getElementById('charCount');
+        const tokenCount = document.getElementById('tokenCount');
+        const ratio = document.getElementById('ratio');
+
+        // Simple tokenization approximation (simulates BPE-style tokenization)
+        function tokenize(text) {
+            if (!text.trim()) return [];
+            
+            const tokens = [];
+            let current = '';
+            
+            for (let i = 0; i < text.length; i++) {
+                const char = text[i];
+                
+                // Special characters get their own tokens
+                if (char.match(/[\s\n\r\t,;:.!?'"()\[\]{}]/)) {
+                    if (current) {
+                        // Break word into subword tokens (simulate BPE)
+                        tokens.push(...breakWord(current));
+                        current = '';
+                    }
+                    if (char.trim()) { // Only add non-whitespace special chars
+                        tokens.push({ text: char, type: 'special' });
+                    } else {
+                        tokens.push({ text: '·', type: 'special' }); // Visualize spaces
+                    }
+                } else {
+                    current += char;
+                }
+            }
+            
+            if (current) {
+                tokens.push(...breakWord(current));
+            }
+            
+            return tokens;
+        }
+
+        function breakWord(word) {
+            // Simulate subword tokenization
+            const tokens = [];
+            
+            // Very common words stay whole
+            const commonWords = ['the', 'is', 'at', 'it', 'in', 'on', 'and', 'or', 'to', 'a', 'an', 'of', 'for', 'with'];
+            if (commonWords.includes(word.toLowerCase())) {
+                return [{ text: word, type: 'word' }];
+            }
+            
+            // Short words (<=4 chars) usually stay whole
+            if (word.length <= 4) {
+                return [{ text: word, type: 'word' }];
+            }
+            
+            // Medium words (5-8 chars) might split once
+            if (word.length <= 8) {
+                const mid = Math.floor(word.length / 2);
+                return [
+                    { text: word.slice(0, mid), type: 'word' },
+                    { text: word.slice(mid), type: 'word' }
+                ];
+            }
+            
+            // Long words split into ~4 char chunks
+            let pos = 0;
+            while (pos < word.length) {
+                const chunkSize = Math.min(4, word.length - pos);
+                tokens.push({ text: word.slice(pos, pos + chunkSize), type: 'word' });
+                pos += chunkSize;
+            }
+            
+            return tokens;
+        }
+
+        function updateVisualization() {
+            const text = input.value;
+            const tokens = tokenize(text);
+            
+            // Update display
+            output.innerHTML = tokens.map(token => 
+                `<span class="token ${token.type}">${token.text}</span>`
+            ).join('');
+            
+            // Update stats
+            charCount.textContent = text.length;
+            tokenCount.textContent = tokens.length;
+            ratio.textContent = tokens.length > 0 ? (text.length / tokens.length).toFixed(1) : '0';
+        }
+
+        function setExample(text) {
+            input.value = text;
+            updateVisualization();
+        }
+
+        // Render the textarea's default text once on load...
+        updateVisualization();
+
+        // ...then re-render on every edit.
+        input.addEventListener('input', updateVisualization);
+    </script>
+</body>
+</html>