Initial commit — Singular Particular Space v1

Homepage (site/index.html): integration-v14 promoted, Writings section
integrated with 33 pieces clustered by type (stories/essays/miscellany),
Writings welcome lightbox, content frame at 98% opacity.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 12:09:22 +02:00
commit 5422131782
359 changed files with 117437 additions and 0 deletions

261
DumperCan/tokenization_demo.html Executable file
View File

@@ -0,0 +1,261 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Tokenization Demo</title>
<style>
/* Page shell: a centered column (max 900px wide) on a light grey background. */
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
max-width: 900px;
margin: 40px auto;
padding: 20px;
background: #f5f5f5;
}
/* White card that wraps the whole demo. */
.container {
background: white;
padding: 30px;
border-radius: 12px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
h1 {
color: #333;
margin-bottom: 10px;
}
/* Italic tagline shown directly under the heading. */
.subtitle {
color: #666;
margin-bottom: 30px;
font-style: italic;
}
/* Text entry area; box-sizing keeps the 100% width inclusive of padding and border. */
textarea {
width: 100%;
min-height: 120px;
padding: 15px;
font-size: 16px;
border: 2px solid #ddd;
border-radius: 8px;
font-family: inherit;
resize: vertical;
box-sizing: border-box;
}
/* Replaces the browser's default focus outline with a green border. */
/* NOTE(review): the border colour change is the only focus indicator left; confirm it is visible enough. */
textarea:focus {
outline: none;
border-color: #4CAF50;
}
/* Panel that the script fills with one .token chip per token. */
.output {
margin-top: 30px;
padding: 20px;
background: #f9f9f9;
border-radius: 8px;
min-height: 100px;
}
/* A single token chip (green by default). */
.token {
display: inline-block;
padding: 6px 12px;
margin: 4px;
background: #4CAF50;
color: white;
border-radius: 6px;
font-family: 'Courier New', monospace;
font-size: 14px;
}
/* Orange variant, applied when the script emits a token with type 'special'. */
.token.special {
background: #FF9800;
}
/* Statistics strip: a responsive grid whose columns are at least 200px wide. */
.stats {
margin-top: 20px;
padding: 15px;
background: #e3f2fd;
border-radius: 8px;
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
}
.stat {
text-align: center;
}
/* Large figure (character count, token count, ratio). */
.stat-number {
font-size: 32px;
font-weight: bold;
color: #1976D2;
}
/* Caption under each figure. */
.stat-label {
color: #666;
font-size: 14px;
}
/* Yellow call-out box holding the explanatory text. */
.explanation {
margin-top: 20px;
padding: 15px;
background: #fff3cd;
border-left: 4px solid #ffc107;
border-radius: 4px;
}
/* Row of example buttons below the textarea. */
.examples {
margin-top: 20px;
}
.example-btn {
padding: 10px 20px;
margin: 5px;
background: #2196F3;
color: white;
border: none;
border-radius: 6px;
cursor: pointer;
font-size: 14px;
}
/* Darker blue on hover. */
.example-btn:hover {
background: #1976D2;
}
</style>
</head>
<body>
<!-- Primary content of the page; <main> exposes it as a landmark while the
     "container" class keeps the existing card styling. -->
<main class="container">
  <h1>🔤 How AI "Reads" Text: Tokenization</h1>
  <p class="subtitle">Watch how AI breaks your words into tokens—the building blocks it actually processes</p>
  <!-- aria-label supplies an accessible name; a placeholder alone is not a label. -->
  <textarea id="input" aria-label="Text to tokenize" placeholder="Type or paste text here to see how AI tokenizes it...">The quick brown fox jumps over the lazy dog.</textarea>
  <div class="examples">
    <strong>Try these examples:</strong><br>
    <!-- Each button loads a sample into the textarea via the script's global setExample(). -->
    <button type="button" class="example-btn" onclick="setExample('Unbelievable!')">Unbelievable!</button>
    <button type="button" class="example-btn" onclick="setExample('ChatGPT is amazing')">ChatGPT is amazing</button>
    <button type="button" class="example-btn" onclick="setExample('I can\'t believe it\'s 2025!')">Contractions &amp; numbers</button>
    <button type="button" class="example-btn" onclick="setExample('🎉 Emoji test! 🚀')">Emoji test</button>
    <button type="button" class="example-btn" onclick="setExample('supercalifragilisticexpialidocious')">Long word</button>
  </div>
  <!-- Filled by the script with one chip per token; the placeholder text is replaced on first render. -->
  <div class="output" id="output">
    <em>Tokens will appear here...</em>
  </div>
  <!-- The three figures below are written by the script through their ids. -->
  <div class="stats">
    <div class="stat">
      <div class="stat-number" id="charCount">0</div>
      <div class="stat-label">Characters</div>
    </div>
    <div class="stat">
      <div class="stat-number" id="tokenCount">0</div>
      <div class="stat-label">Tokens</div>
    </div>
    <div class="stat">
      <div class="stat-number" id="ratio">0</div>
      <div class="stat-label">Chars per Token</div>
    </div>
  </div>
  <div class="explanation">
    <strong>💡 What's happening here?</strong><br>
    AI doesn't read "words" like we do. It breaks text into <strong>tokens</strong>—chunks that might be whole words, parts of words, or even single characters. This is why AI sometimes:
    <ul>
      <li>Cuts long words in weird places</li>
      <li>Handles common words easily but struggles with rare ones</li>
      <li>Has limits like "8K tokens" (not words!)</li>
      <li>Treats "can't" differently than "cannot"</li>
    </ul>
    <strong>Orange tokens</strong> = special characters (spaces, punctuation, emoji)
  </div>
</main>
<script>
// Look up, once, every element the script reads from or writes to.
// The binding names match the element ids one-to-one.
const [input, output, charCount, tokenCount, ratio] =
  ['input', 'output', 'charCount', 'tokenCount', 'ratio']
    .map((id) => document.getElementById(id));
// Simple tokenization approximation (simulates BPE-style tokenization).
//
// Splits `text` into an array of display tokens of the form { text, type }:
//   - each whitespace character, punctuation mark and emoji becomes one
//     token with type 'special' (whitespace is shown as '·' so it is visible);
//   - every other run of characters is passed to breakWord(), which returns
//     one or more tokens for it.
// Returns [] when `text` is empty or contains only whitespace.
function tokenize(text) {
  if (!text.trim()) return [];

  // Built once per call rather than once per character.
  const SEPARATOR = /[\s,;:.!?'"()\[\]{}]/;
  const EMOJI = /\p{Extended_Pictographic}/u;
  // Code points that continue the preceding emoji instead of starting a new
  // token: zero-width joiner, variation selector-16, skin-tone modifiers.
  const EMOJI_EXTENDER = /[\u200D\uFE0F\u{1F3FB}-\u{1F3FF}]/u;
  const ZWJ = '\u200D';

  const tokens = [];
  let current = '';
  // The emoji token just pushed, while it may still be extended; null otherwise.
  let emojiToken = null;

  // Emit the pending word (if any) as subword tokens.
  const flushWord = () => {
    if (current) {
      tokens.push(...breakWord(current));
      current = '';
    }
  };

  // for...of iterates by code point, so an emoji is never seen as two
  // separate surrogate halves.
  for (const char of text) {
    // Keep joined/modified emoji (e.g. skin tones, ZWJ sequences) in one token.
    if (emojiToken &&
        (EMOJI_EXTENDER.test(char) ||
         (emojiToken.text.endsWith(ZWJ) && EMOJI.test(char)))) {
      emojiToken.text += char;
      continue;
    }
    emojiToken = null;

    if (EMOJI.test(char)) {
      flushWord();
      emojiToken = { text: char, type: 'special' };
      tokens.push(emojiToken);
    } else if (SEPARATOR.test(char)) {
      flushWord();
      // Non-whitespace specials are shown as-is; whitespace is visualized as '·'.
      tokens.push({ text: char.trim() ? char : '·', type: 'special' });
    } else {
      current += char;
    }
  }

  flushWord();
  return tokens;
}
// Simulate subword tokenization of one word.
//
// `word` is a run of characters with no separators in it. Returns an array of
// { text, type: 'word' } tokens:
//   - a listed common word, or any word of up to 4 characters, stays whole;
//   - a word of 5-8 characters is split once, at the midpoint (rounded down);
//   - a longer word is cut into consecutive chunks of 4 characters, the last
//     chunk holding whatever remains.
// Lengths and cut points are counted in Unicode code points, not UTF-16 code
// units, so a character outside the BMP (e.g. an emoji) is never cut in half.
function breakWord(word) {
  // Very common words stay whole.
  const commonWords = ['the', 'is', 'at', 'it', 'in', 'on', 'and', 'or', 'to', 'a', 'an', 'of', 'for', 'with'];
  const asToken = (text) => ({ text, type: 'word' });

  // Array.from splits the string by code point.
  const chars = Array.from(word);

  // Common words and short words (<= 4 chars) are a single token.
  if (commonWords.includes(word.toLowerCase()) || chars.length <= 4) {
    return [asToken(word)];
  }

  // Medium words (5-8 chars) split once in the middle.
  if (chars.length <= 8) {
    const mid = Math.floor(chars.length / 2);
    return [
      asToken(chars.slice(0, mid).join('')),
      asToken(chars.slice(mid).join('')),
    ];
  }

  // Long words split into 4-char chunks; slice() clamps the final chunk.
  const tokens = [];
  for (let pos = 0; pos < chars.length; pos += 4) {
    tokens.push(asToken(chars.slice(pos, pos + 4).join('')));
  }
  return tokens;
}
// Re-render the token chips and the three statistics from the textarea's
// current value.
function updateVisualization() {
  const text = input.value;
  const tokens = tokenize(text);

  // Build the chips as DOM nodes and set their text via textContent, so that
  // user input containing '<', '>' or '&' is displayed literally instead of
  // being parsed as markup.
  const chips = document.createDocumentFragment();
  for (const token of tokens) {
    const chip = document.createElement('span');
    chip.className = `token ${token.type}`;
    chip.textContent = token.text;
    chips.appendChild(chip);
  }
  output.textContent = ''; // drop the previous render
  output.appendChild(chips);

  // Update stats
  charCount.textContent = text.length;
  tokenCount.textContent = tokens.length;
  // Guard against dividing by zero when there are no tokens.
  ratio.textContent = tokens.length > 0 ? (text.length / tokens.length).toFixed(1) : '0';
}
// Load `text` into the textarea and refresh the display.
// Called from the onclick attributes of the example buttons.
function setExample(text) {
input.value = text;
// Re-render explicitly: assigning .value from script does not fire 'input'.
updateVisualization();
}
// Re-tokenize whenever the textarea's content is edited.
input.addEventListener('input', updateVisualization);
// Initial visualization: render once at load so the textarea's pre-filled text is tokenized.
updateVisualization();
</script>
</body>
</html>