From 6c3abab0322ed1cc4c1fbe4ecb6aee6737a2ea31 Mon Sep 17 00:00:00 2001 From: kennethnym Date: Sun, 22 Mar 2026 21:16:16 +0000 Subject: [PATCH] Interactive explainer and manim visualization of Percepta's 'Can LLMs Be Computers?' - Interactive web app (interactive/) explaining how transformer weights execute deterministic WASM programs: softmax sharpening, 2D parabola trick for exact memory lookup, stack machine step-through, and full execution trace visualization - Manim animation script (manim_project/scene.py) with 9 scenes covering the article's key concepts Co-authored-by: Ona --- .gitignore | 17 + interactive/app.js | 818 ++++++++++++++++++++++++++ interactive/index.html | 515 +++++++++++++++++ interactive/style.css | 1237 ++++++++++++++++++++++++++++++++++++++++ manim_project/scene.py | 523 +++++++++++++++++ 5 files changed, 3110 insertions(+) create mode 100644 .gitignore create mode 100644 interactive/app.js create mode 100644 interactive/index.html create mode 100644 interactive/style.css create mode 100644 manim_project/scene.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..857b2ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +# Manim build artifacts +manim_project/media/ +manim_project/output.mp4 +manim_project/__pycache__/ + +# Python +__pycache__/ +*.pyc + +# Ona +.ona/ + +# Devcontainer +.devcontainer/ + +# OS +.DS_Store diff --git a/interactive/app.js b/interactive/app.js new file mode 100644 index 0000000..fbe6196 --- /dev/null +++ b/interactive/app.js @@ -0,0 +1,818 @@ +// ── Tab Navigation ── +document.querySelectorAll('.tab').forEach(btn => { + btn.addEventListener('click', () => { + document.querySelectorAll('.tab').forEach(b => b.classList.remove('active')); + document.querySelectorAll('.panel').forEach(p => p.classList.remove('active')); + btn.classList.add('active'); + document.getElementById(btn.dataset.tab).classList.add('active'); + }); +}); +document.querySelectorAll('.next-btn').forEach(btn => { + 
btn.addEventListener('click', () => { + document.querySelector(`.tab[data-tab="${btn.dataset.next}"]`).click(); + window.scrollTo({ top: 0, behavior: 'smooth' }); + }); +}); + +// ── Helpers ── +function softmax(scores) { + const max = Math.max(...scores); + const exps = scores.map(s => Math.exp(s - max)); + const sum = exps.reduce((a, b) => a + b, 0); + return exps.map(e => e / sum); +} + +// ══════════════════════════════════════════ +// TAB 1: Softmax Temperature Demo +// ══════════════════════════════════════════ +const rawScores = [1.2, 0.5, 3.8, 0.9, 1.5]; // index 2 is the "target" +const scoreLabels = ['slot 0', 'slot 1', 'slot 2', 'slot 3', 'slot 4']; + +function renderSoftmaxBars(temp) { + const scaled = rawScores.map(s => s * temp); + const weights = softmax(scaled); + const maxW = Math.max(...weights); + const container = document.getElementById('softmaxBars'); + container.innerHTML = ''; + weights.forEach((w, i) => { + const col = document.createElement('div'); + col.className = 'bar-col'; + const wrapper = document.createElement('div'); + wrapper.className = 'bar-wrapper'; + const bar = document.createElement('div'); + bar.className = 'bar' + (w === maxW ? ' winner' : ''); + bar.style.height = (w * 130) + 'px'; + wrapper.appendChild(bar); + const val = document.createElement('div'); + val.className = 'bar-value'; + val.textContent = (w * 100).toFixed(1) + '%'; + const lbl = document.createElement('div'); + lbl.className = 'bar-label'; + lbl.textContent = scoreLabels[i] + (i === 2 ? ' ★' : ''); + col.append(val, wrapper, lbl); + container.appendChild(col); + }); + const insight = document.getElementById('tempInsight'); + if (temp < 5) insight.textContent = 'At low temperature, attention is spread out — fuzzy, not useful for exact computation.'; + else if (temp < 20) insight.textContent = 'Getting sharper! The target slot is winning, but there\'s still leakage to other slots.'; + else insight.textContent = 'Nearly 100% on the target. 
The softmax now acts like an exact array read — this is how weights produce deterministic lookups.'; +} + +document.getElementById('tempSlider').addEventListener('input', e => { + const v = +e.target.value; + document.getElementById('tempVal').textContent = v; + renderSoftmaxBars(v); +}); +renderSoftmaxBars(1); + +// ══════════════════════════════════════════ +// TAB 2: Memory Lookup via Attention +// ══════════════════════════════════════════ +const memValues = [42, 17, 99, 8, 55, 73]; +let queryTarget = 2; + +function makeColVec(values, cls, label) { + // Returns a DOM element showing a column vector with bracket notation + const wrap = document.createElement('div'); + wrap.className = 'col-vec ' + cls; + wrap.innerHTML = '
'; + values.forEach(v => { + const cell = document.createElement('div'); + cell.className = 'cell'; + cell.textContent = v; + wrap.appendChild(cell); + }); + if (label) { + const lbl = document.createElement('div'); + lbl.className = 'vec-label'; + lbl.innerHTML = label; + wrap.appendChild(lbl); + } + return wrap; +} + +function renderMemory() { + // Memory slots + const container = document.getElementById('memorySlots'); + container.innerHTML = ''; + memValues.forEach((v, i) => { + const slot = document.createElement('div'); + slot.className = 'mem-slot' + (i === queryTarget ? ' active' : ''); + slot.innerHTML = `
addr ${i}
${v}
`; + slot.addEventListener('click', () => { queryTarget = i; renderMemory(); }); + container.appendChild(slot); + }); + + // Query vector + const qEl = document.getElementById('queryVec'); + qEl.innerHTML = ''; + const qVec = makeColVec([queryTarget, 1], 'query', `q`); + const qNote = document.createElement('span'); + qNote.style.cssText = 'font-size:0.82rem;color:var(--dim);margin-left:12px'; + qNote.innerHTML = `= (i, 1) where i = ${queryTarget}  ← "I want to read address ${queryTarget}"`; + qEl.appendChild(qVec); + qEl.appendChild(qNote); + + // Key vectors + dot products + const vecEl = document.getElementById('vecColumns'); + vecEl.innerHTML = ''; + + const scores = memValues.map((_, j) => 2 * queryTarget * j - j * j); + const maxScore = Math.max(...scores); + const minScore = Math.min(...scores); + const scoreRange = maxScore - minScore || 1; + const weights = softmax(scores.map(s => s * 10)); + + memValues.forEach((val, j) => { + const isWin = j === queryTarget; + const k = [2 * j, -(j * j)]; + + const group = document.createElement('div'); + group.className = 'vec-group' + (isWin ? ' winner' : ''); + + // Column vector + const vec = makeColVec(k, isWin ? 'winner' : '', `k${j}`); + group.appendChild(vec); + + // Dot product computation + const comp = document.createElement('div'); + comp.className = 'dot-computation'; + comp.innerHTML = `${queryTarget}×${k[0]} + 1×${k[1] >= 0 ? 
k[1] : '(' + k[1] + ')'}`; + group.appendChild(comp); + + // Score + weight + const dpLine = document.createElement('div'); + dpLine.className = 'dot-product-line'; + dpLine.innerHTML = `${scores[j]}${(weights[j] * 100).toFixed(1)}%`; + group.appendChild(dpLine); + + // Mini bar + const bar = document.createElement('div'); + bar.className = 'dp-bar-mini'; + bar.style.width = Math.max(2, ((scores[j] - minScore) / scoreRange) * 60) + 'px'; + group.appendChild(bar); + + // Value stored + const valLabel = document.createElement('div'); + valLabel.style.cssText = `font-size:0.7rem;margin-top:4px;color:${isWin ? 'var(--gold)' : 'var(--dim)'};font-family:monospace`; + valLabel.textContent = `val=${val}`; + group.appendChild(valLabel); + + vecEl.appendChild(group); + + // Add "·" or "=" separator between groups (except last) + if (j < memValues.length - 1) { + const sep = document.createElement('div'); + sep.style.cssText = 'align-self:center;color:var(--border);font-size:1.2rem;padding:0 2px'; + sep.textContent = ''; + vecEl.appendChild(sep); + } + }); + + // Read result + document.getElementById('readResult').innerHTML = + `Read result: mem[${queryTarget}] = ${memValues[queryTarget]}  — key k${queryTarget} gets score ${maxScore} (softmax weight ${(weights[queryTarget] * 100).toFixed(2)}%), all others are penalized by −(i−j)²`; +} + +renderMemory(); + +// ══════════════════════════════════════════ +// TAB 2b: Side-by-Side Comparison +// ══════════════════════════════════════════ +const sbsWords = ['The', 'cake', 'delicious', 'was', 'very']; +// Simulated high-dim embeddings (4D slice for display) — designed to show semantic similarity spread +const sbsTradKeys = [ + [0.2, -0.1, 0.8, 0.3], // The + [0.9, 0.7, 0.1, -0.2], // cake + [0.8, 0.9, -0.1, 0.3], // delicious + [0.1, -0.3, 0.7, 0.5], // was + [0.3, 0.1, 0.2, 0.9], // very +]; +// Queries are similar to the target but with overlap to neighbors (semantic similarity) +const sbsTradQueries = [ + [0.3, -0.2, 0.9, 0.2], 
// attending to "The" + [0.8, 0.6, 0.2, -0.1], // attending to "cake" + [0.7, 0.8, 0.0, 0.4], // attending to "delicious" + [0.2, -0.2, 0.8, 0.4], // attending to "was" + [0.4, 0.2, 0.1, 0.8], // attending to "very" +]; + +let sbsTarget = 2; + +function makeSbsColVec(values, cls, label) { + const wrap = document.createElement('div'); + wrap.className = 'col-vec sbs-vec ' + cls; + wrap.innerHTML = '
'; + values.forEach(v => { + const cell = document.createElement('div'); + cell.className = 'cell'; + cell.textContent = typeof v === 'number' ? (Number.isInteger(v) ? v : v.toFixed(1)) : v; + wrap.appendChild(cell); + }); + if (label) { + const lbl = document.createElement('div'); + lbl.className = 'vec-label'; + lbl.innerHTML = label; + wrap.appendChild(lbl); + } + return wrap; +} + +function renderSBS() { + const target = sbsTarget; + document.getElementById('sbsTargetLabel').textContent = sbsWords[target]; + + // ── Traditional side ── + const tradQEl = document.getElementById('sbsTradQ'); + tradQEl.innerHTML = ''; + const tradQLabel = document.createElement('span'); + tradQLabel.style.cssText = 'font-size:0.75rem;color:var(--dim);margin-right:4px'; + tradQLabel.textContent = 'query:'; + tradQEl.appendChild(tradQLabel); + tradQEl.appendChild(makeSbsColVec(sbsTradQueries[target], 'query', 'q')); + + const tradKeysEl = document.getElementById('sbsTradKeys'); + tradKeysEl.innerHTML = ''; + + const tradScores = sbsTradKeys.map(k => + k.reduce((sum, ki, d) => sum + ki * sbsTradQueries[target][d], 0) + ); + const tradWeights = softmax(tradScores.map(s => s * 4)); // moderate temperature + const tradMaxW = Math.max(...tradWeights); + + sbsWords.forEach((word, j) => { + const grp = document.createElement('div'); + const isTop = tradWeights[j] === tradMaxW; + grp.className = 'sbs-key-group' + (isTop ? 
' trad-winner' : ''); + + const wordEl = document.createElement('div'); + wordEl.className = 'sbs-word'; + wordEl.textContent = word; + grp.appendChild(wordEl); + + grp.appendChild(makeSbsColVec(sbsTradKeys[j], '', `k${j}`)); + + const score = document.createElement('div'); + score.className = 'sbs-score'; + score.textContent = tradScores[j].toFixed(2); + grp.appendChild(score); + + const weight = document.createElement('div'); + weight.className = 'sbs-weight'; + weight.textContent = (tradWeights[j] * 100).toFixed(1) + '%'; + grp.appendChild(weight); + + const bar = document.createElement('div'); + bar.className = 'sbs-weight-bar'; + bar.style.width = (tradWeights[j] / tradMaxW * 50) + 'px'; + grp.appendChild(bar); + + tradKeysEl.appendChild(grp); + }); + + const tradResult = document.getElementById('sbsTradResult'); + const topTrad = tradWeights.map((w, i) => ({w, i})).sort((a, b) => b.w - a.w); + tradResult.innerHTML = `Output: blend of "${sbsWords[topTrad[0].i]}" (${(topTrad[0].w*100).toFixed(0)}%) + "${sbsWords[topTrad[1].i]}" (${(topTrad[1].w*100).toFixed(0)}%) + others
→ A fuzzy mix of semantically related tokens`; + + // ── Lookup side ── + const lookupQEl = document.getElementById('sbsLookupQ'); + lookupQEl.innerHTML = ''; + const lookupQLabel = document.createElement('span'); + lookupQLabel.style.cssText = 'font-size:0.75rem;color:var(--dim);margin-right:4px'; + lookupQLabel.textContent = 'query:'; + lookupQEl.appendChild(lookupQLabel); + lookupQEl.appendChild(makeSbsColVec([target, 1], 'query', 'q')); + + const lookupKeysEl = document.getElementById('sbsLookupKeys'); + lookupKeysEl.innerHTML = ''; + + const lookupScores = sbsWords.map((_, j) => 2 * target * j - j * j); + const lookupWeights = softmax(lookupScores.map(s => s * 10)); + const lookupMaxW = Math.max(...lookupWeights); + + sbsWords.forEach((word, j) => { + const grp = document.createElement('div'); + const isWin = lookupWeights[j] === lookupMaxW; + grp.className = 'sbs-key-group' + (isWin ? ' winner' : ''); + + const wordEl = document.createElement('div'); + wordEl.className = 'sbs-word'; + wordEl.textContent = `addr ${j}`; + grp.appendChild(wordEl); + + grp.appendChild(makeSbsColVec([2 * j, -(j * j)], isWin ? 'winner' : '', `k${j}`)); + + const score = document.createElement('div'); + score.className = 'sbs-score'; + score.textContent = lookupScores[j]; + grp.appendChild(score); + + const weight = document.createElement('div'); + weight.className = 'sbs-weight'; + weight.textContent = (lookupWeights[j] * 100).toFixed(1) + '%'; + grp.appendChild(weight); + + const bar = document.createElement('div'); + bar.className = 'sbs-weight-bar'; + bar.style.width = (lookupWeights[j] / (lookupMaxW || 1) * 50) + 'px'; + grp.appendChild(bar); + + lookupKeysEl.appendChild(grp); + }); + + const lookupResult = document.getElementById('sbsLookupResult'); + lookupResult.innerHTML = `Output: value at addr ${target} with ${(lookupWeights[target] * 100).toFixed(2)}% weight
→ An exact read of one specific address`; +} + +document.getElementById('sbsTargetSlider').addEventListener('input', e => { + sbsTarget = +e.target.value; + renderSBS(); +}); +renderSBS(); + +// ══════════════════════════════════════════ +// TAB 3: Parabola Visualization +// ══════════════════════════════════════════ +function drawParabola(queryIdx) { + const canvas = document.getElementById('parabolaCanvas'); + const ctx = canvas.getContext('2d'); + const W = canvas.width, H = canvas.height; + ctx.clearRect(0, 0, W, H); + + const n = 8; + const pad = 40; + const xScale = (W - 2 * pad) / (2 * (n - 1)); + const maxJ2 = (n - 1) * (n - 1); + const yScale = (H - 2 * pad) / maxJ2; + + // Axes + ctx.strokeStyle = '#2a2a44'; + ctx.lineWidth = 1; + ctx.beginPath(); + ctx.moveTo(pad, H - pad); + ctx.lineTo(W - pad, H - pad); + ctx.moveTo(pad, H - pad); + ctx.lineTo(pad, pad); + ctx.stroke(); + + ctx.fillStyle = '#666680'; + ctx.font = '10px monospace'; + ctx.fillText('2j →', W - pad - 20, H - pad + 15); + ctx.fillText('−j²', pad - 5, pad - 5); + + // Parabola curve + ctx.strokeStyle = '#333355'; + ctx.lineWidth = 1.5; + ctx.beginPath(); + for (let j = 0; j < n; j++) { + const x = pad + (2 * j) * xScale; + const y = H - pad - (j * j) * yScale; + j === 0 ? ctx.moveTo(x, y) : ctx.lineTo(x, y); + } + ctx.stroke(); + + // Points + for (let j = 0; j < n; j++) { + const x = pad + (2 * j) * xScale; + const y = H - pad - (j * j) * yScale; + ctx.beginPath(); + ctx.arc(x, y, j === queryIdx ? 8 : 5, 0, Math.PI * 2); + ctx.fillStyle = j === queryIdx ? 
'#ffd54f' : '#4fc3f7'; + ctx.fill(); + ctx.fillStyle = '#999'; + ctx.font = '10px monospace'; + ctx.fillText(`j=${j}`, x - 8, y + 18); + } + + // Query direction arrow + const qx = pad + (2 * queryIdx) * xScale; + const qy = H - pad - (queryIdx * queryIdx) * yScale; + ctx.strokeStyle = '#ff7043'; + ctx.lineWidth = 2; + ctx.setLineDash([4, 3]); + ctx.beginPath(); + ctx.moveTo(pad + W / 4, H - pad); + ctx.lineTo(qx, qy); + ctx.stroke(); + ctx.setLineDash([]); + + ctx.fillStyle = '#ff7043'; + ctx.font = 'bold 11px sans-serif'; + ctx.fillText(`q=(${queryIdx},1)`, pad + W / 4 - 15, H - pad + 15); +} + +function drawScores(queryIdx) { + const canvas = document.getElementById('scoresCanvas'); + const ctx = canvas.getContext('2d'); + const W = canvas.width, H = canvas.height; + ctx.clearRect(0, 0, W, H); + + const n = 8; + const pad = 40; + const barW = (W - 2 * pad) / n - 4; + + const scores = []; + for (let j = 0; j < n; j++) { + scores.push(2 * queryIdx * j - j * j); + } + const maxS = Math.max(...scores); + const minS = Math.min(...scores); + const range = maxS - minS || 1; + + // Axes + ctx.strokeStyle = '#2a2a44'; + ctx.lineWidth = 1; + ctx.beginPath(); + ctx.moveTo(pad, H - pad); + ctx.lineTo(W - pad, H - pad); + ctx.stroke(); + + ctx.fillStyle = '#666680'; + ctx.font = '10px monospace'; + ctx.fillText('j →', W - pad - 15, H - pad + 15); + ctx.fillText('score', pad - 5, pad + 10); + + // Bars + for (let j = 0; j < n; j++) { + const x = pad + j * ((W - 2 * pad) / n) + 2; + const h = ((scores[j] - minS) / range) * (H - 2 * pad - 20); + const y = H - pad - h; + ctx.fillStyle = j === queryIdx ? '#ffd54f' : '#4fc3f7'; + ctx.fillRect(x, y, barW, h); + ctx.fillStyle = '#999'; + ctx.font = '9px monospace'; + ctx.fillText(j.toString(), x + barW / 2 - 3, H - pad + 12); + ctx.fillStyle = j === queryIdx ? 
'#ffd54f' : '#aaa'; + ctx.font = '9px monospace'; + ctx.fillText(scores[j].toString(), x + barW / 2 - 6, y - 4); + } + + document.getElementById('parabolaInsight').textContent = + `Index ${queryIdx} gets the highest score (${maxS}). The penalty −(i−j)² ensures only the exact match wins.`; +} + +document.getElementById('queryIdxSlider').addEventListener('input', e => { + const v = +e.target.value; + document.getElementById('queryIdxVal').textContent = v; + drawParabola(v); + drawScores(v); +}); + +// Defer canvas drawing until tab is visible +const tab3Observer = new MutationObserver(() => { + if (document.getElementById('tab3').classList.contains('active')) { + drawParabola(3); + drawScores(3); + tab3Observer.disconnect(); + } +}); +tab3Observer.observe(document.getElementById('tab3'), { attributes: true, attributeFilter: ['class'] }); + +// ══════════════════════════════════════════ +// TAB 4: Write-Read Trace Demo +// ══════════════════════════════════════════ +const wrSteps = [ + { + type: 'write', + instr: 'i32.const 3', + desc: 'Push 3 onto stack', + token: { label: 'token 0', instr: 'const 3', k: 'k=[2, −1]', v: 'v=3', addr: 1, val: 3 }, + action: '
WRITE: i32.const 3
The model emits a new trace token. WK maps it to key [2, −1] (stack depth 1 on the parabola). WV extracts value 3. The token now sits in the sequence — that\'s the write. No memory chip needed.', + }, + { + type: 'write', + instr: 'i32.const 5', + desc: 'Push 5 onto stack', + token: { label: 'token 1', instr: 'const 5', k: 'k=[4, −4]', v: 'v=5', addr: 2, val: 5 }, + action: '
WRITE: i32.const 5
Another trace token emitted. Key [4, −4] (stack depth 2). Value 5. Now two tokens sit in the sequence = two stack entries.', + }, + { + type: 'read', + instr: 'i32.add (read operands)', + desc: 'Read top two stack values', + readTargets: [1, 0], // indices into tokens array + action: '
READ: i32.add needs operands
The add instruction needs the top two stack values. The stack head produces query q=[2, 1] → attention scans all past tokens\' keys → finds token 1 (score = 2×2×2 − 4 = 4, highest) → retrieves value 5. Then query q=[1, 1] → finds token 0 → retrieves 3. No memory was accessed — just attention over past tokens.', + }, + { + type: 'write', + instr: 'i32.add (result)', + desc: 'Push result 8', + token: { label: 'token 2', instr: 'add → 8', k: 'k=[2, −1]', v: 'v=8', addr: 1, val: 8 }, + shadowIdx: 0, // token 0 gets overshadowed (same addr) + action: '
WRITE: push result 8
The FFN computed 3 + 5 = 8. A new token is emitted with key [2, −1] (stack depth 1 — the stack shrank by 1). Value 8.

Notice: token 0 also had key [2, −1] (depth 1, value 3). But token 2 is later in the sequence, so the parabola trick gives it a higher score. The old value 3 is overshadowed, not erased.', + }, + { + type: 'read', + instr: 'output', + desc: 'Read top of stack', + readTargets: [2], + action: '
READ: output top of stack
Query q=[1, 1] for stack depth 1. Both token 0 and token 2 have key [2, −1], but token 2 is later → higher score → attention returns 8 (not the old 3). Output: 8. ✓', + }, +]; + +let wrStep = 0; +let wrTokens = []; +let wrShadowed = new Set(); + +function wrReset() { + wrStep = 0; + wrTokens = []; + wrShadowed = new Set(); + renderWR(); +} + +function wrStepExec() { + if (wrStep >= wrSteps.length) return; + const step = wrSteps[wrStep]; + if (step.type === 'write') { + wrTokens.push({ ...step.token, isNew: true }); + if (step.shadowIdx !== undefined) wrShadowed.add(step.shadowIdx); + } + wrStep++; + renderWR(); + // Clear "new" flag after animation + setTimeout(() => { + wrTokens.forEach(t => t.isNew = false); + // Don't re-render, just let CSS animation finish + }, 700); +} + +function renderWR() { + const traceEl = document.getElementById('wrTrace'); + traceEl.innerHTML = ''; + + const step = wrStep > 0 ? wrSteps[wrStep - 1] : null; + const isRead = step && step.type === 'read'; + const readSet = isRead ? new Set(step.readTargets) : new Set(); + + if (wrTokens.length === 0) { + traceEl.innerHTML = '
No tokens yet. The sequence is empty — no memory exists.
Click Step to emit the first trace token.
'; + } + + wrTokens.forEach((tok, i) => { + const div = document.createElement('div'); + let cls = 'wr-token'; + if (tok.isNew) cls += ' new-token'; + if (isRead && readSet.has(i)) cls += ' found'; + if (wrShadowed.has(i)) cls += ' shadowed'; + div.className = cls; + + let inner = `
${tok.label}
`; + inner += `
${tok.instr}
`; + inner += `
${tok.k}
${tok.v}
`; + if (isRead && readSet.has(i)) { + inner += `
↑ READ
`; + } + if (wrShadowed.has(i) && !readSet.has(i)) { + inner += `
overshadowed
`; + } + div.innerHTML = inner; + traceEl.appendChild(div); + }); + + const actionEl = document.getElementById('wrAction'); + if (step) { + actionEl.innerHTML = step.action; + } else { + actionEl.innerHTML = 'Click Step to begin execution. Watch how each token becomes a memory cell.'; + } +} + +document.getElementById('wrStep').addEventListener('click', wrStepExec); +document.getElementById('wrReset').addEventListener('click', wrReset); +renderWR(); + +// ══════════════════════════════════════════ +// TAB 4: Stack Machine Step-Through +// ══════════════════════════════════════════ +const smProgram = [ + { op: 'i32.const', arg: 3, desc: 'Push 3' }, + { op: 'i32.const', arg: 5, desc: 'Push 5' }, + { op: 'i32.add', arg: null, desc: 'Pop two, push sum' }, + { op: 'i32.const', arg: 10, desc: 'Push 10' }, + { op: 'i32.sub', arg: null, desc: 'Pop two, subtract' }, + { op: 'output', arg: null, desc: 'Output top of stack' }, + { op: 'halt', arg: null, desc: 'Stop' }, +]; + +let smState = { ip: 0, stack: [], output: null, done: false, headLog: [] }; + +function smReset() { + smState = { ip: 0, stack: [], output: null, done: false, headLog: [] }; + renderSM(); +} + +function smStepExec() { + if (smState.done) return; + const instr = smProgram[smState.ip]; + const heads = []; + + // IP head: cumulative sum + heads.push({ name: 'IP Head', action: `sum of deltas → IP = ${smState.ip}`, detail: `query: uniform avg × t = ${smState.ip}` }); + + if (instr.op === 'i32.const') { + smState.stack.push(instr.arg); + heads.push({ name: 'Stack Head', action: `WRITE ${instr.arg} at depth ${smState.stack.length}`, detail: `key=(${2 * smState.stack.length}, -${smState.stack.length ** 2}) val=${instr.arg}` }); + heads.push({ name: 'FFN (ALU)', action: 'passthrough (no arithmetic)', detail: 'gate=1, val=input' }); + } else if (instr.op === 'i32.add') { + const b = smState.stack.pop(), a = smState.stack.pop(); + const r = a + b; + heads.push({ name: 'Stack Head ×2', action: `READ depth 
${smState.stack.length + 2} → ${b}, depth ${smState.stack.length + 1} → ${a}`, detail: `q=(${smState.stack.length + 2},1) → ${b}; q=(${smState.stack.length + 1},1) → ${a}` }); + heads.push({ name: 'FFN (ALU)', action: `${a} + ${b} = ${r}`, detail: `ReLU gate selects ADD path` }); + smState.stack.push(r); + } else if (instr.op === 'i32.sub') { + const b = smState.stack.pop(), a = smState.stack.pop(); + const r = a - b; + heads.push({ name: 'Stack Head ×2', action: `READ depth ${smState.stack.length + 2} → ${b}, depth ${smState.stack.length + 1} → ${a}`, detail: `q=(${smState.stack.length + 2},1) → ${b}; q=(${smState.stack.length + 1},1) → ${a}` }); + heads.push({ name: 'FFN (ALU)', action: `${a} - ${b} = ${r}`, detail: `ReLU gate selects SUB path` }); + smState.stack.push(r); + } else if (instr.op === 'output') { + const top = smState.stack[smState.stack.length - 1]; + smState.output = top; + heads.push({ name: 'Stack Head', action: `READ top (depth ${smState.stack.length}) → ${top}`, detail: `q=(${smState.stack.length},1) → ${top}` }); + } else if (instr.op === 'halt') { + smState.done = true; + heads.push({ name: 'Control Head', action: 'HALT detected', detail: 'opcode matches halt pattern' }); + } + + smState.headLog = heads; + smState.ip++; + renderSM(); +} + +function renderSM() { + // Program listing + const progEl = document.getElementById('smProgram'); + progEl.innerHTML = '

Program

'; + smProgram.forEach((instr, i) => { + const line = document.createElement('div'); + line.className = 'instr-line' + (i === smState.ip - 1 && !smState.done ? ' current' : '') + (i < smState.ip - 1 ? ' done' : '') + (i === smState.ip - 1 && smState.done ? ' current' : ''); + line.textContent = `${i}: ${instr.op}${instr.arg !== null ? ' ' + instr.arg : ''}`; + progEl.appendChild(line); + }); + + // State + const stateEl = document.getElementById('smState'); + stateEl.innerHTML = '

VM State

'; + const rows = [ + ['IP', smState.ip >= smProgram.length ? 'HALT' : smState.ip], + ['Stack depth', smState.stack.length], + ['Output', smState.output !== null ? smState.output : '—'], + ]; + rows.forEach(([l, v]) => { + const row = document.createElement('div'); + row.className = 'state-row'; + row.innerHTML = `${l}${v}`; + stateEl.appendChild(row); + }); + + const stackLabel = document.createElement('div'); + stackLabel.style.cssText = 'font-size:0.75rem;color:var(--dim);margin-top:8px;margin-bottom:4px'; + stackLabel.textContent = 'Stack (top ↑):'; + stateEl.appendChild(stackLabel); + + [...smState.stack].reverse().forEach((v, i) => { + const item = document.createElement('div'); + item.className = 'stack-item' + (i === 0 ? ' top' : ''); + item.textContent = v; + stateEl.appendChild(item); + }); + + // Heads + const headsEl = document.getElementById('smHeads'); + headsEl.innerHTML = '

Attention Heads Active

'; + if (smState.headLog.length === 0) { + headsEl.innerHTML += '
Click Step to begin
'; + } + smState.headLog.forEach(h => { + const div = document.createElement('div'); + div.className = 'head-info'; + div.innerHTML = `
${h.name}
${h.action}
${h.detail}
`; + headsEl.appendChild(div); + }); +} + +document.getElementById('smStep').addEventListener('click', smStepExec); +document.getElementById('smReset').addEventListener('click', smReset); +renderSM(); + +// IP demo (mini) +(function () { + const el = document.querySelector('#ipDemo .ip-trace'); + const deltas = [1, 1, 1, 1, -2, 1, 1]; + let sum = 0; + deltas.forEach(d => { + sum += d; + const cell = document.createElement('div'); + cell.className = 'ip-cell'; + cell.innerHTML = `
${d > 0 ? '+' : ''}${d}
IP=${sum}
`; + el.appendChild(cell); + }); +})(); + +// Stack demo (mini) +(function () { + const el = document.querySelector('#stackDemo .stack-vis'); + [3, 5, 8].forEach(v => { + const item = document.createElement('div'); + item.className = 'sv-item'; + item.textContent = v; + el.appendChild(item); + }); +})(); + +// ══════════════════════════════════════════ +// TAB 5: Full Execution Trace +// ══════════════════════════════════════════ +const feProgram = [ + { op: 'i32.const', bytes: '03 00 00 00', desc: 'Push 3 onto stack' }, + { op: 'i32.const', bytes: '05 00 00 00', desc: 'Push 5 onto stack' }, + { op: 'i32.add', bytes: '00 00 00 00', desc: 'Pop 3 and 5, push 8' }, + { op: 'output', bytes: '00 00 00 00', desc: 'Output top of stack' }, +]; + +const feTraceSteps = [ + { tok: '03 00 00 00', meta: 'commit(+1,sts=1,bt=0)', detail: 'IP Head reads instruction 0 → i32.const. Stack Head writes 3 at depth 1. Stack: [3]' }, + { tok: '05 00 00 00', meta: 'commit(+1,sts=2,bt=0)', detail: 'IP Head reads instruction 1 → i32.const. Stack Head writes 5 at depth 2. Stack: [3, 5]' }, + { tok: '08 00 00 00', meta: 'commit(-1,sts=1,bt=0)', detail: 'IP Head reads instruction 2 → i32.add. Stack Head reads depth 2 → 5, depth 1 → 3. FFN computes 3+5=8. Writes 8 at depth 1. Stack: [8]' }, + { tok: 'out(08)', meta: '', detail: 'IP Head reads instruction 3 → output. Stack Head reads top → 8. Output token emitted.' }, + { tok: 'halt', meta: '', detail: 'Program complete. All computation happened inside the transformer\'s forward pass.' }, +]; + +let feStep = 0; + +function feReset() { + feStep = 0; + renderFE(); +} + +function feStepExec() { + if (feStep < feTraceSteps.length) feStep++; + renderFE(); +} + +function feRunAll() { + feStep = feTraceSteps.length; + renderFE(); +} + +function renderFE() { + // Program + const progEl = document.getElementById('feProgram'); + progEl.innerHTML = '

WASM Program

'; + feProgram.forEach((instr, i) => { + const line = document.createElement('div'); + line.className = 'instr-line' + (i < feStep ? ' done' : '') + (i === feStep - 1 ? ' current' : ''); + line.textContent = `${instr.op} ${instr.bytes}`; + progEl.appendChild(line); + }); + + // Trace + const traceEl = document.getElementById('feTrace'); + traceEl.innerHTML = '

Execution Trace (tokens)

'; + for (let i = 0; i < feStep; i++) { + const t = feTraceSteps[i]; + const div = document.createElement('div'); + div.className = 'trace-token' + (i === feStep - 1 ? ' new' : ''); + div.innerHTML = `${t.tok}${t.meta}`; + traceEl.appendChild(div); + } + if (feStep === 0) { + traceEl.innerHTML += '
Click Step to generate trace tokens...
'; + } + + // Detail + const detailEl = document.getElementById('feDetail'); + detailEl.innerHTML = '

What Happened (weight level)

'; + if (feStep > 0) { + const d = feTraceSteps[feStep - 1]; + const div = document.createElement('div'); + div.style.cssText = 'font-size:0.82rem;line-height:1.6;color:var(--text)'; + div.textContent = d.detail; + detailEl.appendChild(div); + + // Visual: which heads fired + const heads = []; + if (feStep <= 3) heads.push({ name: 'IP Head', color: 'var(--accent)' }); + if (feStep <= 4) heads.push({ name: 'Stack Head', color: 'var(--green)' }); + if (feStep === 3) heads.push({ name: 'FFN (ALU)', color: 'var(--gold)' }); + if (feStep === 5) heads.push({ name: 'Control', color: 'var(--warn)' }); + + const hDiv = document.createElement('div'); + hDiv.style.cssText = 'margin-top:12px;display:flex;gap:6px;flex-wrap:wrap'; + heads.forEach(h => { + const chip = document.createElement('span'); + chip.style.cssText = `font-size:0.75rem;padding:3px 10px;border-radius:12px;border:1px solid ${h.color};color:${h.color}`; + chip.textContent = h.name; + hDiv.appendChild(chip); + }); + detailEl.appendChild(hDiv); + } else { + detailEl.innerHTML += '
Each step shows which attention heads fire and what they compute.
'; + } +} + +document.getElementById('feStep').addEventListener('click', feStepExec); +document.getElementById('feReset').addEventListener('click', feReset); +document.getElementById('feRunAll').addEventListener('click', feRunAll); +renderFE(); diff --git a/interactive/index.html b/interactive/index.html new file mode 100644 index 0000000..64f46a6 --- /dev/null +++ b/interactive/index.html @@ -0,0 +1,515 @@ + + + + + +How Transformer Weights Become a Computer + + + +
+
+

How Transformer Weights Become a Computer

+

An interactive exploration of how matrix multiplications can execute deterministic programs

+
+ + + + +
+

The Puzzle: How Can Weights Be Deterministic?

+
+

A transformer is just matrix multiplications and softmax. How can that execute a program exactly?

+

The secret: attention is a lookup table. If you set up the keys and queries just right, the softmax becomes so peaked that it acts like an exact array[index] read.

+
+
+

Try it: Softmax Temperature

+

Below are 5 scores. One is the "correct" lookup target. Watch what happens as you increase the temperature multiplier — the softmax sharpens until it's essentially a hard lookup.

+
+ + +
+
+

At low temperature, attention is spread out — fuzzy, not useful for exact computation.

+
+
+

Key insight: When the score gap is large enough, softmax gives >99.99% weight to one entry. The "weighted average" becomes an exact read. This is how weights produce deterministic behavior — not by magic, but by engineering the scores to be extremely peaked.

+
+ +
+ + +
+

Attention as a Lookup Table

+ +
+

Start here: What problem are we solving?

+

A computer needs to read from memory. You say "give me the value at address 3" and you get back whatever is stored there. Simple.

+

But a transformer has no memory. It only has a growing list of past tokens and one operation: attention — which computes a weighted average over all past tokens. How do you turn a weighted average into an exact memory read?

+
+ +
+

First: How tokens become vectors

+

Before attention can happen, every token goes through the same pipeline. Here it is, step by step:

+ +
+
+
Plain text
+
"cake"
+
+

tokenizer
+
+
Token ID
+
4821
+
+

embedding table
+
+
Token embedding (x)
+
[0.3, 0.8, −0.1, 0.5, ...]
+
one vector per token, d dimensions
+
+
+ +

Then this embedding gets multiplied by three separate weight matrices to produce Q, K, and V:

+ +
+
+
x
+
×
+
WQ
+
=
+
q
+
← "what address do I need?"
+
+
+
x
+
×
+
WK
+
=
+
k
+
← "what address am I at?"
+
+
+
x
+
×
+
WV
+
=
+
v
+
← "what data do I hold?"
+
+
+ +
+

How Q, K, V work together (this is the key!)

+

A common misconception: the value v is not computed from q · k. The dot product q · k only produces a score — a single number. Here's the actual flow:

+ +
+
+
Each past token (independently)
+
+
kj = xj × WK
+
its address label
+
+
+
vj = xj × WV
+
its stored data
+
+
+ +
+
Current token
+
+
q = xnow × WQ
+
the address I want
+
+
+ +
+
Step 1: Score every past token
+
+
scorej = q · kj
+
"how well does this key match my query?"
→ just a number, not a vector
+
+
+ +
+
Step 2: Pick the winner
+
+
weightj = softmax(scores)
+
highest score → ~100% weight
all others → ~0%
+
+
+ +
+
Step 3: Retrieve the value
+
+
output = Σ weightj × vj
+
≈ just vwinner (because its weight is ~100%)
+
+
+
+ +

Think of it like a library: k is the label on the spine of each book, q is the title you're searching for, and v is the content inside the book. You use q and k to find the right book, then you read v from it. The content was always there — the lookup just selects which book to open.

+
+ +
+ So where is the "memory" physically?
+ There's no separate memory chip. The past tokens in the sequence are the memory. Each past token carries a key (its address) and a value (its data), both computed from its residual stream via WK and WV. When the model "writes 42 to address 3," it emits a trace token whose key encodes address 3 and whose value encodes 42. When a later step "reads address 3," attention finds that token and retrieves its value. +
+ +
+ Important nuance about x: The input to each layer isn't the raw token embedding — it's the residual stream: the original embedding plus all outputs from previous layers added on top. So by the time a token reaches layer 3, its x already contains results computed by layers 1 and 2 (like the current instruction pointer, or a value just loaded from the stack).

+ This means WV doesn't just extract the original token — it extracts whatever information previous layers have written into the residual stream. For example, if layer 2 computed "the value at stack position 3 is 42", then layer 3's WV can extract that 42 and make it available as this token's value. +
+ +
+ The mechanism is identical in both traditional LLMs and this paper. The only difference is what the weight matrices contain:
+ ● Traditional: WQ, WK, WV are learned from data via gradient descent → produce fuzzy semantic vectors
+ ● This paper: WQ, WK, WV are set by construction (compiled) → produce exact address vectors like [i, 1] and [2j, −j²] +
+
+ +
+

The three players: Key, Query, Value

+

Now that you know how they're produced, here's what each one means:

+ + + + + + + + + + + + + +
Key (k)The token's address label. It answers: "Where am I?"
+ Think of it like the number on a mailbox. Token at memory address 3 gets a key that encodes "I'm address 3."
Query (q)The current token's request. It answers: "What address do I need?"
+ If the current instruction needs to read address 3, the query encodes "I want address 3."
Value (v)The token's stored data. It answers: "What's in me?"
+ This is the actual content — the number 42, or a stack entry, or whatever was written there.
+
+ +
+

How the lookup works

+

Attention computes the dot product between the query and every key. The dot product is just: multiply corresponding entries and add up. A high dot product means "these two vectors point in a similar direction" — i.e. this key matches what the query is asking for.

+

Then softmax turns those scores into weights that sum to 1. If one score is much higher than the rest, that key gets nearly 100% of the weight — and the output is just that key's value. That's an exact read.

+
+ 1. Compute scores:  score_j = q · k_j  ← dot product of query with each key
+ 2. Normalize:  weight_j = softmax(scores)  ← highest score gets ~100%
+ 3. Read:  output = Σ weight_j × value_j  ← weighted average ≈ exact value +
+
+ +
+

But what are the key vectors, concretely?

+

In this paper, each key is a 2-number column vector. The weight matrix W_K is engineered so that a token at address j gets mapped to:

+
+ kj = [2j, −j²] +
+

Why these specific numbers? Because they sit on a parabola, and that shape has a magical property: when you dot-product a query q = [i, 1] with every key, the math simplifies to −(i − j)² + i². That's an upside-down parabola centered at j = i — meaning the key at the exact target address always wins. (Tab 3 visualizes this in detail.)

+

The key vector isn't something the model "learns" in the usual sense — the weight matrix W_K is set by construction to produce these parabola coordinates. It's more like compiling an address decoder into the weights.

+
+ +
+

Interactive: See It In Action

+

Below is a simulated memory with 6 slots. Click a slot to change which address you're reading. Watch how the query vector changes, and how the dot products with each key vector determine which value gets read.

+
+

Query Vector

+
+

Key Vectors & Dot Products

+
+
+
+
+

Side by Side: Traditional LLM Attention vs Memory Lookup

+

Both use the same mechanism (dot product → softmax → weighted average). The difference is what the keys and queries represent and how sharp the result is.

+
+
+ + +
+
+
+ +
+
Traditional LLM Attention
+
Keys = learned semantic embeddings (high-dimensional, shown as 4D slice)
+
+
+
+
+ Weights are spread across multiple tokens — attention is a soft blend. Good for language understanding ("what words are related?") but useless for exact computation. +
+
+ +
+
Memory Lookup (this paper)
+
Keys = engineered 2D addresses on a parabola
+
+
+
+
+ Weight is 100% on one token — attention is a hard lookup. This is what you need to read stack[3] or mem[addr] exactly. +
+
+
+

Same math, different weights. Traditional LLMs learn W_K and W_Q from data → fuzzy semantic similarity. This paper engineers W_K and W_Q by construction → exact address lookup. The architecture is identical.

+
+ +
+

But where does the query vector come from?

+

In both cases, the query is produced the same way: multiply the current token's embedding by the WQ weight matrix. The difference is what WQ contains.

+ +
+ +
+
Traditional LLM
+
+
+
Current token embedding
+
x = [0.3, 0.8, −0.1, ...]
+
~4096 dimensions
+
+
× WQ
+
+
WQ weight matrix
+
Learned from data
+
Encodes "what's semantically relevant to me?"
+
+
=
+
+
Query vector
+
q = [0.7, 0.8, 0.0, ...]
+
Points toward semantically similar keys
+
+
+

WQ is trained via gradient descent on billions of tokens. It learns to produce queries that find contextually relevant tokens — "delicious" attends to "cake" because they co-occur. The query is a fuzzy semantic direction.

+
+ + +
+
Memory Lookup (this paper)
+
+
+
Current state (from prior layers)
+
x includes the target address i
+
Prior attention heads computed which address to read
+
+
× WQ
+
+
WQ weight matrix
+
Engineered by construction
+
Extracts address i and formats it as (i, 1)
+
+
=
+
+
Query vector
+
q = [i, 1]
+
Points exactly at key ki on the parabola
+
+
+

WQ is not learned — it's set so that it extracts the target address from the current token's state and formats it as [i, 1]. The address i itself comes from earlier layers: e.g. the instruction pointer head computes the current IP, and the stack head uses the current stack depth. Each head's WQ is wired to extract the specific piece of state it needs.

+
+
+ +
+ The chain of computation across layers:
+ Layer 1: A head computes the instruction pointer (cumulative sum of deltas) → "we're at instruction 5"
+ Layer 2: A head uses the IP to look up instruction 5 from the program → "it's i32.add"
+ Layer 3: A head uses the stack depth to read the top two stack values → operands
+ Layer 4: The feed-forward network does the arithmetic → result
+ Layer 5: A head writes the result back to the stack at the new depth
+
+ Each layer's WQ is wired to extract exactly the right piece of state from the previous layer's output. The query vector is never a vague "what's relevant?" — it's always a precise "give me the value at this specific address." +
+
+ +
+

Each attention head in the transformer does exactly this — but the "indices" and "values" are determined by the weight matrices W_Q, W_K, W_V. The weights are set so that:

+
    +
  • W_K maps each token to a 2D "address" on a parabola
  • +
  • W_Q maps the current step to a "query direction" — extracting the target address from the current state
  • +
  • W_V extracts the stored value
  • +
+

Multiple heads = multiple independent arrays. That's enough to build registers, a stack, and memory.

+
+ +
+ + +
+

The 2D Parabola Trick: Exact Index Lookup

+
+

Here's the mathematical trick that makes exact lookup work with just 2D keys.

+

Store index j as the 2D key: kj = (2j, −j²)

+

To look up index i, query with direction: q = (i, 1)

+

The dot product becomes: q · kj = 2ij − j² = −(i − j)² + i²

+

Since i² is constant for a given query, the argmax is at j = i — exact match!

+
+
+

Interactive: See the Parabola

+

The keys sit on a parabola. Drag the query slider to change which index you're looking up. The dot product scores form an inverted parabola peaked at the target.

+
+ + +
+
+
+

Keys on Parabola (2D space)

+ +
+
+

Dot Product Scores (q · kj)

+ +
+
+

Index 3 gets the highest score. The penalty −(i−j)² ensures only the exact match wins.

+
+
+

Why this matters: The weight matrix W_K is set so it maps token positions to points on this parabola. W_Q produces the query direction. The result: attention performs an exact array read — no scanning, no approximation. And because the keys are 2D, you can use a convex hull to find the max in O(log n) time instead of checking every key.

+
+ +
+ + +
+

Building a Stack Machine from Attention Heads

+
+

A WebAssembly VM needs: an instruction pointer, an operand stack, and memory. Here's how attention heads provide each:

+
+
+
+

📍 Instruction Pointer

+

Mechanism: Cumulative sum via attention

+

Each trace token emits a delta (+1 for next instruction, or a jump offset). One head averages all deltas uniformly, then multiplies by the token count to recover the running sum = current IP.

+
+
+
+
+
+

📚 Operand Stack

+

Mechanism: Index lookup via 2D keys

+

Stack depth is tracked as a cumulative sum of push/pop deltas. To read the top of stack, the head queries for the current depth using the parabola trick — exact lookup of the most recent value at that depth.

+
+
+
+
+
+

💾 Memory

+

Mechanism: Index lookup via 2D keys

+

Memory addresses work the same way as stack indices. To read mem[addr], the head queries with direction (addr, 1). The most recent write to that address has the highest score because it has the largest position component.

+
+
+
+

But how does "writing" work? There's no memory!

+

This is the crucial insight: writing = emitting a new token. There is no separate memory. The growing sequence of trace tokens is the memory.

+ +
+

Click "Step" to watch the token sequence grow. Each token carries a key (address) and value (data). Reading is just attention looking back at past tokens.

+
+
+
+ + +
+
+ +
+ Summary:
+ Write = emit a token. Its WK gives it an address, its WV gives it data. It just sits there in the sequence.
+ Read = a later token's WQ produces a query. Attention scans all past tokens' keys, finds the match, returns that token's value.
+ Overwrite = emit another token with the same address. Because it's later in the sequence, the parabola trick gives it a higher score than the old one (the position component breaks ties).

+ There is no "write back" step. The sequence only grows. Old values are never erased — they're just overshadowed by newer tokens at the same address. +
+
+ +
+

Interactive: Step Through a Stack Machine

+

Click "Step" to execute WASM instructions. Watch how each head reads/writes.

+
+
+
+
+
+
+ + +
+
+ +
+ + +
+

Putting It All Together: 3 + 5 Inside a Transformer

+
+

Now let's see the complete picture. The transformer receives a WASM program as tokens. Then it generates an execution trace — each token is produced by the attention heads doing lookups into the growing trace.

+
+
+

Full Execution Trace: 3 + 5

+
+
+
+
+
+
+ + + +
+
+
+

Why This Is Different From Tool Use

+
+
+

🔧 Tool Use

+

LLM writes print(3+5)

+

Pauses, sends to external Python

+

Gets back "8" — a black box

+

Execution happened outside

+
+
+

⚡ In-Model Execution

+

LLM emits WASM tokens

+

Each next token = one VM step

+

Attention heads ARE the CPU

+

Every step visible in the trace

+
+
+
+
+

The Weight-Level Summary

+
+
+ WK maps positions → parabola points
+ Encodes "what address am I?" +
+
+ WQ maps current state → query direction
+ Encodes "what address do I need?" +
+
+ WV extracts stored values
+ Encodes "what data is here?" +
+
+ Wff (feed-forward) does ALU ops
+ Addition, comparison, branching logic +
+
+

The weights aren't learned from data in the usual sense — they're engineered (or compiled) so that the transformer's forward pass literally executes a WASM interpreter. The architecture is a standard PyTorch transformer. Only the weight values are special.

+
+
+
+ + + + diff --git a/interactive/style.css b/interactive/style.css new file mode 100644 index 0000000..41d3d3f --- /dev/null +++ b/interactive/style.css @@ -0,0 +1,1237 @@ +* { margin: 0; padding: 0; box-sizing: border-box; } + +:root { + --bg: #0a0a14; + --surface: #12121f; + --surface2: #1a1a2e; + --border: #2a2a44; + --text: #d0d0e0; + --dim: #666680; + --accent: #4fc3f7; + --accent2: #ab47bc; + --green: #66bb6a; + --gold: #ffd54f; + --warn: #ff7043; + --radius: 10px; +} + +body { + background: var(--bg); + color: var(--text); + font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; + line-height: 1.6; + min-height: 100vh; +} + +#app { + max-width: 900px; + margin: 0 auto; + padding: 2rem 1.5rem 4rem; +} + +header { + text-align: center; + margin-bottom: 2rem; + padding-bottom: 1.5rem; + border-bottom: 1px solid var(--border); +} + +header h1 { + font-size: 1.8rem; + color: var(--accent); + margin-bottom: 0.4rem; + letter-spacing: -0.02em; +} + +.subtitle { + color: var(--dim); + font-size: 0.95rem; +} + +/* Tabs */ +#tabs { + display: flex; + gap: 4px; + margin-bottom: 2rem; + overflow-x: auto; + padding-bottom: 4px; +} + +.tab { + background: var(--surface); + border: 1px solid var(--border); + color: var(--dim); + padding: 0.5rem 1rem; + border-radius: var(--radius); + cursor: pointer; + font-size: 0.82rem; + white-space: nowrap; + transition: all 0.2s; +} + +.tab:hover { color: var(--text); border-color: var(--accent); } +.tab.active { + background: var(--accent); + color: var(--bg); + border-color: var(--accent); + font-weight: 600; +} + +/* Panels */ +.panel { display: none; } +.panel.active { display: block; } + +.panel h2 { + font-size: 1.35rem; + color: var(--accent); + margin-bottom: 1rem; +} + +/* Explainer blocks */ +.explainer { + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 1.2rem 1.4rem; + margin-bottom: 1.2rem; + font-size: 0.92rem; +} + +.explainer p { 
margin-bottom: 0.6rem; } +.explainer p:last-child { margin-bottom: 0; } +.explainer ul { margin: 0.5rem 0 0.5rem 1.5rem; } +.explainer li { margin-bottom: 0.3rem; } +.explainer code { + background: var(--surface2); + padding: 0.15em 0.4em; + border-radius: 4px; + font-size: 0.88em; + color: var(--gold); +} +.explainer strong { color: var(--accent); } +.explainer em { color: var(--gold); font-style: normal; } + +/* Demo boxes */ +.demo-box { + background: var(--surface2); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 1.4rem; + margin-bottom: 1.2rem; +} + +.demo-box h3 { + font-size: 1rem; + color: var(--gold); + margin-bottom: 0.8rem; +} + +.demo-box h4 { + font-size: 0.85rem; + color: var(--dim); + margin-bottom: 0.5rem; + text-align: center; +} + +.demo-box p { font-size: 0.88rem; margin-bottom: 0.6rem; } + +.slider-row { + display: flex; + align-items: center; + gap: 1rem; + margin-bottom: 1rem; +} + +.slider-row label { + font-size: 0.85rem; + color: var(--dim); + white-space: nowrap; +} + +.slider-row label strong { color: var(--accent); } + +input[type="range"] { + flex: 1; + accent-color: var(--accent); + height: 6px; +} + +.insight { + font-size: 0.82rem !important; + color: var(--green) !important; + font-style: italic; + margin-top: 0.8rem !important; + padding: 0.6rem 0.8rem; + background: rgba(102, 187, 106, 0.08); + border-radius: 6px; + border-left: 3px solid var(--green); +} + +/* Softmax bars */ +#softmaxBars { + display: flex; + align-items: flex-end; + gap: 12px; + height: 180px; + padding: 0 1rem; +} + +.bar-col { + flex: 1; + display: flex; + flex-direction: column; + align-items: center; + gap: 4px; +} + +.bar-wrapper { + width: 100%; + height: 140px; + display: flex; + align-items: flex-end; + justify-content: center; +} + +.bar { + width: 80%; + background: var(--accent); + border-radius: 4px 4px 0 0; + transition: height 0.3s ease, background 0.3s ease; + min-height: 2px; +} + +.bar.winner { background: 
var(--gold); } + +.bar-label { + font-size: 0.7rem; + color: var(--dim); +} + +.bar-value { + font-size: 0.72rem; + color: var(--text); + font-family: monospace; +} + +/* Memory slots (tab 2) */ +#memorySlots { + display: flex; + gap: 8px; + margin-bottom: 1rem; + flex-wrap: wrap; +} + +.mem-slot { + background: var(--surface); + border: 2px solid var(--border); + border-radius: 8px; + padding: 0.6rem 0.8rem; + cursor: pointer; + text-align: center; + min-width: 80px; + transition: all 0.2s; +} + +.mem-slot:hover { border-color: var(--accent); } +.mem-slot.active { border-color: var(--gold); background: rgba(255, 213, 79, 0.1); } + +.mem-slot .idx { font-size: 0.7rem; color: var(--dim); } +.mem-slot .val { font-size: 1.1rem; color: var(--text); font-family: monospace; font-weight: 600; } + +#dotProducts { + margin: 1rem 0; + font-family: monospace; + font-size: 0.82rem; + line-height: 1.8; +} + +.dp-row { + display: flex; + align-items: center; + gap: 8px; + padding: 2px 0; +} + +.dp-row .dp-bar { + height: 14px; + background: var(--accent); + border-radius: 3px; + transition: width 0.3s, background 0.3s; +} + +.dp-row .dp-bar.winner { background: var(--gold); } +.dp-row .dp-label { width: 50px; color: var(--dim); } +.dp-row .dp-score { width: 60px; text-align: right; } +.dp-row .dp-weight { width: 70px; text-align: right; color: var(--green); } + +#readResult { + margin-top: 1rem; + padding: 0.8rem 1rem; + background: rgba(255, 213, 79, 0.1); + border-radius: 8px; + border-left: 3px solid var(--gold); + font-family: monospace; + font-size: 0.9rem; +} + +/* Side-by-side comparison */ +.sbs-container { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 12px; + margin: 1rem 0; +} + +.sbs-panel { + background: var(--surface); + border: 1px solid var(--border); + border-radius: 10px; + padding: 1rem; + overflow-x: auto; +} + +.sbs-header { + font-size: 0.9rem; + font-weight: 700; + margin-bottom: 4px; + padding-bottom: 6px; + border-bottom: 2px solid 
var(--border); +} + +.sbs-header.trad { color: var(--accent2); border-color: var(--accent2); } +.sbs-header.lookup { color: var(--gold); border-color: var(--gold); } + +.sbs-subtitle { + font-size: 0.72rem; + color: var(--dim); + margin-bottom: 0.8rem; +} + +.sbs-vec-row { + display: flex; + align-items: center; + gap: 8px; + margin-bottom: 0.8rem; +} + +.sbs-keys { + display: flex; + flex-wrap: wrap; + gap: 4px; + margin-bottom: 0.8rem; +} + +.sbs-key-group { + display: flex; + flex-direction: column; + align-items: center; + gap: 2px; + padding: 6px 4px; + border-radius: 6px; + min-width: 58px; + transition: background 0.2s; +} + +.sbs-key-group.winner { background: rgba(255, 213, 79, 0.1); } +.sbs-key-group.trad-winner { background: rgba(171, 71, 188, 0.1); } + +.sbs-key-group .sbs-word { + font-size: 0.7rem; + color: var(--dim); + font-style: italic; +} + +.sbs-key-group .sbs-score { + font-family: monospace; + font-size: 0.75rem; + color: var(--text); +} + +.sbs-key-group .sbs-weight { + font-family: monospace; + font-size: 0.78rem; + font-weight: 600; +} + +.sbs-key-group.winner .sbs-weight { color: var(--gold); } +.sbs-key-group.trad-winner .sbs-weight { color: var(--accent2); } + +.sbs-weight-bar { + height: 4px; + border-radius: 2px; + transition: width 0.3s; + margin-top: 1px; +} + +.sbs-key-group .sbs-weight-bar { background: var(--accent); } +.sbs-key-group.winner .sbs-weight-bar { background: var(--gold); } +.sbs-key-group.trad-winner .sbs-weight-bar { background: var(--accent2); } + +.sbs-result { + font-size: 0.82rem; + padding: 0.5rem 0.7rem; + background: var(--surface2); + border-radius: 6px; + margin-bottom: 0.6rem; + font-family: monospace; + line-height: 1.6; +} + +.sbs-note { + font-size: 0.78rem; + color: var(--dim); + line-height: 1.5; +} + +.sbs-note strong { color: var(--text); } + +.sbs-controls { + margin-bottom: 0.5rem; +} + +/* Small column vec variant for SBS */ +.col-vec.sbs-vec .cell { + font-size: 0.78rem; + padding: 2px 8px; + 
min-width: 36px; +} + +.col-vec.sbs-vec .vec-label { + font-size: 0.65rem; +} + +@media (max-width: 768px) { + .sbs-container { grid-template-columns: 1fr; } +} + +/* Write-read trace demo */ +.write-read-demo { + margin-top: 0.6rem; +} + +.wr-trace { + display: flex; + gap: 6px; + flex-wrap: wrap; + margin-bottom: 0.8rem; + min-height: 90px; + align-items: flex-start; +} + +.wr-token { + display: flex; + flex-direction: column; + align-items: center; + gap: 2px; + padding: 6px 8px; + border-radius: 8px; + border: 1px solid var(--border); + background: var(--surface); + min-width: 70px; + transition: all 0.3s; + position: relative; +} + +.wr-token.new-token { + animation: wrFlash 0.6s ease; +} + +.wr-token.reading { + border-color: var(--warn); + box-shadow: 0 0 8px rgba(255, 112, 67, 0.3); +} + +.wr-token.found { + border-color: var(--gold); + background: rgba(255, 213, 79, 0.1); + box-shadow: 0 0 8px rgba(255, 213, 79, 0.3); +} + +.wr-token.shadowed { + opacity: 0.35; +} + +@keyframes wrFlash { + 0% { background: rgba(102, 187, 106, 0.25); } + 100% { background: var(--surface); } +} + +.wr-token .wr-tok-label { + font-size: 0.65rem; + color: var(--dim); + text-transform: uppercase; + letter-spacing: 0.03em; +} + +.wr-token .wr-tok-instr { + font-family: monospace; + font-size: 0.75rem; + color: var(--text); + font-weight: 600; +} + +.wr-token .wr-tok-kv { + font-family: monospace; + font-size: 0.65rem; + line-height: 1.4; +} + +.wr-tok-kv .wr-k { color: var(--accent); } +.wr-tok-kv .wr-v { color: var(--green); } + +.wr-read-arrow { + position: absolute; + top: -18px; + left: 50%; + transform: translateX(-50%); + font-size: 0.65rem; + color: var(--warn); + font-weight: 600; + white-space: nowrap; +} + +.wr-action { + padding: 0.6rem 0.8rem; + background: var(--surface2); + border-radius: 8px; + font-size: 0.82rem; + line-height: 1.6; + min-height: 50px; + color: var(--text); +} + +.wr-action .wr-step-title { + font-weight: 600; + color: var(--gold); + 
margin-bottom: 2px; +} + +/* Attention flow diagram */ +.attn-flow-diagram { + display: flex; + flex-direction: column; + gap: 8px; + margin: 0.8rem 0; +} + +.attn-flow-section { + display: flex; + flex-direction: column; + gap: 4px; +} + +.attn-flow-label { + font-size: 0.7rem; + color: var(--dim); + text-transform: uppercase; + letter-spacing: 0.04em; + font-weight: 600; + margin-top: 4px; +} + +.attn-flow-row { + display: flex; + align-items: center; + gap: 10px; + padding-left: 12px; +} + +.attn-flow-box { + font-family: monospace; + font-size: 0.85rem; + padding: 0.3rem 0.7rem; + border: 1px solid var(--border); + border-radius: 6px; + background: var(--surface); + color: var(--text); + white-space: nowrap; +} + +.attn-flow-box.computed { + border-color: var(--dim); + border-style: dashed; + color: var(--dim); +} + +.attn-flow-desc { + font-size: 0.75rem; + color: var(--dim); + line-height: 1.4; +} + +.attn-flow-desc strong { color: var(--gold); } + +/* Token pipeline diagram */ +.pipeline-diagram { + display: flex; + align-items: center; + gap: 0; + margin: 0.8rem 0; + flex-wrap: wrap; + justify-content: center; +} + +.pipe-step { + display: flex; + flex-direction: column; + align-items: center; + gap: 3px; +} + +.pipe-label { + font-size: 0.7rem; + color: var(--dim); + text-transform: uppercase; + letter-spacing: 0.03em; +} + +.pipe-box { + background: var(--surface2); + border: 1px solid var(--border); + border-radius: 6px; + padding: 0.4rem 0.8rem; + font-family: monospace; + font-size: 0.9rem; + color: var(--text); +} + +.pipe-box.pipe-text { color: var(--gold); font-style: italic; } +.pipe-box.pipe-id { color: var(--accent); } +.pipe-box.pipe-emb { font-size: 0.8rem; color: var(--green); } + +.pipe-dim { + font-size: 0.65rem; + color: var(--dim); + font-style: italic; +} + +.pipe-arrow { + font-size: 1.1rem; + color: var(--dim); + padding: 0 0.6rem; + text-align: center; + line-height: 1.2; +} + +.pipe-arrow-label { + font-size: 0.65rem; + color: 
var(--dim); +} + +/* QKV matrix multiply diagram */ +.qkv-matmul-diagram { + display: flex; + flex-direction: column; + gap: 6px; + margin: 0.6rem 0; +} + +.qkv-row { + display: flex; + align-items: center; + gap: 8px; + padding: 4px 0; +} + +.qkv-input { + font-family: monospace; + font-size: 0.95rem; + font-weight: 600; + color: var(--text); + min-width: 20px; + text-align: center; +} + +.qkv-times { + font-size: 0.9rem; + color: var(--dim); +} + +.qkv-matrix { + font-family: monospace; + font-size: 0.9rem; + font-weight: 600; + padding: 0.25rem 0.6rem; + border: 2px solid; + border-radius: 6px; + background: var(--surface2); + min-width: 40px; + text-align: center; +} + +.qkv-eq { + font-size: 0.9rem; + color: var(--dim); +} + +.qkv-output { + font-family: monospace; + font-size: 1rem; + font-weight: 700; + min-width: 20px; + text-align: center; +} + +.qkv-meaning { + font-size: 0.78rem; + color: var(--dim); + margin-left: 4px; +} + +/* Query origin visualization */ +.query-origin-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 12px; + margin-top: 0.8rem; +} + +.qo-panel { + background: var(--surface2); + border: 1px solid var(--border); + border-radius: 10px; + padding: 1rem; +} + +.qo-header { + font-weight: 700; + font-size: 0.9rem; + margin-bottom: 0.7rem; + padding-bottom: 5px; + border-bottom: 1px solid var(--border); +} + +.qo-pipeline { + display: flex; + flex-direction: column; + gap: 2px; + align-items: center; + margin-bottom: 0.7rem; +} + +.qo-step { + width: 100%; + text-align: center; +} + +.qo-label { + font-size: 0.7rem; + color: var(--dim); + margin-bottom: 2px; + text-transform: uppercase; + letter-spacing: 0.03em; +} + +.qo-box { + background: var(--surface); + border: 1px solid var(--border); + border-radius: 6px; + padding: 0.4rem 0.6rem; + font-size: 0.82rem; + color: var(--text); +} + +.qo-box code { + color: var(--gold); + font-size: 0.8rem; +} + +.qo-box.learned { + border-color: var(--accent2); + color: var(--accent2); 
+ font-weight: 600; + font-size: 0.85rem; +} + +.qo-box.engineered { + border-color: var(--gold); + color: var(--gold); + font-weight: 600; + font-size: 0.85rem; +} + +.qo-dim { + font-size: 0.68rem; + color: var(--dim); + margin-top: 2px; + font-style: italic; +} + +.qo-arrow { + font-size: 0.85rem; + color: var(--dim); + padding: 2px 0; + font-family: monospace; +} + +.qo-note { + font-size: 0.78rem; + color: var(--dim); + line-height: 1.5; + margin-top: 0.4rem; +} + +.qo-note strong { color: var(--text); } +.qo-note em { color: var(--gold); font-style: normal; } + +@media (max-width: 768px) { + .query-origin-grid { grid-template-columns: 1fr; } +} + +/* KQV explanation table */ +.kqv-table { + width: 100%; + border-collapse: separate; + border-spacing: 0 6px; + margin-top: 0.5rem; +} + +.kqv-table td { + padding: 0.6rem 0.8rem; + background: var(--surface2); + font-size: 0.85rem; + vertical-align: top; +} + +.kqv-table tr td:first-child { + border-radius: 6px 0 0 6px; +} + +.kqv-table tr td:last-child { + border-radius: 0 6px 6px 0; +} + +.kqv-name { + font-weight: 700; + font-size: 0.9rem; + white-space: nowrap; + width: 70px; +} + +/* Column vector displays (tab 2) */ +#queryVec { + display: inline-flex; + align-items: center; + gap: 8px; + margin-bottom: 0.5rem; +} + +.col-vec { + display: inline-flex; + flex-direction: column; + align-items: center; + position: relative; +} + +.col-vec .bracket { + position: absolute; + top: 0; bottom: 0; + width: 6px; + border: 2px solid var(--accent); + border-right: none; + border-radius: 4px 0 0 4px; + left: -10px; +} + +.col-vec .bracket-r { + position: absolute; + top: 0; bottom: 0; + width: 6px; + border: 2px solid var(--accent); + border-left: none; + border-radius: 0 4px 4px 0; + right: -10px; +} + +.col-vec.query .bracket, +.col-vec.query .bracket-r { border-color: var(--warn); } + +.col-vec.winner .bracket, +.col-vec.winner .bracket-r { border-color: var(--gold); } + +.col-vec .cell { + font-family: monospace; + 
font-size: 0.95rem; + padding: 4px 14px; + text-align: right; + min-width: 48px; + color: var(--text); +} + +.col-vec.query .cell { color: var(--warn); font-weight: 600; } +.col-vec.winner .cell { color: var(--gold); font-weight: 600; } + +.vec-label { + font-size: 0.72rem; + color: var(--dim); + text-align: center; + margin-top: 4px; +} + +.col-vec.winner .vec-label { color: var(--gold); } + +#vecColumns { + display: flex; + flex-wrap: wrap; + gap: 6px; + align-items: flex-start; + margin-bottom: 1rem; +} + +.vec-group { + display: flex; + flex-direction: column; + align-items: center; + gap: 2px; + padding: 8px 6px; + border-radius: 8px; + transition: background 0.2s; + min-width: 72px; +} + +.vec-group.winner { background: rgba(255, 213, 79, 0.08); } + +.dot-product-line { + display: flex; + align-items: center; + gap: 6px; + font-family: monospace; + font-size: 0.78rem; + margin-top: 4px; + color: var(--dim); +} + +.dot-product-line .dp-val { + font-weight: 600; + color: var(--accent); +} + +.vec-group.winner .dot-product-line .dp-val { color: var(--gold); } + +.dot-product-line .dp-weight { + font-size: 0.72rem; + color: var(--green); +} + +.dp-bar-mini { + height: 4px; + border-radius: 2px; + background: var(--accent); + transition: width 0.3s; + margin-top: 2px; +} + +.vec-group.winner .dp-bar-mini { background: var(--gold); } + +.dot-computation { + font-family: monospace; + font-size: 0.75rem; + color: var(--dim); + margin-top: 2px; + text-align: center; + line-height: 1.5; +} + +/* Two column layout */ +.two-col { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 1rem; +} + +canvas { + background: var(--surface); + border-radius: 8px; + border: 1px solid var(--border); + width: 100%; + height: auto; +} + +/* Mechanism grid (tab 4) */ +.mechanism-grid { + display: grid; + grid-template-columns: 1fr 1fr 1fr; + gap: 1rem; + margin-bottom: 1.5rem; +} + +.mechanism { + background: var(--surface); + border: 1px solid var(--border); + border-radius: 
 var(--radius); /* completes the border-radius declaration opened above */
+  padding: 1rem;
+}
+
+.mechanism h3 {
+  font-size: 0.95rem;
+  color: var(--gold);
+  margin-bottom: 0.5rem;
+}
+
+.mechanism p {
+  font-size: 0.8rem;
+  margin-bottom: 0.4rem;
+}
+
+.mechanism strong { color: var(--accent); font-size: 0.78rem; }
+
+/* Stack machine vis */
+#stackMachineVis {
+  display: grid;
+  grid-template-columns: 1fr 1fr 1fr; /* program | state | attention heads */
+  gap: 1rem;
+  margin-bottom: 1rem;
+}
+
+#smProgram, #smState, #smHeads {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: 8px;
+  padding: 0.8rem;
+  min-height: 200px;
+}
+
+#smProgram h4, #smState h4, #smHeads h4 {
+  font-size: 0.78rem;
+  color: var(--dim);
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  margin-bottom: 0.6rem;
+}
+
+.instr-line {
+  font-family: monospace;
+  font-size: 0.8rem;
+  padding: 2px 6px;
+  border-radius: 4px;
+  margin-bottom: 2px;
+}
+
+.instr-line.current { /* the instruction being executed this step */
+  background: rgba(79, 195, 247, 0.15);
+  color: var(--accent);
+  font-weight: 600;
+}
+
+.instr-line.done { color: var(--dim); } /* already-executed instructions fade */
+
+.state-row {
+  display: flex;
+  justify-content: space-between;
+  font-size: 0.82rem;
+  padding: 3px 0;
+  border-bottom: 1px solid var(--border);
+}
+
+.state-row .label { color: var(--dim); }
+.state-row .value { font-family: monospace; color: var(--gold); font-weight: 600; }
+
+.stack-item {
+  font-family: monospace;
+  font-size: 0.85rem;
+  padding: 3px 8px;
+  background: rgba(79, 195, 247, 0.1);
+  border-radius: 4px;
+  margin-bottom: 3px;
+  text-align: center;
+}
+
+.stack-item.top { background: rgba(255, 213, 79, 0.15); color: var(--gold); } /* top-of-stack highlight */
+
+.head-info {
+  font-size: 0.75rem;
+  margin-bottom: 0.5rem;
+  padding: 0.4rem 0.5rem;
+  background: var(--surface2);
+  border-radius: 6px;
+}
+
+.head-info .head-name { color: var(--accent2); font-weight: 600; }
+.head-info .head-action { color: var(--text); }
+.head-info .head-detail { color: var(--dim); font-family: monospace; font-size: 0.72rem; }
+
+/* Buttons */
+.btn-row {
+  display: flex;
+  gap: 8px;
+  margin-top: 0.5rem;
+}
+
+.action-btn {
+  background: var(--accent);
+  color: var(--bg);
+  border: none;
+  padding: 0.5rem 1.2rem;
+  border-radius: 6px;
+  font-weight: 600;
+  cursor: pointer;
+  font-size: 0.85rem;
+  transition: all 0.2s;
+}
+
+.action-btn:hover { filter: brightness(1.15); }
+.action-btn.secondary {
+  background: var(--surface);
+  color: var(--text);
+  border: 1px solid var(--border);
+}
+
+.next-btn { /* "continue to next tab" call-to-action */
+  display: block;
+  margin: 1.5rem auto 0;
+  background: none;
+  border: 1px solid var(--accent);
+  color: var(--accent);
+  padding: 0.6rem 1.5rem;
+  border-radius: 8px;
+  cursor: pointer;
+  font-size: 0.9rem;
+  transition: all 0.2s;
+}
+
+.next-btn:hover { background: var(--accent); color: var(--bg); } /* invert colors on hover */
+
+/* Full execution (tab 5) */
+#fullExec {
+  display: grid;
+  grid-template-columns: 200px 1fr 1fr; /* fixed-width program listing, flexible trace/detail */
+  gap: 1rem;
+  margin-bottom: 1rem;
+}
+
+#feProgram, #feTrace, #feDetail {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: 8px;
+  padding: 0.8rem;
+  min-height: 260px;
+}
+
+#feProgram h4, #feTrace h4, #feDetail h4 {
+  font-size: 0.78rem;
+  color: var(--dim);
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  margin-bottom: 0.6rem;
+}
+
+.trace-token {
+  font-family: monospace;
+  font-size: 0.8rem;
+  padding: 3px 8px;
+  margin-bottom: 2px;
+  border-radius: 4px;
+  display: flex;
+  justify-content: space-between;
+}
+
+.trace-token.new { /* just-appended token: flash bright then settle (see flashIn) */
+  background: rgba(255, 213, 79, 0.12);
+  animation: flashIn 0.5s ease;
+}
+
+@keyframes flashIn {
+  0% { background: rgba(255, 213, 79, 0.4); }
+  100% { background: rgba(255, 213, 79, 0.12); }
+}
+
+.trace-token .tok { color: var(--text); }
+.trace-token .meta { color: var(--dim); font-size: 0.72rem; }
+
+/* Comparison */
+.comparison {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: 1rem;
+  margin-top: 0.8rem;
+}
+
+.comp-col {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: 8px;
+  padding: 1rem;
+}
+
+.comp-col.highlight {
 border-color: var(--accent);
+  background: rgba(79, 195, 247, 0.05); /* subtle accent wash for the highlighted column */
+}
+
+.comp-col h4 {
+  font-size: 0.95rem;
+  margin-bottom: 0.6rem;
+}
+
+.comp-col p {
+  font-size: 0.82rem;
+  margin-bottom: 0.3rem;
+}
+
+/* Summary grid */
+.summary-grid {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: 0.8rem;
+  margin: 1rem 0;
+}
+
+.summary-item {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: 8px;
+  padding: 0.8rem 1rem;
+  font-size: 0.85rem;
+}
+
+.summary-item strong { color: var(--gold); }
+.summary-item .dim { color: var(--dim); font-size: 0.78rem; }
+
+/* IP demo */
+.ip-trace {
+  display: flex;
+  gap: 4px;
+  flex-wrap: wrap; /* instruction-pointer cells wrap to new rows as the trace grows */
+}
+
+.ip-cell {
+  background: var(--surface2);
+  border-radius: 4px;
+  padding: 2px 6px;
+  font-family: monospace;
+  font-size: 0.7rem;
+  text-align: center;
+  min-width: 36px;
+}
+
+.ip-cell .delta { color: var(--green); } /* per-step IP increment */
+.ip-cell .sum { color: var(--gold); font-weight: 600; } /* running IP total */
+
+/* Stack vis */
+.stack-vis {
+  display: flex;
+  flex-direction: column-reverse; /* newest item renders on top, like a real stack */
+  gap: 3px;
+  max-height: 100px;
+}
+
+.sv-item {
+  background: rgba(79, 195, 247, 0.12);
+  border-radius: 4px;
+  padding: 2px 8px;
+  font-family: monospace;
+  font-size: 0.75rem;
+  text-align: center;
+}
+
+/* Responsive */
+@media (max-width: 768px) {
+  .mechanism-grid { grid-template-columns: 1fr; }
+  #stackMachineVis { grid-template-columns: 1fr; }
+  #fullExec { grid-template-columns: 1fr; }
+  .two-col { grid-template-columns: 1fr; }
+  .comparison { grid-template-columns: 1fr; }
+  .summary-grid { grid-template-columns: 1fr; }
+  .vec-vis-container { grid-template-columns: 1fr; }
+  #tabs { flex-wrap: wrap; }
+}
diff --git a/manim_project/scene.py b/manim_project/scene.py
new file mode 100644
index 0000000..0eea670
--- /dev/null
+++ b/manim_project/scene.py
@@ -0,0 +1,523 @@
+from manim import *
+import numpy as np
+
+# Color palette (mirrors the CSS variables used by the interactive app)
+BG = "#0f0f1a"
+ACCENT = "#4fc3f7"
+ACCENT2 = "#ab47bc"
+ACCENT3 = "#66bb6a"
+WARN = "#ff7043"
+TEXT_COL =
"#e0e0e0" +DIM = "#666677" +GOLD = "#ffd54f" + + +class Scene1_Intro(Scene): + """The big question: Can LLMs be computers?""" + + def construct(self): + self.camera.background_color = BG + + title = Text("Can LLMs Be Computers?", font_size=56, color=ACCENT, weight=BOLD) + sub = Text( + "Executing programs inside transformers", + font_size=28, color=DIM, + ).next_to(title, DOWN, buff=0.4) + self.play(Write(title, run_time=1.5)) + self.play(FadeIn(sub, shift=UP * 0.2)) + self.wait(1.5) + + # Fade out title + self.play(FadeOut(title), FadeOut(sub)) + + # --- What LLMs are good at vs bad at --- + good_title = Text("LLMs are great at…", font_size=32, color=ACCENT3) + good_title.to_edge(UP, buff=0.6) + good_items = VGroup( + Text("✓ Solving hard math (IMO gold-medal level)", font_size=24, color=TEXT_COL), + Text("✓ Writing code & reasoning about algorithms", font_size=24, color=TEXT_COL), + Text("✓ Understanding natural language", font_size=24, color=TEXT_COL), + ).arrange(DOWN, aligned_edge=LEFT, buff=0.25).next_to(good_title, DOWN, buff=0.4) + + self.play(Write(good_title)) + for item in good_items: + self.play(FadeIn(item, shift=RIGHT * 0.3), run_time=0.5) + self.wait(1) + + bad_title = Text("…but terrible at simple computation", font_size=32, color=WARN) + bad_title.next_to(good_items, DOWN, buff=0.6) + bad_items = VGroup( + Text("✗ Multiplying two numbers reliably", font_size=24, color=WARN), + Text("✗ Solving even easy Sudoku puzzles", font_size=24, color=WARN), + Text("✗ Any task needing many exact steps", font_size=24, color=WARN), + ).arrange(DOWN, aligned_edge=LEFT, buff=0.25).next_to(bad_title, DOWN, buff=0.4) + + self.play(Write(bad_title)) + for item in bad_items: + self.play(FadeIn(item, shift=RIGHT * 0.3), run_time=0.5) + self.wait(2) + self.play(*[FadeOut(m) for m in self.mobjects]) + + +class Scene2_ToolUse(Scene): + """How LLMs currently handle computation: tool use (the airplane analogy).""" + + def construct(self): + self.camera.background_color = BG 
+
+        # --- Airplane analogy ---
+        analogy_title = Text("The Airplane Analogy", font_size=36, color=ACCENT, weight=BOLD)
+        analogy_title.to_edge(UP, buff=0.5)
+        self.play(Write(analogy_title))
+
+        # Two boxes: Human delegates to Airplane
+        human_label = Text("Human", font_size=28, color=TEXT_COL)
+        human_box = RoundedRectangle(
+            corner_radius=0.15, width=2.2, height=1.2, color=ACCENT
+        )
+        human_grp = VGroup(human_box, human_label).move_to(LEFT * 3)
+
+        plane_label = Text("Airplane", font_size=28, color=TEXT_COL)
+        plane_box = RoundedRectangle(
+            corner_radius=0.15, width=2.2, height=1.2, color=ACCENT3
+        )
+        plane_grp = VGroup(plane_box, plane_label).move_to(RIGHT * 3)
+
+        arrow = Arrow(human_box.get_right(), plane_box.get_left(), color=DIM, buff=0.2)
+        arrow_label = Text("delegates flying", font_size=20, color=DIM).next_to(arrow, UP, buff=0.15)
+
+        cant = Text("Humans can't fly.", font_size=24, color=WARN).next_to(
+            VGroup(human_grp, plane_grp), DOWN, buff=0.8
+        )
+        but = Text("Building airplanes doesn't change that.", font_size=24, color=WARN).next_to(
+            cant, DOWN, buff=0.25
+        )
+
+        self.play(FadeIn(human_grp), FadeIn(plane_grp))
+        self.play(GrowArrow(arrow), FadeIn(arrow_label))
+        self.wait(0.5)
+        self.play(Write(cant))
+        self.play(Write(but))
+        self.wait(2)
+        self.play(*[FadeOut(m) for m in self.mobjects if m != analogy_title])  # keep the header up
+        self.play(FadeOut(analogy_title))
+
+        # --- Tool use diagram ---
+        tool_title = Text("Today: LLMs Use External Tools", font_size=34, color=ACCENT, weight=BOLD)
+        tool_title.to_edge(UP, buff=0.5)
+        self.play(Write(tool_title))
+
+        llm_box = RoundedRectangle(corner_radius=0.2, width=2.5, height=1.4, color=ACCENT)
+        llm_label = Text("LLM", font_size=30, color=ACCENT, weight=BOLD)
+        llm_grp = VGroup(llm_box, llm_label).move_to(LEFT * 3.5)
+
+        interp_box = RoundedRectangle(corner_radius=0.2, width=3, height=1.4, color=ACCENT3)
+        interp_label = Text("Interpreter", font_size=26, color=ACCENT3)
+        interp_grp = VGroup(interp_box, interp_label).move_to(RIGHT * 3)
+
+        # Round-trip arrows: code out, result back
+        a1 = Arrow(llm_box.get_right(), interp_box.get_left(), color=GOLD, buff=0.2).shift(UP * 0.2)
+        a1_label = Text("sends code", font_size=18, color=GOLD).next_to(a1, UP, buff=0.1)
+        a2 = Arrow(interp_box.get_left(), llm_box.get_right(), color=ACCENT3, buff=0.2).shift(DOWN * 0.2)
+        a2_label = Text("returns result", font_size=18, color=ACCENT3).next_to(a2, DOWN, buff=0.1)
+
+        note = Text(
+            "The LLM describes computation\nbut never executes it.",
+            font_size=22, color=WARN, line_spacing=1.3,
+        ).next_to(VGroup(llm_grp, interp_grp), DOWN, buff=0.9)
+
+        self.play(FadeIn(llm_grp), FadeIn(interp_grp))
+        self.play(GrowArrow(a1), FadeIn(a1_label))
+        self.play(GrowArrow(a2), FadeIn(a2_label))
+        self.wait(0.5)
+        self.play(Write(note))
+        self.wait(2.5)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # clear before next scene
+
+
+class Scene3_KeyIdea(Scene):
+    """The breakthrough: building a computer INSIDE the transformer."""
+
+    def construct(self):
+        self.camera.background_color = BG
+
+        title = Text("The Breakthrough", font_size=40, color=GOLD, weight=BOLD)
+        title.to_edge(UP, buff=0.5)
+        self.play(Write(title))
+
+        idea = Text(
+            "Build a computer INSIDE the transformer.",
+            font_size=30, color=TEXT_COL,
+        ).next_to(title, DOWN, buff=0.6)
+        self.play(Write(idea))
+        self.wait(1)
+
+        # Big transformer box
+        tf_box = RoundedRectangle(
+            corner_radius=0.25, width=9, height=4.5, color=ACCENT, stroke_width=3
+        ).shift(DOWN * 0.5)
+        tf_label = Text("Transformer", font_size=22, color=ACCENT).next_to(tf_box, UP, buff=0.15)
+
+        # CPU inside
+        cpu_box = RoundedRectangle(
+            corner_radius=0.15, width=3, height=2, color=GOLD, stroke_width=2
+        ).move_to(tf_box.get_center() + LEFT * 2.2)
+        cpu_label = Text("Virtual CPU\n(WebAssembly)", font_size=20, color=GOLD, line_spacing=1.2).move_to(cpu_box)
+
+        # Memory inside
+        mem_box = RoundedRectangle(
+            corner_radius=0.15, width=2.5, height=2, color=ACCENT3, stroke_width=2
+        ).move_to(tf_box.get_center() + RIGHT * 2)
+        mem_label = Text("Memory\n& Stack", font_size=20,
 color=ACCENT3, line_spacing=1.2).move_to(mem_box)
+
+        conn = Arrow(cpu_box.get_right(), mem_box.get_left(), color=DIM, buff=0.15)
+
+        self.play(Create(tf_box), Write(tf_label))
+        self.play(Create(cpu_box), Write(cpu_label))
+        self.play(Create(mem_box), Write(mem_label))
+        self.play(GrowArrow(conn))
+
+        result = Text(
+            "Arbitrary C programs → tokens → executed by the model itself",
+            font_size=22, color=ACCENT,
+        ).next_to(tf_box, DOWN, buff=0.5)
+        self.play(Write(result))
+        self.wait(2.5)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # clear before next scene
+
+
+class Scene4_ToolVsInModel(Scene):
+    """Side-by-side: tool use vs in-model execution for 3+5."""
+
+    def construct(self):
+        self.camera.background_color = BG
+
+        title = Text("Tool Use vs In-Model Execution", font_size=34, color=ACCENT, weight=BOLD)
+        title.to_edge(UP, buff=0.4)
+        self.play(Write(title))
+
+        divider = DashedLine(UP * 2.5, DOWN * 2.5, color=DIM, dash_length=0.15).shift(DOWN * 0.3)
+        self.play(Create(divider))
+
+        # LEFT: tool use
+        left_title = Text("Tool Use", font_size=26, color=WARN, weight=BOLD).move_to(LEFT * 3.5 + UP * 2)
+        self.play(Write(left_title))
+
+        # (label, optional code snippet, label color, code color)
+        left_steps = [
+            ("LLM:", 'print(3+5)', ACCENT, GOLD),
+            ("→ send to interpreter", "", DIM, DIM),
+            ("← result: 8", "", ACCENT3, ACCENT3),
+        ]
+        left_grp = VGroup()
+        y = 1.0
+        for label_text, code_text, lc, cc in left_steps:
+            row = VGroup()
+            lab = Text(label_text, font_size=20, color=lc).move_to(LEFT * 5 + UP * y)
+            lab.align_to(LEFT * 5.5, LEFT)
+            row.add(lab)
+            if code_text:  # only the first step carries a code snippet
+                cd = Text(code_text, font_size=18, color=cc, font="Monospace").next_to(lab, RIGHT, buff=0.2)
+                row.add(cd)
+            left_grp.add(row)
+            y -= 0.7
+
+        opaque = Text("Execution is opaque\n(black box)", font_size=18, color=WARN, line_spacing=1.2)
+        opaque.move_to(LEFT * 3.5 + DOWN * 1.5)
+
+        # RIGHT: in-model
+        right_title = Text("In-Model", font_size=26, color=ACCENT3, weight=BOLD).move_to(RIGHT * 3.5 + UP * 2)
+        self.play(Write(right_title))
+
+        right_lines = [
+            "i32.const 03",
+            "i32.const 05",
+            "i32.add → 08",
+            "output(08)",
+            "halt",
+        ]
+        right_grp = VGroup()
+        y = 1.0
+        for line in right_lines:
+            t = Text(line, font_size=18, color=ACCENT3, font="Monospace").move_to(RIGHT * 3.5 + UP * y)
+            right_grp.add(t)
+            y -= 0.55
+
+        transparent = Text("Every step visible\nin the token stream", font_size=18, color=ACCENT3, line_spacing=1.2)
+        transparent.move_to(RIGHT * 3.5 + DOWN * 2)
+
+        for row in left_grp:
+            self.play(FadeIn(row, shift=RIGHT * 0.2), run_time=0.5)
+        self.play(Write(opaque))
+        self.wait(0.5)
+
+        for t in right_grp:
+            self.play(FadeIn(t, shift=LEFT * 0.2), run_time=0.4)
+        self.play(Write(transparent))
+        self.wait(2.5)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # clear before next scene
+
+
+class Scene5_AppendOnlyTrace(Scene):
+    """Computation as an append-only trace — the notebook analogy."""
+
+    def construct(self):
+        self.camera.background_color = BG
+
+        title = Text("How It Works: The Append-Only Notebook", font_size=32, color=ACCENT, weight=BOLD)
+        title.to_edge(UP, buff=0.4)
+        self.play(Write(title))
+
+        # Notebook visual
+        nb_rect = Rectangle(width=7, height=4.5, color=DIM, stroke_width=1.5).shift(DOWN * 0.4)
+        nb_label = Text("Notebook (token stream)", font_size=18, color=DIM).next_to(nb_rect, UP, buff=0.1)
+        self.play(Create(nb_rect), Write(nb_label))
+
+        # Lines
+        prompt_words = ["the", "cat", "runs", "and", "dog", "jumps", "over"]
+        colors = [DIM, DIM, ACCENT3, DIM, DIM, ACCENT3, DIM]  # verbs highlighted green
+        prompt_label = Text("PROMPT (input words):", font_size=18, color=ACCENT).move_to(
+            nb_rect.get_top() + DOWN * 0.4
+        )
+        self.play(Write(prompt_label))
+
+        word_mobs = VGroup()
+        for i, (w, c) in enumerate(zip(prompt_words, colors)):
+            t = Text(w, font_size=22, color=c, font="Monospace")
+            t.move_to(nb_rect.get_top() + DOWN * 0.8 + RIGHT * (i - 3) * 1.0)  # center words around x=0
+            word_mobs.add(t)
+        self.play(*[FadeIn(w) for w in word_mobs], run_time=0.8)
+        self.wait(0.5)
+
+        # Trace section
+        trace_label = Text("EXECUTION TRACE (generated tokens):", font_size=18,
 color=GOLD).move_to(
+            nb_rect.get_center() + UP * 0.3
+        )
+        self.play(Write(trace_label))
+
+        # Show parity counting step by step
+        trace_vals = ["odd=0", "odd=0", "odd=1", "odd=1", "odd=1", "odd=0", "odd=0"]
+        trace_mobs = VGroup()
+        for i, val in enumerate(trace_vals):
+            t = Text(val, font_size=20, color=GOLD, font="Monospace")
+            t.move_to(nb_rect.get_center() + DOWN * 0.3 + RIGHT * (i - 3) * 1.0)  # aligned under each word
+            trace_mobs.add(t)
+
+        rule = Text(
+            "Each new token looks back at (1) the input word and (2) the previous trace token",
+            font_size=17, color=TEXT_COL,
+        ).move_to(nb_rect.get_bottom() + DOWN * 0.05)
+
+        for i, tm in enumerate(trace_mobs):
+            anims = [FadeIn(tm)]
+            # Draw arrows from word and previous trace
+            arr1 = Arrow(
+                word_mobs[i].get_bottom(), tm.get_top(),
+                color=ACCENT, stroke_width=1.5, buff=0.08, max_tip_length_to_length_ratio=0.15,
+            )
+            anims.append(GrowArrow(arr1))
+            if i > 0:  # first trace token has no predecessor to look back at
+                arr2 = Arrow(
+                    trace_mobs[i - 1].get_right(), tm.get_left(),
+                    color=GOLD, stroke_width=1.5, buff=0.08, max_tip_length_to_length_ratio=0.15,
+                )
+                anims.append(GrowArrow(arr2))
+            self.play(*anims, run_time=0.6)
+
+        self.play(Write(rule))
+
+        key_insight = Text(
+            "Key: only 2 lookbacks per step — cost doesn't grow with length!",
+            font_size=20, color=ACCENT3, weight=BOLD,
+        ).next_to(nb_rect, DOWN, buff=0.6)
+        self.play(Write(key_insight))
+        self.wait(2.5)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # clear before next scene
+
+
+class Scene6_QuadraticProblem(Scene):
+    """The quadratic cost problem of standard attention."""
+
+    def construct(self):
+        self.camera.background_color = BG
+
+        title = Text("The Problem: Attention Cost Grows", font_size=34, color=ACCENT, weight=BOLD)
+        title.to_edge(UP, buff=0.5)
+        self.play(Write(title))
+
+        # Axes
+        ax = Axes(
+            x_range=[0, 10, 2], y_range=[0, 100, 20],
+            x_length=5, y_length=3.5,
+            axis_config={"color": DIM, "include_numbers": False},
+        ).shift(DOWN * 0.5)
+        x_lab = Text("tokens generated (t)", font_size=18, color=DIM).next_to(ax.x_axis, DOWN, buff=0.3)
+        y_lab = Text("work per step", font_size=18, color=DIM).next_to(ax.y_axis, LEFT, buff=0.3).rotate(PI / 2)
+
+        self.play(Create(ax), Write(x_lab), Write(y_lab))
+
+        # Standard: linear cost per step → quadratic total
+        std_graph = ax.plot(lambda x: x * 10, x_range=[0.1, 10], color=WARN, stroke_width=3)
+        std_label = Text("Standard attention: O(t) per step", font_size=18, color=WARN)
+        std_label.next_to(ax, RIGHT, buff=0.3).shift(UP * 1)
+
+        self.play(Create(std_graph), Write(std_label), run_time=1.5)
+        self.wait(1)
+
+        # Log cost
+        log_graph = ax.plot(lambda x: np.log2(x + 1) * 8, x_range=[0.1, 10], color=ACCENT3, stroke_width=3)
+        log_label = Text("Hull attention: O(log t) per step", font_size=18, color=ACCENT3)
+        log_label.next_to(std_label, DOWN, buff=0.4)
+
+        self.play(Create(log_graph), Write(log_label), run_time=1.5)
+
+        gap_text = Text(
+            "At 1 million steps:\nstandard = 1,000,000 ops/step\nhull = ~20 ops/step",
+            font_size=18, color=GOLD, line_spacing=1.3,
+        ).next_to(ax, DOWN, buff=0.7)
+        self.play(Write(gap_text))
+        self.wait(3)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # clear before next scene
+
+
+class Scene7_2DHeads(Scene):
+    """The key unlock: 2D attention heads and convex hull queries."""
+
+    def construct(self):
+        self.camera.background_color = BG
+
+        title = Text("The Key Unlock: 2D Attention Heads", font_size=34, color=ACCENT, weight=BOLD)
+        title.to_edge(UP, buff=0.4)
+        self.play(Write(title))
+
+        # Explain head dimension
+        expl = Text(
+            "Restrict each attention head to dimension 2\n→ keys & queries become 2D points",
+            font_size=22, color=TEXT_COL, line_spacing=1.3,
+        ).next_to(title, DOWN, buff=0.5)
+        self.play(Write(expl))
+        self.wait(1)
+
+        # 2D scatter of keys
+        np.random.seed(42)  # fixed seed: identical key layout on every render
+        n_pts = 20
+        pts = np.random.randn(n_pts, 2) * 1.2
+        plane = NumberPlane(
+            x_range=[-3, 3, 1], y_range=[-3, 3, 1],
+            x_length=5, y_length=4,
+            background_line_style={"stroke_color": DIM, "stroke_width": 0.5},
+            axis_config={"stroke_color": DIM},
+        ).shift(DOWN * 0.7)
+
+        dots = VGroup()
+        for pt in pts:
+            d = Dot(plane.c2p(pt[0], pt[1]), radius=0.06, color=ACCENT)  # c2p maps data coords to scene coords
+            dots.add(d)
+
+        key_label = Text("Keys (past tokens)", font_size=16, color=ACCENT).next_to(plane, LEFT, buff=0.3)
+        self.play(Create(plane), Write(key_label))
+        self.play(*[FadeIn(d, scale=0.5) for d in dots], run_time=1)
+        self.wait(0.5)
+
+        # Convex hull
+        from scipy.spatial import ConvexHull  # noqa: E402
+
+        hull = ConvexHull(pts)
+        hull_pts = [plane.c2p(pts[i][0], pts[i][1]) for i in hull.vertices]
+        hull_pts.append(hull_pts[0])  # close the polygon loop
+        hull_poly = Polygon(*hull_pts, color=GOLD, stroke_width=2, fill_opacity=0.08, fill_color=GOLD)
+
+        hull_label = Text("Convex Hull", font_size=18, color=GOLD).next_to(plane, RIGHT, buff=0.3).shift(UP)
+        self.play(Create(hull_poly), Write(hull_label))
+        self.wait(0.5)
+
+        # Query point
+        q_pt = plane.c2p(1.5, 0.5)
+        q_dot = Dot(q_pt, radius=0.1, color=WARN)
+        q_label = Text("Query", font_size=16, color=WARN).next_to(q_dot, UR, buff=0.1)
+        self.play(FadeIn(q_dot, scale=2), Write(q_label))
+
+        insight = Text(
+            "Finding the best-matching key = a geometric query on the hull\n"
+            "→ binary search in O(log t) instead of scanning all t keys",
+            font_size=18, color=ACCENT3, line_spacing=1.3,
+        ).next_to(plane, DOWN, buff=0.5)
+        self.play(Write(insight))
+        self.wait(3)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # clear before next scene
+
+
+class Scene8_Results(Scene):
+    """Performance results and what this means."""
+
+    def construct(self):
+        self.camera.background_color = BG
+
+        title = Text("Results: What This Enables", font_size=36, color=ACCENT, weight=BOLD)
+        title.to_edge(UP, buff=0.5)
+        self.play(Write(title))
+
+        # Three headline-stat cards laid out in a row
+        results = VGroup(
+            self._result_card("30,000+ tok/s", "Execution speed on CPU", ACCENT3),
+            self._result_card("Millions of steps", "Correct execution length", GOLD),
+            self._result_card("100% accuracy", "On Sudoku benchmarks\n(incl. world's hardest)", ACCENT),
+        ).arrange(RIGHT, buff=0.6).next_to(title, DOWN, buff=0.8)
+
+        for card in results:
+            self.play(FadeIn(card, shift=UP * 0.3), run_time=0.7)
+        self.wait(0.5)
+
+        # Sudoku callout
+        sudoku_note = Text(
+            "Solves Arto Inkala's Sudoku (\"world's hardest\")\nin under 3 minutes — fully inside the transformer",
+            font_size=20, color=TEXT_COL, line_spacing=1.3,
+        ).next_to(results, DOWN, buff=0.8)
+        self.play(Write(sudoku_note))
+        self.wait(2.5)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # clear before next scene
+
+    def _result_card(self, big_text, small_text, color):
+        # Helper: boxed stat card with a big headline number and a small caption.
+        box = RoundedRectangle(corner_radius=0.15, width=3.5, height=2.5, color=color, stroke_width=2)
+        big = Text(big_text, font_size=28, color=color, weight=BOLD)
+        small = Text(small_text, font_size=16, color=TEXT_COL, line_spacing=1.2)
+        grp = VGroup(big, small).arrange(DOWN, buff=0.3)
+        return VGroup(box, grp)
+
+
+class Scene9_Recap(Scene):
+    """Final recap and takeaway."""
+
+    def construct(self):
+        self.camera.background_color = BG
+
+        title = Text("Recap", font_size=40, color=ACCENT, weight=BOLD)
+        title.to_edge(UP, buff=0.5)
+        self.play(Write(title))
+
+        # (step number, summary text, color) — one row per key point of the article
+        steps = [
+            ("1", "LLMs struggle with long, exact computation", TEXT_COL),
+            ("2", "Today we bolt on external tools (like airplanes for humans)", DIM),
+            ("3", "This team built a real computer inside a transformer", ACCENT3),
+            ("4", "C code → WebAssembly tokens → executed by the model itself", GOLD),
+            ("5", "2D attention heads enable O(log t) lookups (exponentially faster)", ACCENT),
+            ("6", "Result: millions of correct steps, 30k+ tokens/sec, 100% Sudoku", ACCENT3),
+        ]
+
+        grp = VGroup()
+        for num, text, color in steps:
+            num_mob = Text(num, font_size=24, color=ACCENT2, weight=BOLD)
+            text_mob = Text(text, font_size=21, color=color)
+            row = VGroup(num_mob, text_mob).arrange(RIGHT, buff=0.3)
+            grp.add(row)
+        grp.arrange(DOWN, aligned_edge=LEFT, buff=0.35).next_to(title, DOWN, buff=0.6)
+
+        for row in grp:
+            self.play(FadeIn(row, shift=RIGHT
 * 0.3), run_time=0.6)
+            self.wait(0.3)
+
+        self.wait(1)
+
+        # Closing takeaway line
+        takeaway = Text(
+            "The model stops being a coordinator of computation\nand becomes a computer itself.",
+            font_size=24, color=GOLD, weight=BOLD, line_spacing=1.3,
+        ).next_to(grp, DOWN, buff=0.7)
+        self.play(Write(takeaway, run_time=2))
+        self.wait(3)
+        self.play(*[FadeOut(m) for m in self.mobjects])  # end of video: fade everything out