Files
percepta-computer-as-llm/manim_project/scene.py

524 lines
21 KiB
Python
Raw Normal View History

from manim import *
import numpy as np
# Color palette: hex colour strings used by the scenes below.
BG = "#0f0f1a"  # camera background colour set at the start of each scene
ACCENT = "#4fc3f7"  # primary accent: most scene titles and main outlines
ACCENT2 = "#ab47bc"  # secondary accent: step numbers in the recap scene
ACCENT3 = "#66bb6a"  # third accent: "good" items and in-model / hull visuals
WARN = "#ff7043"  # warnings, weaknesses and the standard-attention curve
TEXT_COL = "#e0e0e0"  # default body text
DIM = "#666677"  # de-emphasised text, connector arrows, axes and grid lines
GOLD = "#ffd54f"  # highlights (virtual CPU, trace tokens, convex hull)
class Scene1_Intro(Scene):
    """Opening scene asking the big question: can LLMs be computers?

    Plays a title card, fades it out, then lists three things LLMs are
    good at followed by three simple computational tasks they get wrong.
    """

    def construct(self):
        self.camera.background_color = BG

        # --- Title card ---
        heading = Text("Can LLMs Be Computers?", font_size=56, color=ACCENT, weight=BOLD)
        tagline = Text("Executing programs inside transformers", font_size=28, color=DIM)
        tagline.next_to(heading, DOWN, buff=0.4)
        self.play(Write(heading, run_time=1.5))
        self.play(FadeIn(tagline, shift=UP * 0.2))
        self.wait(1.5)
        self.play(FadeOut(heading), FadeOut(tagline))

        # --- Strengths, anchored to the top edge ---
        strengths_header = Text("LLMs are great at…", font_size=32, color=ACCENT3)
        strengths_header.to_edge(UP, buff=0.6)
        strengths = VGroup(
            *(
                Text(line, font_size=24, color=TEXT_COL)
                for line in (
                    "✓ Solving hard math (IMO gold-medal level)",
                    "✓ Writing code & reasoning about algorithms",
                    "✓ Understanding natural language",
                )
            )
        )
        strengths.arrange(DOWN, aligned_edge=LEFT, buff=0.25)
        strengths.next_to(strengths_header, DOWN, buff=0.4)
        self.play(Write(strengths_header))
        for entry in strengths:
            self.play(FadeIn(entry, shift=RIGHT * 0.3), run_time=0.5)
        self.wait(1)

        # --- Weaknesses, stacked underneath the strengths list ---
        weaknesses_header = Text("…but terrible at simple computation", font_size=32, color=WARN)
        weaknesses_header.next_to(strengths, DOWN, buff=0.6)
        weaknesses = VGroup(
            *(
                Text(line, font_size=24, color=WARN)
                for line in (
                    "✗ Multiplying two numbers reliably",
                    "✗ Solving even easy Sudoku puzzles",
                    "✗ Any task needing many exact steps",
                )
            )
        )
        weaknesses.arrange(DOWN, aligned_edge=LEFT, buff=0.25)
        weaknesses.next_to(weaknesses_header, DOWN, buff=0.4)
        self.play(Write(weaknesses_header))
        for entry in weaknesses:
            self.play(FadeIn(entry, shift=RIGHT * 0.3), run_time=0.5)
        self.wait(2)

        # Fade out everything that is still on screen.
        self.play(*map(FadeOut, self.mobjects))
class Scene2_ToolUse(Scene):
    """How LLMs currently handle computation: tool use (the airplane analogy).

    Part one draws a "Human" box delegating flying to an "Airplane" box.
    Part two draws an LLM box sending code to an "Interpreter" box and
    receiving the result back, with a note that the LLM never executes
    the computation itself.
    """

    def construct(self):
        self.camera.background_color = BG

        # --- Airplane analogy ---
        analogy_title = Text("The Airplane Analogy", font_size=36, color=ACCENT, weight=BOLD)
        analogy_title.to_edge(UP, buff=0.5)
        self.play(Write(analogy_title))

        human_label = Text("Human", font_size=28, color=TEXT_COL)
        human_box = RoundedRectangle(
            corner_radius=0.15, width=2.2, height=1.2, color=ACCENT
        )
        human_grp = VGroup(human_box, human_label).move_to(LEFT * 3)

        plane_label = Text("Airplane", font_size=28, color=TEXT_COL)
        plane_box = RoundedRectangle(
            corner_radius=0.15, width=2.2, height=1.2, color=ACCENT3
        )
        plane_grp = VGroup(plane_box, plane_label).move_to(RIGHT * 3)

        # The arrow is built after both groups have been moved, so its
        # endpoints use the boxes' final positions.
        arrow = Arrow(human_box.get_right(), plane_box.get_left(), color=DIM, buff=0.2)
        arrow_label = Text("delegates flying", font_size=20, color=DIM).next_to(arrow, UP, buff=0.15)

        cant = Text("Humans can't fly.", font_size=24, color=WARN).next_to(
            VGroup(human_grp, plane_grp), DOWN, buff=0.8
        )
        but = Text("Building airplanes doesn't change that.", font_size=24, color=WARN).next_to(
            cant, DOWN, buff=0.25
        )

        self.play(FadeIn(human_grp), FadeIn(plane_grp))
        self.play(GrowArrow(arrow), FadeIn(arrow_label))
        self.wait(0.5)
        self.play(Write(cant))
        self.play(Write(but))
        self.wait(2)

        # Clear the analogy first and the heading in a separate step.
        # `is not` is used because the filter must skip this exact mobject;
        # an equality operator would depend on how the class defines `!=`.
        self.play(*[FadeOut(m) for m in self.mobjects if m is not analogy_title])
        self.play(FadeOut(analogy_title))

        # --- Tool use diagram ---
        tool_title = Text("Today: LLMs Use External Tools", font_size=34, color=ACCENT, weight=BOLD)
        tool_title.to_edge(UP, buff=0.5)
        self.play(Write(tool_title))

        llm_box = RoundedRectangle(corner_radius=0.2, width=2.5, height=1.4, color=ACCENT)
        llm_label = Text("LLM", font_size=30, color=ACCENT, weight=BOLD)
        llm_grp = VGroup(llm_box, llm_label).move_to(LEFT * 3.5)

        interp_box = RoundedRectangle(corner_radius=0.2, width=3, height=1.4, color=ACCENT3)
        interp_label = Text("Interpreter", font_size=26, color=ACCENT3)
        interp_grp = VGroup(interp_box, interp_label).move_to(RIGHT * 3)

        # Request and response arrows are nudged up/down so they do not overlap.
        a1 = Arrow(llm_box.get_right(), interp_box.get_left(), color=GOLD, buff=0.2).shift(UP * 0.2)
        a1_label = Text("sends code", font_size=18, color=GOLD).next_to(a1, UP, buff=0.1)
        a2 = Arrow(interp_box.get_left(), llm_box.get_right(), color=ACCENT3, buff=0.2).shift(DOWN * 0.2)
        a2_label = Text("returns result", font_size=18, color=ACCENT3).next_to(a2, DOWN, buff=0.1)

        note = Text(
            "The LLM describes computation\nbut never executes it.",
            font_size=22, color=WARN, line_spacing=1.3,
        ).next_to(VGroup(llm_grp, interp_grp), DOWN, buff=0.9)

        self.play(FadeIn(llm_grp), FadeIn(interp_grp))
        self.play(GrowArrow(a1), FadeIn(a1_label))
        self.play(GrowArrow(a2), FadeIn(a2_label))
        self.wait(0.5)
        self.play(Write(note))
        self.wait(2.5)

        # Fade out everything that is still on screen.
        self.play(*[FadeOut(m) for m in self.mobjects])
class Scene3_KeyIdea(Scene):
    """The breakthrough: a computer built INSIDE the transformer.

    Draws a large "Transformer" box containing a virtual-CPU box and a
    memory box joined by an arrow, then a one-line summary underneath.
    """

    def construct(self):
        self.camera.background_color = BG

        heading = Text("The Breakthrough", font_size=40, color=GOLD, weight=BOLD)
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        pitch = Text("Build a computer INSIDE the transformer.", font_size=30, color=TEXT_COL)
        pitch.next_to(heading, DOWN, buff=0.6)
        self.play(Write(pitch))
        self.wait(1)

        # Outer container representing the transformer.
        outer = RoundedRectangle(corner_radius=0.25, width=9, height=4.5, color=ACCENT, stroke_width=3)
        outer.shift(DOWN * 0.5)
        outer_caption = Text("Transformer", font_size=22, color=ACCENT)
        outer_caption.next_to(outer, UP, buff=0.15)
        middle = outer.get_center()

        # Virtual CPU, left of the container's centre.
        cpu = RoundedRectangle(corner_radius=0.15, width=3, height=2, color=GOLD, stroke_width=2)
        cpu.move_to(middle + LEFT * 2.2)
        cpu_caption = Text("Virtual CPU\n(WebAssembly)", font_size=20, color=GOLD, line_spacing=1.2)
        cpu_caption.move_to(cpu)

        # Memory and stack, right of the container's centre.
        memory = RoundedRectangle(corner_radius=0.15, width=2.5, height=2, color=ACCENT3, stroke_width=2)
        memory.move_to(middle + RIGHT * 2)
        memory_caption = Text("Memory\n& Stack", font_size=20, color=ACCENT3, line_spacing=1.2)
        memory_caption.move_to(memory)

        link = Arrow(cpu.get_right(), memory.get_left(), color=DIM, buff=0.15)

        # Reveal each box together with its caption, then the connector.
        for shape, caption in ((outer, outer_caption), (cpu, cpu_caption), (memory, memory_caption)):
            self.play(Create(shape), Write(caption))
        self.play(GrowArrow(link))

        summary = Text(
            "Arbitrary C programs → tokens → executed by the model itself",
            font_size=22,
            color=ACCENT,
        )
        summary.next_to(outer, DOWN, buff=0.5)
        self.play(Write(summary))
        self.wait(2.5)

        # Fade out everything that is still on screen.
        self.play(*map(FadeOut, self.mobjects))
class Scene4_ToolVsInModel(Scene):
    """Side-by-side comparison of tool use and in-model execution for 3+5.

    The left column shows the LLM emitting code and getting a result back
    from an interpreter; the right column shows the same sum as a list of
    WebAssembly-style trace lines.
    """

    def construct(self):
        self.camera.background_color = BG

        heading = Text("Tool Use vs In-Model Execution", font_size=34, color=ACCENT, weight=BOLD)
        heading.to_edge(UP, buff=0.4)
        self.play(Write(heading))

        separator = DashedLine(UP * 2.5, DOWN * 2.5, color=DIM, dash_length=0.15)
        separator.shift(DOWN * 0.3)
        self.play(Create(separator))

        # Column headings are written before either column is filled in.
        tool_heading = Text("Tool Use", font_size=26, color=WARN, weight=BOLD)
        tool_heading.move_to(LEFT * 3.5 + UP * 2)
        self.play(Write(tool_heading))

        model_heading = Text("In-Model", font_size=26, color=ACCENT3, weight=BOLD)
        model_heading.move_to(RIGHT * 3.5 + UP * 2)
        self.play(Write(model_heading))

        # LEFT column: (label, optional code snippet, label colour, code colour).
        tool_steps = (
            ("LLM:", 'print(3+5)', ACCENT, GOLD),
            ("→ send to interpreter", "", DIM, DIM),
            ("← result: 8", "", ACCENT3, ACCENT3),
        )
        tool_rows = VGroup()
        row_y = 1.0
        for caption, snippet, caption_color, snippet_color in tool_steps:
            caption_mob = Text(caption, font_size=20, color=caption_color)
            caption_mob.move_to(LEFT * 5 + UP * row_y)
            # Every label shares the same left edge.
            caption_mob.align_to(LEFT * 5.5, LEFT)
            parts = [caption_mob]
            if snippet:
                snippet_mob = Text(snippet, font_size=18, color=snippet_color, font="Monospace")
                snippet_mob.next_to(caption_mob, RIGHT, buff=0.2)
                parts.append(snippet_mob)
            tool_rows.add(VGroup(*parts))
            row_y -= 0.7

        tool_note = Text("Execution is opaque\n(black box)", font_size=18, color=WARN, line_spacing=1.2)
        tool_note.move_to(LEFT * 3.5 + DOWN * 1.5)

        # RIGHT column: one trace line per row.
        model_rows = VGroup()
        line_y = 1.0
        for source_line in ("i32.const 03", "i32.const 05", "i32.add → 08", "output(08)", "halt"):
            line_mob = Text(source_line, font_size=18, color=ACCENT3, font="Monospace")
            line_mob.move_to(RIGHT * 3.5 + UP * line_y)
            model_rows.add(line_mob)
            line_y -= 0.55

        model_note = Text("Every step visible\nin the token stream", font_size=18, color=ACCENT3, line_spacing=1.2)
        model_note.move_to(RIGHT * 3.5 + DOWN * 2)

        # Animate the left column, then the right one.
        for tool_row in tool_rows:
            self.play(FadeIn(tool_row, shift=RIGHT * 0.2), run_time=0.5)
        self.play(Write(tool_note))
        self.wait(0.5)

        for model_row in model_rows:
            self.play(FadeIn(model_row, shift=LEFT * 0.2), run_time=0.4)
        self.play(Write(model_note))
        self.wait(2.5)

        # Fade out everything that is still on screen.
        self.play(*map(FadeOut, self.mobjects))
class Scene5_AppendOnlyTrace(Scene):
    """Computation as an append-only trace — the notebook analogy.

    Lays out a row of prompt words and, beneath it, a row of "odd=…"
    trace tokens. Each trace token appears with an arrow from the word
    above it and, from the second token onward, an arrow from the
    previous trace token.
    """

    def construct(self):
        self.camera.background_color = BG

        heading = Text("How It Works: The Append-Only Notebook", font_size=32, color=ACCENT, weight=BOLD)
        heading.to_edge(UP, buff=0.4)
        self.play(Write(heading))

        # Notebook frame and caption.
        notebook = Rectangle(width=7, height=4.5, color=DIM, stroke_width=1.5)
        notebook.shift(DOWN * 0.4)
        notebook_caption = Text("Notebook (token stream)", font_size=18, color=DIM)
        notebook_caption.next_to(notebook, UP, buff=0.1)
        self.play(Create(notebook), Write(notebook_caption))

        # Prompt row: words paired with their display colours.
        words = ["the", "cat", "runs", "and", "dog", "jumps", "over"]
        word_colors = [DIM, DIM, ACCENT3, DIM, DIM, ACCENT3, DIM]
        prompt_caption = Text("PROMPT (input words):", font_size=18, color=ACCENT)
        prompt_caption.move_to(notebook.get_top() + DOWN * 0.4)
        self.play(Write(prompt_caption))

        word_row = VGroup()
        for col, (word, word_color) in enumerate(zip(words, word_colors)):
            word_mob = Text(word, font_size=22, color=word_color, font="Monospace")
            # Columns are one unit apart, centred on the fourth word.
            word_mob.move_to(notebook.get_top() + DOWN * 0.8 + RIGHT * (col - 3) * 1.0)
            word_row.add(word_mob)
        self.play(*[FadeIn(word_mob) for word_mob in word_row], run_time=0.8)
        self.wait(0.5)

        # Trace row caption.
        trace_caption = Text("EXECUTION TRACE (generated tokens):", font_size=18, color=GOLD)
        trace_caption.move_to(notebook.get_center() + UP * 0.3)
        self.play(Write(trace_caption))

        # Running parity values, one per prompt word.
        parity_tokens = ["odd=0", "odd=0", "odd=1", "odd=1", "odd=1", "odd=0", "odd=0"]
        trace_row = VGroup()
        for col, token in enumerate(parity_tokens):
            token_mob = Text(token, font_size=20, color=GOLD, font="Monospace")
            token_mob.move_to(notebook.get_center() + DOWN * 0.3 + RIGHT * (col - 3) * 1.0)
            trace_row.add(token_mob)

        rule = Text(
            "Each new token looks back at (1) the input word and (2) the previous trace token",
            font_size=17,
            color=TEXT_COL,
        )
        rule.move_to(notebook.get_bottom() + DOWN * 0.05)

        # Reveal the trace one token at a time with its lookback arrows.
        arrow_style = dict(stroke_width=1.5, buff=0.08, max_tip_length_to_length_ratio=0.15)
        previous = None
        for word_mob, token_mob in zip(word_row, trace_row):
            from_word = Arrow(word_mob.get_bottom(), token_mob.get_top(), color=ACCENT, **arrow_style)
            step = [FadeIn(token_mob), GrowArrow(from_word)]
            if previous is not None:
                from_previous = Arrow(previous.get_right(), token_mob.get_left(), color=GOLD, **arrow_style)
                step.append(GrowArrow(from_previous))
            self.play(*step, run_time=0.6)
            previous = token_mob

        self.play(Write(rule))

        key_insight = Text(
            "Key: only 2 lookbacks per step — cost doesn't grow with length!",
            font_size=20,
            color=ACCENT3,
            weight=BOLD,
        )
        key_insight.next_to(notebook, DOWN, buff=0.6)
        self.play(Write(key_insight))
        self.wait(2.5)

        # Fade out everything that is still on screen.
        self.play(*map(FadeOut, self.mobjects))
class Scene6_QuadraticProblem(Scene):
    """The quadratic cost problem of standard attention.

    Plots work-per-step against tokens generated for two cases: a linear
    curve labelled "Standard attention" and a logarithmic curve labelled
    "Hull attention", followed by a numeric comparison at one million steps.
    """

    def construct(self):
        self.camera.background_color = BG

        title = Text("The Problem: Attention Cost Grows", font_size=34, color=ACCENT, weight=BOLD)
        title.to_edge(UP, buff=0.5)
        self.play(Write(title))

        # Axes
        ax = Axes(
            x_range=[0, 10, 2], y_range=[0, 100, 20],
            x_length=5, y_length=3.5,
            axis_config={"color": DIM, "include_numbers": False},
        ).shift(DOWN * 0.5)
        x_lab = Text("tokens generated (t)", font_size=18, color=DIM).next_to(ax.x_axis, DOWN, buff=0.3)
        # Rotate BEFORE positioning: next_to measures the gap from the
        # label's current bounding box, so rotating afterwards would leave
        # the vertical label roughly half its text length away from the axis.
        y_lab = Text("work per step", font_size=18, color=DIM).rotate(PI / 2).next_to(ax.y_axis, LEFT, buff=0.3)
        self.play(Create(ax), Write(x_lab), Write(y_lab))

        # Standard: linear cost per step → quadratic total
        std_graph = ax.plot(lambda x: x * 10, x_range=[0.1, 10], color=WARN, stroke_width=3)
        std_label = Text("Standard attention: O(t) per step", font_size=18, color=WARN)
        std_label.next_to(ax, RIGHT, buff=0.3).shift(UP * 1)
        self.play(Create(std_graph), Write(std_label), run_time=1.5)
        self.wait(1)

        # Log cost
        log_graph = ax.plot(lambda x: np.log2(x + 1) * 8, x_range=[0.1, 10], color=ACCENT3, stroke_width=3)
        log_label = Text("Hull attention: O(log t) per step", font_size=18, color=ACCENT3)
        log_label.next_to(std_label, DOWN, buff=0.4)
        self.play(Create(log_graph), Write(log_label), run_time=1.5)

        gap_text = Text(
            "At 1 million steps:\nstandard = 1,000,000 ops/step\nhull = ~20 ops/step",
            font_size=18, color=GOLD, line_spacing=1.3,
        ).next_to(ax, DOWN, buff=0.7)
        self.play(Write(gap_text))
        self.wait(3)

        # Fade out everything that is still on screen.
        self.play(*[FadeOut(m) for m in self.mobjects])
class Scene7_2DHeads(Scene):
    """The key unlock: 2D attention heads and convex hull queries.

    Scatters 20 pseudo-random "key" points on a number plane, outlines
    their convex hull, adds a query point, and closes with a caption about
    O(log t) lookups on the hull.
    """

    def construct(self):
        # Imported first so a missing scipy fails before any animation plays.
        from scipy.spatial import ConvexHull  # noqa: E402

        self.camera.background_color = BG

        title = Text("The Key Unlock: 2D Attention Heads", font_size=34, color=ACCENT, weight=BOLD)
        title.to_edge(UP, buff=0.4)
        self.play(Write(title))

        # Explain head dimension
        expl = Text(
            "Restrict each attention head to dimension 2\n→ keys & queries become 2D points",
            font_size=22, color=TEXT_COL, line_spacing=1.3,
        ).next_to(title, DOWN, buff=0.5)
        self.play(Write(expl))
        self.wait(1)

        # 2D scatter of keys. A private RandomState(42) draws the same legacy
        # stream as np.random.seed(42) followed by np.random.randn, without
        # reseeding the process-wide generator.
        rng = np.random.RandomState(42)
        n_pts = 20
        pts = rng.randn(n_pts, 2) * 1.2

        plane = NumberPlane(
            x_range=[-3, 3, 1], y_range=[-3, 3, 1],
            x_length=5, y_length=4,
            background_line_style={"stroke_color": DIM, "stroke_width": 0.5},
            axis_config={"stroke_color": DIM},
        ).shift(DOWN * 0.7)

        dots = VGroup(*[Dot(plane.c2p(px, py), radius=0.06, color=ACCENT) for px, py in pts])
        key_label = Text("Keys (past tokens)", font_size=16, color=ACCENT).next_to(plane, LEFT, buff=0.3)
        self.play(Create(plane), Write(key_label))
        self.play(*[FadeIn(d, scale=0.5) for d in dots], run_time=1)
        self.wait(0.5)

        # Convex hull. Polygon closes its own outline, so the first vertex
        # is not repeated at the end; repeating it would only add a
        # zero-length edge.
        hull = ConvexHull(pts)
        hull_pts = [plane.c2p(pts[i][0], pts[i][1]) for i in hull.vertices]
        hull_poly = Polygon(*hull_pts, color=GOLD, stroke_width=2, fill_opacity=0.08, fill_color=GOLD)
        hull_label = Text("Convex Hull", font_size=18, color=GOLD).next_to(plane, RIGHT, buff=0.3).shift(UP)
        self.play(Create(hull_poly), Write(hull_label))
        self.wait(0.5)

        # Query point
        q_pt = plane.c2p(1.5, 0.5)
        q_dot = Dot(q_pt, radius=0.1, color=WARN)
        q_label = Text("Query", font_size=16, color=WARN).next_to(q_dot, UR, buff=0.1)
        self.play(FadeIn(q_dot, scale=2), Write(q_label))

        insight = Text(
            "Finding the best-matching key = a geometric query on the hull\n"
            "→ binary search in O(log t) instead of scanning all t keys",
            font_size=18, color=ACCENT3, line_spacing=1.3,
        ).next_to(plane, DOWN, buff=0.5)
        self.play(Write(insight))
        self.wait(3)

        # Fade out everything that is still on screen.
        self.play(*[FadeOut(m) for m in self.mobjects])
class Scene8_Results(Scene):
    """Performance results and what this means.

    Shows three headline result cards side by side, then a callout about
    the Sudoku result underneath them.
    """

    def construct(self):
        self.camera.background_color = BG

        heading = Text("Results: What This Enables", font_size=36, color=ACCENT, weight=BOLD)
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        # (headline, caption, colour) for each card, left to right.
        card_specs = (
            ("30,000+ tok/s", "Execution speed on CPU", ACCENT3),
            ("Millions of steps", "Correct execution length", GOLD),
            ("100% accuracy", "On Sudoku benchmarks\n(incl. world's hardest)", ACCENT),
        )
        cards = VGroup(*[self._result_card(*spec) for spec in card_specs])
        cards.arrange(RIGHT, buff=0.6)
        cards.next_to(heading, DOWN, buff=0.8)
        for card in cards:
            self.play(FadeIn(card, shift=UP * 0.3), run_time=0.7)
        self.wait(0.5)

        # Sudoku callout
        callout = Text(
            "Solves Arto Inkala's Sudoku (\"world's hardest\")\nin under 3 minutes — fully inside the transformer",
            font_size=20,
            color=TEXT_COL,
            line_spacing=1.3,
        )
        callout.next_to(cards, DOWN, buff=0.8)
        self.play(Write(callout))
        self.wait(2.5)

        # Fade out everything that is still on screen.
        self.play(*map(FadeOut, self.mobjects))

    def _result_card(self, big_text, small_text, color):
        """Build one result card: a rounded frame around a headline and a caption.

        ``big_text`` is the bold headline drawn in ``color``; ``small_text``
        is the smaller caption placed beneath it. Returns a VGroup holding
        the frame and the stacked text.
        """
        headline = Text(big_text, font_size=28, color=color, weight=BOLD)
        caption = Text(small_text, font_size=16, color=TEXT_COL, line_spacing=1.2)
        stacked = VGroup(headline, caption)
        stacked.arrange(DOWN, buff=0.3)
        frame = RoundedRectangle(corner_radius=0.15, width=3.5, height=2.5, color=color, stroke_width=2)
        return VGroup(frame, stacked)
class Scene9_Recap(Scene):
    """Final recap and takeaway.

    Lists six numbered summary points one at a time, then writes the
    closing takeaway sentence below the list.
    """

    def construct(self):
        self.camera.background_color = BG

        heading = Text("Recap", font_size=40, color=ACCENT, weight=BOLD)
        heading.to_edge(UP, buff=0.5)
        self.play(Write(heading))

        # (number, sentence, sentence colour) for each recap line.
        points = (
            ("1", "LLMs struggle with long, exact computation", TEXT_COL),
            ("2", "Today we bolt on external tools (like airplanes for humans)", DIM),
            ("3", "This team built a real computer inside a transformer", ACCENT3),
            ("4", "C code → WebAssembly tokens → executed by the model itself", GOLD),
            ("5", "2D attention heads enable O(log t) lookups (exponentially faster)", ACCENT),
            ("6", "Result: millions of correct steps, 30k+ tokens/sec, 100% Sudoku", ACCENT3),
        )

        recap_rows = VGroup()
        for index_label, sentence, sentence_color in points:
            bullet = Text(index_label, font_size=24, color=ACCENT2, weight=BOLD)
            body = Text(sentence, font_size=21, color=sentence_color)
            line = VGroup(bullet, body)
            line.arrange(RIGHT, buff=0.3)
            recap_rows.add(line)
        recap_rows.arrange(DOWN, aligned_edge=LEFT, buff=0.35)
        recap_rows.next_to(heading, DOWN, buff=0.6)

        for line in recap_rows:
            self.play(FadeIn(line, shift=RIGHT * 0.3), run_time=0.6)
            self.wait(0.3)
        self.wait(1)

        takeaway = Text(
            "The model stops being a coordinator of computation\nand becomes a computer itself.",
            font_size=24,
            color=GOLD,
            weight=BOLD,
            line_spacing=1.3,
        )
        takeaway.next_to(recap_rows, DOWN, buff=0.7)
        self.play(Write(takeaway, run_time=2))
        self.wait(3)

        # Fade out everything that is still on screen.
        self.play(*map(FadeOut, self.mobjects))