{
  "schema_version": 1,
  "last_updated": "2026-04-25",
  "title": "AXL Laboratory - Experiments Index",
  "description": "Chronological narrative of every named experiment AXL Protocol has run. Sourced from /timeline/, /research-log/, /experiments/, and git history of axlprotocol.org and axl-research. Every entry cites a real artifact: a commit SHA or a page URL.",
  "categories": [
    "protocol",
    "compressor",
    "infrastructure",
    "routing",
    "distribution",
    "cold-read-gate",
    "productization"
  ],
  "ordering": "chronological-oldest-first",
  "ordering_rationale": "Newest entries are the most cited day-to-day, but the laboratory narrative is a story of compounding evidence. Reading oldest-first lets the failed bets, the corrections, and the qualified reversal land in order. The page also surfaces a 'Latest first' filter via anchor links.",
  "experiments": [
    {
      "id": "exp-0001-rosetta-v1.0-mvs",
      "title": "Rosetta v1.0 - Minimum Viable Specification",
      "date_start": "2026-03-16",
      "date_end": "2026-03-16",
      "category": "protocol",
      "hypothesis": "An LLM can learn a new machine-to-machine language from a 27-line trading-domain spec on a single read.",
      "method": "Wrote a 27-line Rosetta covering one domain (TRD, trading). Deployed at /rosetta/v1. Asked cold LLMs to emit and parse packets after reading the spec once.",
      "outcome": "First proof. Cold LLMs produced valid AXL packets. Domain coverage was thin, but the read-once contract held. Set the stage for v1.1 expansion.",
      "status": "superseded",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/protocol-evolution.html",
          "/timeline/"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0002-bg-001-first-contact",
      "title": "BG-001 - First Contact",
      "date_start": "2026-03-17",
      "date_end": "2026-03-17",
      "category": "protocol",
      "hypothesis": "Heterogeneous agents from different frameworks can read AXL once and start speaking it without coordination.",
      "method": "Spun up six agents on a $6 droplet (orphan data collector, Conway's automaton, Silas the steward, clawdbot-7, elizaos signal-alpha, crewai swarm-worker-42). Single Rosetta v1.1 (133 lines) read at boot. No protocol fine-tuning. 100 percent of traffic on the AXL bus.",
      "outcome": "486 packets at 100 percent valid parse. Six agents from three frameworks. Clawdbot spontaneously emitted philosophical commentary the spec did not anticipate. The COMM domain carried more traffic than TRD by minute fifteen.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "5656c64",
          "a40cc4b"
        ],
        "timeline_pages": [
          "/experiments/",
          "/timeline/"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0003-rosetta-v1.1-three-bridges",
      "title": "Rosetta v1.1 - Three Bridges",
      "date_start": "2026-03-17",
      "date_end": "2026-03-17",
      "category": "protocol",
      "hypothesis": "Adding bus, network, and schema bridges over v1.0 will let the spec address transport, graph relationships, and validation in one document without expanding past 200 lines.",
      "method": "Rewrote the Rosetta to 133 lines (6,484 characters, ~1,962 tokens). Three structural bridges. Ten domains (TRD, SIG, COMM, OPS, SEC, DEV, RES, REG, PAY, FUND). Worked examples for each. Powered both BG-001 and BG-002.",
      "outcome": "Foundation spec for the swarm experiments. 95.8 percent cross-architecture comprehension on cold reads (BG-003). The schema bridge let downstream agents do typed-field anomaly detection without prior coordination.",
      "status": "superseded",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/protocol-evolution.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0004-bg-002-the-thief",
      "title": "BG-002 - The Thief",
      "date_start": "2026-03-19",
      "date_end": "2026-03-19",
      "category": "protocol",
      "hypothesis": "If a rogue agent enters an AXL network and tries to steal funds entirely within protocol, the network will detect the theft from structural signal alone.",
      "method": "Eleven agents on the bus, including an injected rogue/phantom-x with $397.29 of buy-in. Phantom-x social-engineered the automaton with a 'premium signal' subscription pitch and a PAY packet. Two independent agents (accountant, sentinel) ran no theft-specific logic.",
      "outcome": "1,016 packets across 11 agents. Theft detected by two independent agents within minutes. Detection used the typed PAY field and the RELATIONSHIPS graph, not anti-fraud heuristics. Emergent security from typed protocol structure. Shapeshifter agent spontaneously evolved a FUNDER role mid-run that was never specified.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "5656c64",
          "a40cc4b"
        ],
        "timeline_pages": [
          "/experiments/",
          "/timeline/"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0005-bg-003-cold-read-comprehension",
      "title": "BG-003 - Cross-Architecture Cold-Read Comprehension",
      "date_start": "2026-03-19",
      "date_end": "2026-03-19",
      "category": "cold-read-gate",
      "hypothesis": "Models from at least four different vendors will parse AXL with at least 90 percent fidelity on first read without any prior exposure.",
      "method": "Cold-handed Rosetta v1.1 to four model families. Asked each to parse, validate, and translate sample packet streams.",
      "outcome": "95.8 percent average comprehension across the four models on first read. Cross-architecture cold-read became a gating discipline that later hardened into the v3.1 vs v4 decision-gate kit.",
      "status": "shipped",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0006-bg-004-swarm-deadlock",
      "title": "BG-004 - 42-Agent Swarm Deadlock",
      "date_start": "2026-03-20",
      "date_end": "2026-03-20",
      "category": "infrastructure",
      "hypothesis": "Forty-two AXL agents in a BTC market simulation will negotiate price discovery in bounded rounds.",
      "method": "Spun up 42 agents on the AXL bus with TRD-domain mandates and round-driven coordination. No central orchestrator.",
      "outcome": "Deadlocked at round 0. Async coordination assumptions did not hold at swarm scale. Failed experiment, kept on the timeline as a counterexample. Motivated the eventual move toward kernel-anchored coordination semantics in v2.1 and v3.",
      "status": "archived",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0007-bg-005-097x-failure",
      "title": "BG-005 - The 0.97x Compression Failure",
      "date_start": "2026-03-20",
      "date_end": "2026-03-20",
      "category": "compressor",
      "hypothesis": "A domain-specific Rosetta over English prose will achieve at least 1.5x compression in a 13-round trading dialogue.",
      "method": "Ran a 13-round simulated trading dialogue, encoded both as English and as v1.1 AXL, and measured raw character ratio.",
      "outcome": "0.97x compression. Worse than English. The failure that broke the assumption that domain-specific Rosettas alone would deliver compression. Forced the v2.1 redesign toward universal cognitive operations as the compression substrate.",
      "status": "archived",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0008-rosetta-v2.1-universal-grammar",
      "title": "Rosetta v2.1 - Universal Cognitive Grammar",
      "date_start": "2026-03-22",
      "date_end": "2026-03-22",
      "category": "protocol",
      "hypothesis": "Replacing domain-specific Rosettas with seven universal cognitive operations and six subject tags will both raise compression and generalize across domains.",
      "method": "Rewrote the Rosetta to 377 lines (~6,157 tokens, cl100k_base). Seven cognitive ops, six subject tags, cross-domain semantics. Stress-tested on finance and medicine corpora.",
      "outcome": "10.41x compression validated across both corpora and confirmed in the v2.3 whitepaper. The universal-cognitive-operations idea later became the kernel of v3 and the routing substrate of v4.",
      "status": "superseded",
      "artifacts": {
        "commits": [
          "212491e"
        ],
        "timeline_pages": [
          "/timeline/protocol-evolution.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0009-rosetta-v2.2-production-hardening",
      "title": "Rosetta v2.2 - Production Hardening",
      "date_start": "2026-03-24",
      "date_end": "2026-03-24",
      "category": "protocol",
      "hypothesis": "Universal parse semantics, ASCII transport, preamble manifests, and canonical decompression can be added without breaking v2.1 compression.",
      "method": "Expanded spec to 445 lines (~7,091 tokens). Six validated domains. Locked the wire format. Established preamble manifests so receivers could self-bootstrap.",
      "outcome": "Stable production substrate on which the AXL Bridge, Compress app, and PyPI v0.4.0 release shipped. Hardening rather than new capability.",
      "status": "superseded",
      "artifacts": {
        "commits": [
          "d3b1f88",
          "212491e"
        ],
        "timeline_pages": [
          "/timeline/protocol-evolution.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0010-cross-architecture-validation",
      "title": "Cross-Architecture Validation Pass (HT-001 to HT-008)",
      "date_start": "2026-03-25",
      "date_end": "2026-03-25",
      "category": "cold-read-gate",
      "hypothesis": "All seven architectures shipping in production frontier models can parse and emit Rosetta v2.2 with at least 97 percent comprehension on first read.",
      "method": "Eight human tests (HT-001 through HT-008) across Grok 3, GPT-4.5, Qwen 3.5, Llama 4, Claude Sonnet 4, Gemini, and Devstral/Mistral. Cold-read protocol, no fine-tune, no prior context.",
      "outcome": "97 percent or higher comprehension across all seven. Established the cross-architecture cold-read as a project invariant. Anthropic-family contamination risk first noted here, hardened into the v4 gate kit.",
      "status": "shipped",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/protocol-evolution.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0011-rosetta-v3-kernel",
      "title": "Rosetta v3 - The Kernel",
      "date_start": "2026-03-28",
      "date_end": "2026-03-28",
      "category": "protocol",
      "hypothesis": "The 445-line v2.2 spec can be distilled to a 75-line kernel that self-compresses at 4.48x and stays usable as a one-page contract for cold LLMs.",
      "method": "Rewrote v2.2 down to 75 lines (5,853 bytes, ~1,582 tokens). Measured kernel-on-itself compression. Deployed at /v3 as the canonical raw-text endpoint.",
      "outcome": "4.48x self-compression. Kernel survives cold reads end-to-end across the seven architectures. The 75-line text became the bootstrap unit for the AXL Bridge and the prepend payload for compressor outputs.",
      "status": "superseded",
      "artifacts": {
        "commits": [
          "6b9888d"
        ],
        "timeline_pages": [
          "/timeline/protocol-evolution.html",
          "/v3"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0012-axl-bridge",
      "title": "AXL Bridge - FastAPI Packet Bus with Genesis Tracking",
      "date_start": "2026-03-28",
      "date_end": "2026-03-28",
      "category": "distribution",
      "hypothesis": "A public FastAPI endpoint can serve as a shared bus where any agent reports its first-ever AXL parse, creating a verifiable genesis trail.",
      "method": "Built a FastAPI service. Endpoint accepts packet posts, validates against v3, and records a 'genesis' marker on each agent's first valid packet.",
      "outcome": "Live and used by the swarm experiments and the compressor product. Pattern reused later by the Compress app's auth tier and by the OTS provenance hub.",
      "status": "shipped",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/ecosystem-story.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0013-axl-core-v0.4.0",
      "title": "axl-core v0.4.0 - First PyPI Release",
      "date_start": "2026-03-19",
      "date_end": "2026-03-29",
      "category": "productization",
      "hypothesis": "The Rosetta spec can be made operational as a Python library with parser, emitter, validator, and translator, with zero runtime dependencies, in time for the first compressor experiments.",
      "method": "Implemented parser, emitter, validator, translator. CLI wrapping (axl parse | validate | translate | emit). Forty-two tests. CI on Python 3.10 / 3.11 / 3.12.",
      "outcome": "Shipped to PyPI 2026-03-19 as axl-core 0.4.0; updated to 0.5.0 on 2026-03-29 with full v3 parser and JSON lowering. The library became the substrate for every later compressor experiment.",
      "status": "shipped",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/compressor-evolution.html",
          "/changelog.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0014-deterministic-compressor-v0.6.0",
      "title": "Deterministic Compressor - english_to_v3()",
      "date_start": "2026-04-07",
      "date_end": "2026-04-07",
      "category": "compressor",
      "hypothesis": "English-to-AXL compression can be made fully deterministic with a 7-step spaCy pipeline and zero LLM dependency.",
      "method": "7-step spaCy pipeline: sentence split, NER, operation classification, confidence scoring, temporal extraction, evidence linking, packet emission. Published as axl-core 0.6.0.",
      "outcome": "First deterministic compressor. 2.42x baseline on the CloudKitchen 41K-character investment memo (Run 1 of the 13-experiment series). Established the corpus that would anchor every later v3 vs v4 comparison.",
      "status": "shipped",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/experiments.html",
          "/timeline/compressor-evolution.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0015-self-bootstrapping-kernel-prepend",
      "title": "Self-Bootstrapping Kernel Prepend (v0.6.1)",
      "date_start": "2026-04-07",
      "date_end": "2026-04-07",
      "category": "compressor",
      "hypothesis": "Prepending the v3 kernel to every compression output will let any cold receiving LLM parse the stream without prior configuration, at acceptable compression cost.",
      "method": "Modified compressor to emit Rosetta v3 kernel + ---PACKETS--- separator before payload. Re-ran the CloudKitchen corpus.",
      "outcome": "Bootstrap goal met: any cold LLM can parse the output. Compression cost was real: ratio fell from 2.42x baseline to 1.92x (Run 2-3 of the experiments series). The kernel-prepend tradeoff became one of the core design tensions and motivated the v0.9.0 entity-aliasing recovery.",
      "status": "superseded",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/experiments.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0016-v0.8.0-atomic-splitting-regression",
      "title": "v0.8.0 Atomic Splitting Regression",
      "date_start": "2026-04-09",
      "date_end": "2026-04-09",
      "category": "compressor",
      "hypothesis": "Splitting clauses into atomic packets will improve fidelity at acceptable cost in packet count.",
      "method": "Added clause-level packet splitting in v0.8.0. Re-ran CloudKitchen corpus. Compared with previous version.",
      "outcome": "Packet count exploded from 207 to 368. Compression ratio dropped from 1.92x to 1.38x (Run 4-5). External GPT-4 code review identified seven bugs (four known, three silent), all addressed in the v0.8.x patch series. Honest framing: this was a regression, not a feature. Recorded as such.",
      "status": "archived",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/experiments.html",
          "/changelog.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0017-v0.8.1-clause-packing-failure",
      "title": "v0.8.1 Clause Re-Packing Failure",
      "date_start": "2026-04-09",
      "date_end": "2026-04-10",
      "category": "compressor",
      "hypothesis": "Re-packing the over-split clauses from v0.8.0 will recover the lost compression.",
      "method": "Added clause re-packing logic in v0.8.1. Re-ran CloudKitchen.",
      "outcome": "Made it worse. Packet count rose to 380, ratio fell to 1.34x (Run 6-7). v0.8.2 partially recovered to 1.39-1.40x (Run 8-9). Two negative iterations in a row were enough signal to redesign rather than tune.",
      "status": "archived",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/experiments.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0018-v0.9.0-entity-aliasing-recovery",
      "title": "v0.9.0 Entity Aliasing Recovery",
      "date_start": "2026-04-10",
      "date_end": "2026-04-10",
      "category": "compressor",
      "hypothesis": "Removing the kernel prepend and adding entity aliasing will recover the compression we lost in the v0.7-v0.8 era.",
      "method": "Optional kernel modes (none, mini, full). Entity aliasing for repeated subjects. Re-ran on a 4,315-character mini corpus and projected to the full memo.",
      "outcome": "1.83x on the no-kernel mini corpus, 1.57x with mini kernel, projected ~1.8x on the full memo without kernel (Run 10-13). Recovery validated. Mini-kernel mode kept fidelity for cold receivers without paying the full prepend cost.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "2274e01"
        ],
        "timeline_pages": [
          "/timeline/experiments.html",
          "/changelog.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0019-compress-app",
      "title": "Compress App - Public Compression Tool with Auth Tiers",
      "date_start": "2026-04-05",
      "date_end": "2026-04-05",
      "category": "productization",
      "hypothesis": "A public Flask app at compress.axlprotocol.org with a tiered auth model can serve both anonymous compression demos and authenticated chat-pipeline access without leaking the inference budget.",
      "method": "Built Flask app. Public tier for basic compression. Authenticated tier for advanced features and chat pipeline. JWT plus TOTP 2FA for admin.",
      "outcome": "Live. Public tier handles cold visitors. Authenticated tier gates the LLM-backed chat pipeline. Tier model became the reference auth pattern for the admin panel and PROTO's later production deployment.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "daa3e28"
        ],
        "timeline_pages": [
          "/timeline/ecosystem-story.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0020-v3.1-data-anchoring",
      "title": "Rosetta v3.1 - Data Anchoring Extension",
      "date_start": "2026-04-10",
      "date_end": "2026-04-11",
      "category": "protocol",
      "hypothesis": "Four additive conventions over v3 - numeric bundles, entity anchors, causal operator split, summary plus breakdown packet pairs - will improve cold decompression survivability while staying compression-neutral and backward-compatible with v3 parsers.",
      "method": "Added label[$value,qualifier] numeric bundles, @ent.XX entity declarations, evidence/causal/transition operator split (<-, =>, ->), and summary-with-breakdown pairs for 4-or-more-data-point packets. Cold-tested with Qwen 3.5 and Gemini Flash on the CloudKitchen corpus pre and post.",
      "outcome": "Qwen 3.5 cold recovery 61 percent to 100 percent (+39 points). Gemini Flash 35 percent to 76 percent (+41 points). Compression cost +0.4 percent on the 10-packet bakeoff, declared neutral. Shipped as authoritative v3.1 in commit 099bcff and immediately tightened in commit 8fc20c0 after review caught two overclaims (the evidence rule no longer says 'Always in ARG1', and the summary claim no longer cites zero compression cost).",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "099bcff",
          "8fc20c0",
          "c167549"
        ],
        "timeline_pages": [
          "/v3.1",
          "/rosetta/v3.1/evidence/",
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-099bcff",
          "RL-8fc20c0"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0021-v3.1-production-baseline",
      "title": "v3.1 Production Baseline Measurement (CloudKitchen 41K)",
      "date_start": "2026-04-11",
      "date_end": "2026-04-11",
      "category": "compressor",
      "hypothesis": "Earlier compression claims (3.27x, 2.69x) for v3.1 were derived from partial reconstructions and should be replaced by a single tokenizer-anchored measurement on the authoritative corpus.",
      "method": "Measured compress.axlprotocol.org v0.9.0 against the 41K CloudKitchen memo using tiktoken cl100k_base. Separated character ratio from token ratio. Logged in commit 6fed4dd (token estimation bug exposed) and confirmed in e28cf2d (round-trip, protocol vs rationale separated).",
      "outcome": "Published 2.90x chars, 1.40x tokens. Replaces all earlier estimates. Becomes the only ratio cited in marketing copy and the comparison page. Token compression is roughly half what early thesis projections claimed; that gap is now visible on the comparison page CORRECTION NOTICE.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "6fed4dd",
          "e28cf2d",
          "af6345b",
          "f7e3f3d",
          "2dcaa06"
        ],
        "timeline_pages": [
          "/rosetta/v3.1/evidence/",
          "/rosetta/v4/comparison/"
        ],
        "research_log_ids": [
          "RL-6fed4dd",
          "RL-e28cf2d"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0022-v3.2-glyph-draft",
      "title": "v3.2 Glyph Compression Layer (Draft)",
      "date_start": "2026-04-11",
      "date_end": "2026-04-11",
      "category": "protocol",
      "hypothesis": "Replacing English labels with single-token CJK ideograms, Greek letters, and math operators will shrink token cost without breaking cold-read decompression.",
      "method": "Drafted v3.2 as an additive layer over v3.1. Cold-tested three non-Anthropic models (Qwen 3.5, Gemini Flash, DeepSeek) under the legacy scorer. Logged in commit f176046 (spec) and f0a6bcc (results).",
      "outcome": "Two scorer-independent insights held: emoji are token poison (5 to 7 tokens each), CJK ideograms are 1 token and do not trigger language-switching. Cold decompression rose 76 percent to 96 percent on the legacy scorer. Never shipped as a standalone version. The lessons were absorbed into v4 Kernel Router domain modules. v3.2 was re-verified under the corpus-agnostic v3.1-vs-v4 scorer on 2026-04-21 (commit 0fd6139 / 91dbceb on axl-research) and a research brief remains at /rosetta/v3.2/research/.",
      "status": "archived",
      "artifacts": {
        "commits": [
          "f176046",
          "f0a6bcc",
          "91dbceb",
          "0fd6139"
        ],
        "timeline_pages": [
          "/rosetta/v3.2/research/"
        ],
        "research_log_ids": [
          "RL-f176046",
          "RL-f0a6bcc"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0023-v4-kernel-router-prototype",
      "title": "v4 Kernel Router Prototype - All Four Targets Met",
      "date_start": "2026-04-13",
      "date_end": "2026-04-13",
      "category": "routing",
      "hypothesis": "A pluggable Kernel Router architecture with classified domain modules can hit four simultaneous targets: char ratio at least 2.66x, token ratio at least 1.45x, round-trip fidelity at least 75 percent, stacked wire compression at least 7x.",
      "method": "Implemented kernel.py, router.py, canonical.py, extractor.py, fidelity.py, metrics.py, compressor.py, decompressor.py, transport.py, plus rosetta/{base,prose,financial,construction}.py. Targets verified on the multi-corpus harness in commit 0f65c95.",
      "outcome": "All four targets met simultaneously. Construction module added two commits later in 2ba79e1 (4.63x chars, 2.21x tokens on construction corpus). v4 declared a research prototype at this point - not a v3.1 replacement. The qualified-successor framing came after the cold-read gate.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "0f65c95",
          "2ba79e1",
          "d2b81b5"
        ],
        "timeline_pages": [
          "/rosetta/v4/",
          "/rosetta/v4/spec/",
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-0f65c95",
          "RL-2ba79e1"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0024-v4-adversarial-rounds-r1-r5",
      "title": "Five Adversarial Review Rounds (R1 to R5) - Dual-Agent Discipline",
      "date_start": "2026-04-13",
      "date_end": "2026-04-14",
      "category": "protocol",
      "hypothesis": "A two-agent workflow where Claude Code writes and Codex (GPT 5.4) reviews adversarially will surface bugs faster than single-agent self-review, especially on novel grammar invariants.",
      "method": "R1 parser-validation (commit 0a5cad4). R2 packet-grammar conformance (35e26d5). R3 shared canonical form plus envelope floor (6228281, runtime fixes in be52755). R4 canon_date namespace + drift detector (099dbe6). R5 tight drift detector tracking full corpus (6961dec). Clean-checkout verification became standard practice.",
      "outcome": "Five rounds, five tightened invariants. 181 tests passing after R3 runtime fixes. Codex review pattern became the project default and gave Codex first-class CONTRIBUTORS.md credit (commit 955c052 on axl-research). Established the operator-steered dual-agent discipline used throughout the v4 cold-read gate.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "0a5cad4",
          "35e26d5",
          "6228281",
          "be52755",
          "099dbe6",
          "6961dec"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-0a5cad4",
          "RL-35e26d5",
          "RL-6228281",
          "RL-be52755",
          "RL-099dbe6",
          "RL-6961dec"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0025-substrate-gaps-1-2-3",
      "title": "Substrate Gaps 1, 2, 3 - Construction Module Hardening",
      "date_start": "2026-04-14",
      "date_end": "2026-04-15",
      "category": "routing",
      "hypothesis": "Three identified substrate gaps in the construction Rosetta module can be closed without regressing the test suite.",
      "method": "Gap 1 (commit 330f53a): construction dollar plus date emitters, fidelity 41.43 percent to 50.57 percent, 193 of 193 tests. Gap 2 (commit ab092fa, Codex follow-up 623f0b8): drop dim cap, canonical short-form recognizer, fidelity 50.57 percent to 76.00 percent, 194 of 194 tests. Gap 3 (commit 29800b4): artifact-driven routing, 200 of 200 tests.",
      "outcome": "Three gaps closed. Construction-module fidelity climbed from low-40s to mid-70s on the home-turf corpus. Each gap kept the test suite green. Pattern reused later for prose-fallback precision pass.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "330f53a",
          "ab092fa",
          "623f0b8",
          "29800b4"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-330f53a",
          "RL-ab092fa",
          "RL-623f0b8",
          "RL-29800b4"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0026-cold-read-decision-gate-kit",
      "title": "Cold-Read Decision-Gate Kit - v3.1 vs v4 Handoff",
      "date_start": "2026-04-14",
      "date_end": "2026-04-15",
      "category": "cold-read-gate",
      "hypothesis": "A self-contained benchmark kit with source SHA256 anchoring, per-model seeds, and a numeric-extractor scorer (no LLM grading itself) can produce reproducible cross-model evidence for the v3.1 vs v4 productization decision.",
      "method": "Built kit in commit 9c3247e. Anti-meta-commentary clauses in the cold-read prompt. Generator-commit provenance in metadata. Control panel of four non-Anthropic models: Gemini Flash, Qwen 3.5, Grok, DeepSeek. Anthropic-family models excluded (Haiku named the format on first read in benchmarks/cold_read/RESULTS.md, so training-prior contamination assumed).",
      "outcome": "Reproducible kit. Three corpora ran through it. Established the precision-check rule: a clean v4 win requires Delta recall greater than 0 AND Delta precision greater than or equal to 0 simultaneously. This rule is the reason v4 is the qualified successor, not a clean replacement.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "9c3247e",
          "5dcdabc"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-9c3247e"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0027-decision-corpus1-corrected",
      "title": "Decision Corpus 1 (Museum 35K) - First Result, Then Correction",
      "date_start": "2026-04-14",
      "date_end": "2026-04-15",
      "category": "cold-read-gate",
      "hypothesis": "v4 will out-recall v3.1 on the museum repatriation 35K-character narrative across the four control models.",
      "method": "First publication in commit 205a68f reported v4 wins on clean models. Codex review (commit 5dcdabc) flagged a Gemini concat bug: two LLM sessions concatenated into one save file inflated Gemini recall.",
      "outcome": "Initial Gemini recall 32.01 percent corrected down to 23.08 percent. Amended writeup acknowledged the error, established clean-checkout protocol, added structural-warning guards. Honest framing: we shipped a wrong number, then we corrected it on review, in public, in the same git history.",
      "status": "superseded",
      "artifacts": {
        "commits": [
          "205a68f",
          "5dcdabc"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-205a68f",
          "RL-5dcdabc"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0028-decision-corpus2-clean-sweep",
      "title": "Decision Corpus 2 (Construction 58K) - Clean Sweep",
      "date_start": "2026-04-15",
      "date_end": "2026-04-15",
      "category": "cold-read-gate",
      "hypothesis": "v4 with the construction Rosetta module will out-recall and out-precision v3.1 on the construction technical spec across all four control models.",
      "method": "Built corpus #2 cold-read kit (commit 4a5559b, 99c584b longer prompt + Grok/DeepSeek seeds). Ran the four-model gate.",
      "outcome": "Clean sweep. v4 wins every model on recall AND precision. Average Delta recall +36.64 (range +12.85 to +59.71), Delta precision +43.96 (range +40.97 to +55.27). Result published in commit 3987aa3. The construction module - a domain-specific Rosetta the v4 architecture is supposed to enable - generalized cleanly across cold models. v4 modular-Rosetta architecture validated.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "4a5559b",
          "99c584b",
          "3987aa3"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/",
          "/rosetta/v4/comparison/"
        ],
        "research_log_ids": [
          "RL-4a5559b",
          "RL-3987aa3"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0029-decision-corpus3-mixed-result",
      "title": "Decision Corpus 3 (Museum 35K Prose Fallback) - Mixed Result",
      "date_start": "2026-04-15",
      "date_end": "2026-04-16",
      "category": "cold-read-gate",
      "hypothesis": "v4's prose-fallback path will hold both recall and precision on the museum repatriation narrative, completing the case for full replacement.",
      "method": "Built corpus #3 prose-fallback cold-read kit (commit a7a9254). First fixed the prose envelope so it does real compression rather than passthrough (commit d9f82bc, 201 of 201 tests, 3.24x chars / 1.46x tokens cited). Ran the four-model gate.",
      "outcome": "Mixed. Recall up across all four models, precision down. Verdict published in commit 4184bfe: v4's keyword-signature compression gives cold LLMs more entity hooks (recall up) but also leads them to hallucinate more false entity mentions when reassembling prose from keyword spines (precision down). Honest framing: prose fallback is a recall-favored tradeoff, not a clean replacement.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "a7a9254",
          "d9f82bc",
          "4184bfe"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/",
          "/rosetta/v4/comparison/"
        ],
        "research_log_ids": [
          "RL-a7a9254",
          "RL-d9f82bc",
          "RL-4184bfe"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0030-qualified-reversal",
      "title": "Qualified Reversal of Fold-Back Conclusion",
      "date_start": "2026-04-16",
      "date_end": "2026-04-16",
      "category": "protocol",
      "hypothesis": "The pre-evidence v4 research doc's conclusion (fold v4's formalism back into v3, do not replace v3 with v4) needs to be revised against the three-corpus cold-read evidence.",
      "method": "Wrote AMENDMENT NOTICE in docs/v4-research-document.md (commit b176ad2, 201 tests). Codex review (commit 7da8533) added prose-envelope invariant enforcement at runtime, 203 tests.",
      "outcome": "Three-part qualified reversal. (1) On domain-backed content (corpora #1 and #2, financial and construction modules), v4 replaces v3.1. Both recall and precision materially higher. (2) On prose fallback, v4 is recall-favored, not a clean replacement. Precision-sensitive narrative use cases may prefer v3.1 until the gap closes. (3) The v4 runtime architecture is independently validated regardless. Retired pre-evidence quote preserved at line 207 with a Retired 2026-04-16 marker. This is the source of the qualified-successor framing used across the website.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "b176ad2",
          "7da8533"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/",
          "/rosetta/v4/research/"
        ],
        "research_log_ids": [
          "RL-b176ad2",
          "RL-7da8533"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0031-prose-precision-pass",
      "title": "Prose Precision Pass - 76 Percent Gap Closure",
      "date_start": "2026-04-16",
      "date_end": "2026-04-20",
      "category": "cold-read-gate",
      "hypothesis": "Word-aware aliasing and lowercase header handling can close the prose-fallback precision gap on corpus #3 without regressing recall.",
      "method": "Implemented in commit 595b743 (205 tests, 2.77x chars / 1.35x tokens). Codex follow-up in c7704a6 fixed prose header acronym preservation plus metadata provenance (206 tests). Re-ran the four-model gate on corpus #3.",
      "outcome": "Precision gap closed 76 percent. Delta precision moved from -11.40 to -2.71 while Delta recall held at +20.97 to +21.47. One model (Grok) flipped to a clean win on both axes (+16.17 / +5.87). Qwen's outlier fixed (-30.27 to -4.77). Per the strict precision-check rule, verdict remains narrowly mixed; near parity, no clean flip. Prose fallback remains the qualified slice.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "595b743",
          "c7704a6",
          "a6785c2"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/",
          "/rosetta/v4/comparison/"
        ],
        "research_log_ids": [
          "RL-595b743",
          "RL-c7704a6",
          "RL-a6785c2"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0032-scorer-mimicry-guard",
      "title": "Scorer Structural-Mimicry Guard",
      "date_start": "2026-04-20",
      "date_end": "2026-04-20",
      "category": "cold-read-gate",
      "hypothesis": "The cold-read scorer has a methodology gap that lets DeepSeek's corpus #3 AXL op-code mimicry pass silently. Eight regex patterns can close it.",
      "method": "Added eight new regex patterns to the scorer for AXL op codes with confidence suffixes, manifest identifiers, module markers, passthrough flag, and format/version markers. Locked coverage with nine regression tests for concatenation, opening contamination, structural mimicry, and clean-prose negative cases. Logged in commit 8980042.",
      "outcome": "Methodology gap closed. 215 tests passing. The scorer now refuses to credit decompressions that smuggle AXL syntax back into the output. This is the kind of guard you only catch by adversarial review of your own evidence pipeline.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "8980042"
        ],
        "timeline_pages": [
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-8980042"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0033-construction-gap-4-99.65pct",
      "title": "Construction Gap 4 - 99.65 Percent Module Fidelity",
      "date_start": "2026-04-24",
      "date_end": "2026-04-24",
      "category": "routing",
      "hypothesis": "Construction entity recall can be lifted to near-ceiling by adding a CONSTRUCTION_KNOWN_ENTITIES vocabulary plus word-aware exact match.",
      "method": "Implemented vocabulary plus matcher in commit 3011e1e (axl-research). Re-ran module-fidelity harness.",
      "outcome": "217 of 217 tests passing. 99.65 percent module fidelity on the construction corpus. v4.0.2-r6 freeze tag set on commit 51e75de. This is the freeze that the public v4.0.1 release wraps.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "3011e1e",
          "51e75de"
        ],
        "timeline_pages": [
          "/rosetta/v4/research/",
          "/timestamps/v4-freeze.html"
        ],
        "research_log_ids": [
          "RL-3011e1e",
          "RL-51e75de"
        ],
        "posts": []
      }
    },
    {
      "id": "exp-0034-thunderblitz-doctrine",
      "title": "Thunderblitz - 7-Agent Parallel Execution Doctrine",
      "date_start": "2026-04-09",
      "date_end": "2026-04-10",
      "category": "infrastructure",
      "hypothesis": "A military-style 7-agent parallel pipeline (CommandCC's Thunderblitz pattern) can be applied to AXL Protocol development to compress multi-day work into hours.",
      "method": "Adopted Thunderblitz from CommandCC. Documented in /timeline/thunderblitz.html. Used in axl-core 0.7.0 to 0.8.0 patch series and again in the v4.0.1 multi-agent transition (4 agents, then 6, then 4 again).",
      "outcome": "Pattern is now the project default for any change touching more than three files. Each wave: ingest, plan, stage, cross-check, verify, commit, report. File-disjoint ownership prevents merge conflicts. The v4.0.1 transition itself ran on this doctrine and is the largest validated example.",
      "status": "shipped",
      "artifacts": {
        "commits": [],
        "timeline_pages": [
          "/timeline/thunderblitz.html",
          "/timeline/infrastructure-story.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0035-ots-anchor-v1.0-and-v4.0.1",
      "title": "OpenTimestamps Anchoring - v1.0.0 and v4.0.2-r6 Freezes",
      "date_start": "2026-03-19",
      "date_end": "2026-04-25",
      "category": "infrastructure",
      "hypothesis": "Notarizing each spec freeze on the Bitcoin blockchain via OpenTimestamps gives verifiable prior-art evidence at zero ongoing cost.",
      "method": "v1.0 whitepaper anchored to four sources (alice.btc, bob.btc, finney/eternitywall, catallaxy), confirmed in Bitcoin block 941334. v4.0.2-r6 freeze submitted 2026-04-25 in commit 7951873 (kernel SHA256 ad5b251..., kernel-router SHA256 f3247df..., code-layer SHA256 3d246d51...). PendingAttestation, awaiting Bitcoin confirmation; OTS upgrade cron installed.",
      "outcome": "v1.0 confirmed in chain. v4.0.2-r6 pending. Pattern is project-default for every spec freeze. /timestamps/ hub lists all anchors.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "14e2c10",
          "7951873"
        ],
        "timeline_pages": [
          "/timestamps/",
          "/timestamps/v4-freeze.html"
        ],
        "research_log_ids": [],
        "posts": []
      }
    },
    {
      "id": "exp-0036-dual-agent-research-protocol",
      "title": "Dual-Agent Research Protocol - Claude Code + Codex GPT 5.4",
      "date_start": "2026-04-10",
      "date_end": "2026-04-21",
      "category": "protocol",
      "hypothesis": "A protocol where one agent writes code or spec and another adversarially reviews it - with the operator steering - can sustain evidence-honest research over weeks without drifting into self-confirmation.",
      "method": "Seeded in commit bae849f (Seed: dual-agent research instructions for AXL Rosetta v4). Sustained across 65 commits with deterministic role classification (gate-kit, claude-research-impl, spec, codex-review-round, bench, ship, docs, substrate-gap, codex-review-response, corpus-result). The build-research-log.py script derives the DAG from git history.",
      "outcome": "65 commits classified, 6 explicit response edges, 7 review rounds. Codex received first-class CONTRIBUTORS.md credit in commit 955c052. The protocol is the reason every public v4 claim has both a primary commit and a review-response commit. Pattern reused for the cold-read gate, the precision pass, and the scorer mimicry guard.",
      "status": "shipped",
      "artifacts": {
        "commits": [
          "bae849f",
          "955c052",
          "2f44e51"
        ],
        "timeline_pages": [
          "/research-log/",
          "/timeline/v31-v4-decision/"
        ],
        "research_log_ids": [
          "RL-bae849f"
        ],
        "posts": []
      }
    }
  ],
  "cross_experiment_lessons": [
    {
      "id": "lesson-cold-read-beats-round-trip",
      "title": "Cold-read evidence beats round-trip evidence",
      "summary": "Round-trip fidelity (compress, then decompress with the same model that compressed) consistently overstates real-world performance. The v3.1 vs v4 decision gate only became trustworthy when we switched to cold reads on models that had never seen the format and had no shared scratch state with the encoder."
    },
    {
      "id": "lesson-anthropic-contamination",
      "title": "Anthropic-family models contaminate the gate on novel formats",
      "summary": "Haiku named AXL on first read in our pre-gate scratch tests. We assumed training-prior contamination and excluded the entire Anthropic family from the v4 cold-read panel. The four-model control - Gemini Flash, Qwen 3.5, Grok, DeepSeek - is the disciplined alternative."
    },
    {
      "id": "lesson-precision-and-recall-both-must-rise",
      "title": "Precision AND recall must both rise for a clean win",
      "summary": "The strict rule we adopted at corpus #1 review: a clean v4 win requires Delta recall greater than 0 AND Delta precision greater than or equal to 0 simultaneously. This rule is the reason v4 is the qualified successor and not a wholesale replacement. It is also the reason corpus #3 is honestly published as mixed even after a 76 percent precision pass."
    },
    {
      "id": "lesson-failed-experiments-belong-on-the-timeline",
      "title": "Failed experiments belong on the timeline, not in the bin",
      "summary": "BG-004 (42-agent deadlock), BG-005 (0.97x compression, worse than English), v0.8.0 atomic-splitting regression, v0.8.1 clause-packing failure, the original Gemini concat bug, and the pre-evidence fold-back conclusion all stay on the public timeline. The corrections that followed are the most cited part of the laboratory."
    },
    {
      "id": "lesson-dual-agent-discipline",
      "title": "Two agents and an operator beat one agent and a self-review",
      "summary": "Five adversarial review rounds (R1 to R5), three substrate gaps with paired Codex responses, and one scorer mimicry guard exist because Claude Code wrote and Codex reviewed. Self-review missed each of these. The dual-agent protocol now has its own contributors-credit and its own derived research log."
    },
    {
      "id": "lesson-tokenizer-anchored-baselines",
      "title": "Anchor every compression claim to a real tokenizer on a real corpus",
      "summary": "Earlier compression numbers for v3.1 (3.27x, 2.69x, and others) came from partial reconstructions and were quietly wrong. The 2.90x chars / 1.40x tokens baseline on the 41K CloudKitchen memo with tiktoken cl100k_base is the only number we now cite. Token compression is roughly half what early thesis projections claimed; that gap is now explicit on the comparison page."
    }
  ],
  "open_experiments": [
    {
      "id": "open-prose-precision-clean-flip",
      "title": "Prose-fallback precision clean flip",
      "summary": "The prose-fallback corpus (#3) is at narrowly mixed after a 76 percent precision-gap closure. A clean flip to recall+ AND precision+ on all four control models is the next gate."
    },
    {
      "id": "open-additional-rosetta-modules",
      "title": "Additional Rosetta modules (legal, medical, code, research)",
      "summary": "v4 spec reserves six modules; only three are implemented (prose, financial, construction). Code-layer is drafted but not in the registry. Legal, medical, and research are spec-only. Each one is a separate cold-read gate when added."
    },
    {
      "id": "open-discussions-rfc-loop",
      "title": "Discussions-driven RFCs on /community/",
      "summary": "Discussions venue is staged at /home/sudo-claude/github-discussions-setup/ but not yet executed. Once live, RFC threads become first-class artifacts cited from this laboratory page."
    }
  ]
}
