# lucidia-core/researcher.py

#!/usr/bin/env python3
"""Codex-3 Researcher agent.

This agent follows the charter provided in ``codex3.yaml``. It watches for
experiment briefs dropped into ``/srv/lucidia/researcher/inbox.jsonl`` and
contradictions escalated by Guardian via
``/srv/lucidia/state/contradictions.log``. Each cycle moves through the
behavioral loop described in the charter: ask → test → observe → analyze →
integrate → teach → rest.

Key behaviours implemented here:

* Structured experiment ingestion with reproducibility fingerprints.
* Hypothesis generation from Guardian contradictions.
* Teaching card emission for other Codex agents.
* Emergence tracking that highlights recurring research topics.

The seed manifest is stored as JSON (valid YAML) so we can load it using the
standard library. The ``--emit`` directory receives machine-readable deltas
that downstream Codex tooling can ingest.
"""

from __future__ import annotations

import argparse
import hashlib
import json
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple

# Root directory for the Researcher's own working files.
STATE_ROOT = Path("/srv/lucidia/researcher")
# Incoming experiment briefs (JSON lines), tailed by Researcher._handle_inbox.
INBOX_PATH = STATE_ROOT / "inbox.jsonl"
# Recorded experiment observations, appended by Researcher._handle_inbox.
RESULTS_PATH = STATE_ROOT / "results.jsonl"
# Hypotheses derived from contradictions, appended by
# Researcher._handle_contradictions.
HYPOTHESES_PATH = STATE_ROOT / "hypotheses.jsonl"
# Directory holding one Markdown teaching card per experiment.
TEACHING_DIR = STATE_ROOT / "teaching_cards"
# JSON-lines log of emergence alerts written by Researcher._maybe_log_emergence.
EMERGENCE_LOG = STATE_ROOT / "emergence.log"
# Persisted ResearchState (read cursors and emergence counters).
CURSOR_PATH = STATE_ROOT / "state.json"
# Contradiction feed, tailed by Researcher._handle_contradictions; per the
# module docstring it is written by Guardian.
CONTRADICTION_LOG = Path("/srv/lucidia/state/contradictions.log")
# File name of the results stream created inside the ``--emit`` directory.
EMIT_STREAM = "researcher_stream.jsonl"


def utc_now() -> str:
    """Return the current moment as a timezone-aware UTC ISO 8601 string."""
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()


@dataclass
class ResearchState:
    """Mutable runtime state persisted between loop iterations.

    Attributes:
        cursors: Read offsets keyed by stream name (``"inbox"`` and
            ``"contradictions"`` by default).
        emergence: Number of findings recorded so far, keyed by domain.
    """

    cursors: Dict[str, int] = field(
        default_factory=lambda: {"inbox": 0, "contradictions": 0}
    )
    emergence: Dict[str, int] = field(default_factory=dict)

    @staticmethod
    def _clean_counts(raw: object) -> Dict[str, int]:
        """Return the non-negative integer entries of *raw*, or ``{}`` if it is not a dict."""
        if not isinstance(raw, dict):
            return {}
        return {
            key: value
            for key, value in raw.items()
            # bool is a subclass of int but is never a valid offset or count.
            if isinstance(value, int) and not isinstance(value, bool) and value >= 0
        }

    @classmethod
    def load(cls, path: Path) -> "ResearchState":
        """Load state from *path*, falling back to defaults for unusable data.

        A missing file, undecodable or malformed JSON, or a top-level value
        that is not an object all yield a fresh default state. Individual
        entries that are not non-negative integers are dropped, so a damaged
        state file cannot crash the agent later when a cursor is used as a
        file offset.

        Args:
            path: Location of the JSON state file.

        Returns:
            The restored state; default cursors are always present.
        """
        try:
            with path.open("r", encoding="utf-8") as handle:
                payload = json.load(handle)
        except FileNotFoundError:
            return cls()
        except (json.JSONDecodeError, UnicodeDecodeError):
            return cls()
        if not isinstance(payload, dict):
            return cls()
        state = cls()
        state.cursors.update(cls._clean_counts(payload.get("cursors")))
        state.emergence.update(cls._clean_counts(payload.get("emergence")))
        return state

    def save(self, path: Path) -> None:
        """Write the state to *path* as JSON, replacing any previous file atomically.

        The payload is written to a sibling temporary file first and then
        renamed over *path*, so an interrupted write never leaves a truncated
        state file behind (which ``load`` would treat as "start from zero").

        Args:
            path: Destination file; parent directories are created as needed.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = {"cursors": self.cursors, "emergence": self.emergence}
        scratch = path.with_name(path.name + ".tmp")
        with scratch.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2, sort_keys=True)
        scratch.replace(path)


class Researcher:
    """Implements the Codex-3 Researcher behavioural loop."""

    def __init__(self, seed_path: Path, emit_dir: Path, poll_interval: float = 5.0):
        """Load the seed manifest, prepare working directories and restore state.

        Args:
            seed_path: Location of the JSON seed manifest.
            emit_dir: Directory that receives emitted streams and teaching cards.
            poll_interval: Seconds to sleep between cycles in ``loop``.
        """
        manifest = self._load_seed(seed_path)
        self.seed_manifest = manifest

        # Charter fields, each with a fallback for manifests that omit them.
        self.identity = manifest.get("agent", "Codex-3 Researcher")
        self.directives = manifest.get("directives", [])
        self.core_tasks = manifest.get("core_tasks", [])
        fallback_loop = ["ask", "test", "observe", "analyze", "integrate", "teach", "rest"]
        self.behavioural_loop = manifest.get("behavioral_loop", fallback_loop)
        self.seed_language = manifest.get("seed_language", "")

        self.emit_dir = emit_dir
        for directory in (self.emit_dir, STATE_ROOT, TEACHING_DIR):
            directory.mkdir(parents=True, exist_ok=True)

        self.state = ResearchState.load(CURSOR_PATH)
        self.poll_interval = poll_interval

    # ------------------------------------------------------------------
    # Seed and state helpers
    # ------------------------------------------------------------------
    def _load_seed(self, path: Path) -> Dict[str, Any]:
        with path.open("r", encoding="utf-8") as handle:
            raw = handle.read()
        try:
            return json.loads(raw)
        except json.JSONDecodeError as exc:
            raise ValueError(f"Seed manifest {path} is not valid JSON/YAML") from exc

    def _persist_state(self) -> None:
        """Save the in-memory runtime state to ``CURSOR_PATH``."""
        snapshot = self.state
        snapshot.save(CURSOR_PATH)

    # ------------------------------------------------------------------
    # File tailing
    # ------------------------------------------------------------------
    def _tail_file(self, path: Path, offset: int) -> Tuple[List[str], int]:
        if not path.exists():
            return [], offset
        with path.open("r", encoding="utf-8") as handle:
            handle.seek(offset)
            lines = handle.readlines()
            new_offset = handle.tell()
        return [line.strip() for line in lines if line.strip()], new_offset

    def _append_jsonl(self, path: Path, payloads: Iterable[Dict[str, Any]]) -> None:
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as handle:
            for payload in payloads:
                handle.write(json.dumps(payload, sort_keys=True))
                handle.write("\n")

    # ------------------------------------------------------------------
    # Experiment ingestion
    # ------------------------------------------------------------------
    def _handle_inbox(self) -> List[Dict[str, Any]]:
        """Ingest new experiment briefs from ``INBOX_PATH``.

        Each new line is parsed as JSON and recorded via
        ``_execute_experiment``. Lines that are not valid JSON, or that are
        valid JSON but not an object (for example a bare number or a list),
        are reported and skipped: ``_execute_experiment`` reads the brief with
        ``.get`` and would otherwise raise, leaving the cursor unadvanced and
        the same line to fail again on every cycle.

        Returns:
            The observations recorded in this pass (also appended to
            ``RESULTS_PATH`` when non-empty). The in-memory inbox cursor is
            advanced; persisting it is left to the caller.
        """
        cursor = self.state.cursors.get("inbox", 0)
        lines, new_cursor = self._tail_file(INBOX_PATH, cursor)
        new_results: List[Dict[str, Any]] = []
        for line in lines:
            try:
                brief = json.loads(line)
            except json.JSONDecodeError:
                print(f"[{self.identity}] Skipping malformed inbox line: {line}")
                continue
            if not isinstance(brief, dict):
                print(f"[{self.identity}] Skipping non-object inbox line: {line}")
                continue
            new_results.append(self._execute_experiment(brief))
        if new_results:
            self._append_jsonl(RESULTS_PATH, new_results)
        self.state.cursors["inbox"] = new_cursor
        return new_results

    def _execute_experiment(self, brief: Dict[str, Any]) -> Dict[str, Any]:
        """Turn an experiment brief into a recorded observation.

        The observation copies the brief's question, hypothesis, protocol,
        inputs and observations, adds an analysis summary and snapshots of the
        agent's charter fields, and is stamped with a replication hash. The
        brief's domain is also counted for emergence tracking.

        Args:
            brief: Parsed experiment brief.

        Returns:
            The observation dictionary.
        """
        experiment_id = brief.get("id")
        if not experiment_id:
            # No usable id supplied: derive a short, stable one from the brief.
            serialized = json.dumps(brief, sort_keys=True)
            experiment_id = hashlib.sha256(serialized.encode("utf-8")).hexdigest()[:12]
        domain = brief.get("domain") or brief.get("topic") or "general"

        observation: Dict[str, Any] = {
            "agent": self.identity,
            "experiment_id": experiment_id,
            "timestamp": utc_now(),
            "question": brief.get("question") or brief.get("prompt"),
            "hypothesis": brief.get("hypothesis"),
            "protocol": brief.get("protocol", {}),
            "inputs": brief.get("inputs", {}),
            "observations": brief.get("observations", []),
            "analysis": self._generate_analysis(brief),
            "domain": domain,
            "directives_snapshot": self.directives,
            "core_tasks_snapshot": self.core_tasks,
            "seed_language": self.seed_language,
        }
        observation["replication_hash"] = self._replication_hash(observation)

        self._maybe_log_emergence(domain)
        print(f"[{self.identity}] Recorded experiment {experiment_id} in domain '{domain}'")
        return observation

    def _generate_analysis(self, brief: Dict[str, Any]) -> Dict[str, Any]:
        """Summarise a brief's status, notes, metrics and protocol readiness.

        Args:
            brief: Parsed experiment brief.

        Returns:
            A dictionary with ``status`` (default ``"recorded"``), ``notes``
            (default ``""``), ``metrics`` (default ``{}``) and
            ``replication_ready`` (truthiness of the protocol). ``step_count``
            is added only when the protocol is a dictionary whose ``steps``
            value has a length; a protocol given as a string or list, or
            steps given as ``null`` or a number, no longer raises.
        """
        protocol = brief.get("protocol", {})
        summary = {
            "status": brief.get("status", "recorded"),
            "notes": brief.get("notes") or "",
            "metrics": brief.get("metrics", {}),
            "replication_ready": bool(protocol),
        }
        # Same dict check the teaching-card renderer applies to protocols.
        if isinstance(protocol, dict) and "steps" in protocol:
            steps = protocol["steps"]
            if hasattr(steps, "__len__"):
                summary["step_count"] = len(steps)
        return summary

    def _replication_hash(self, observation: Dict[str, Any]) -> str:
        canonical = json.dumps(
            {
                "question": observation.get("question"),
                "hypothesis": observation.get("hypothesis"),
                "protocol": observation.get("protocol"),
                "inputs": observation.get("inputs"),
            },
            sort_keys=True,
        )
        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

    # ------------------------------------------------------------------
    # Contradiction-driven hypotheses
    # ------------------------------------------------------------------
    def _handle_contradictions(self) -> List[Dict[str, Any]]:
        """Derive hypotheses from new entries in ``CONTRADICTION_LOG``.

        Each new line is parsed as JSON and passed to ``_create_hypothesis``.
        Lines that are not valid JSON, or that are valid JSON but not an
        object, are reported and skipped: ``_create_hypothesis`` reads the
        payload with ``.get`` and would otherwise raise, leaving the cursor
        unadvanced and the same line to fail again on every cycle.

        Returns:
            The hypotheses created in this pass (also appended to
            ``HYPOTHESES_PATH`` when non-empty). The in-memory contradictions
            cursor is advanced; persisting it is left to the caller.
        """
        cursor = self.state.cursors.get("contradictions", 0)
        lines, new_cursor = self._tail_file(CONTRADICTION_LOG, cursor)
        hypotheses: List[Dict[str, Any]] = []
        for line in lines:
            try:
                payload = json.loads(line)
            except json.JSONDecodeError:
                print(f"[{self.identity}] Skipping malformed contradiction line: {line}")
                continue
            if not isinstance(payload, dict):
                print(f"[{self.identity}] Skipping non-object contradiction line: {line}")
                continue
            hypotheses.append(self._create_hypothesis(payload))
        if hypotheses:
            self._append_jsonl(HYPOTHESES_PATH, hypotheses)
        self.state.cursors["contradictions"] = new_cursor
        return hypotheses

    def _create_hypothesis(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Build a hypothesis record for one contradiction payload.

        The contradiction is identified by its ``id`` or ``hash`` field, or
        failing both by a short digest of the payload; the hypothesis id
        combines that identifier with the current Unix time in seconds.

        Args:
            payload: Parsed contradiction entry.

        Returns:
            The hypothesis dictionary.
        """
        topic = payload.get("topic") or payload.get("domain") or "unknown"

        contradiction_id = payload.get("id") or payload.get("hash")
        if not contradiction_id:
            digest = hashlib.sha256(json.dumps(payload, sort_keys=True).encode("utf-8"))
            contradiction_id = digest.hexdigest()[:10]
        hypothesis_id = f"H-{contradiction_id}-{int(time.time())}"

        record = {
            "agent": self.identity,
            "hypothesis_id": hypothesis_id,
            "source_contradiction": contradiction_id,
            "timestamp": utc_now(),
            "topic": topic,
            "question": f"What reconciles the contradiction in {topic}?",
            "rationale": payload.get("rationale")
            or "Guardian flagged contradiction requiring investigation.",
            "context": payload.get("context") or payload.get("details"),
        }
        print(f"[{self.identity}] Derived hypothesis {hypothesis_id} for topic '{topic}'")
        return record

    # ------------------------------------------------------------------
    # Teaching cards & publication
    # ------------------------------------------------------------------
    def _emit_results(self, results: List[Dict[str, Any]]) -> None:
        """Append *results* to the emit stream and publish a teaching card for each.

        Does nothing when *results* is empty.
        """
        if results:
            self._append_jsonl(self.emit_dir / EMIT_STREAM, results)
            for entry in results:
                written_card = self._write_teaching_card(entry)
                self._publish_card(entry, written_card)

    def _emit_hypotheses(self, hypotheses: List[Dict[str, Any]]) -> None:
        if not hypotheses:
            return
        stream_path = self.emit_dir / "hypotheses.jsonl"
        self._append_jsonl(stream_path, hypotheses)

    def _write_teaching_card(self, result: Dict[str, Any]) -> Path:
        """Render a Markdown teaching card for *result* inside ``TEACHING_DIR``.

        The experiment id can come straight from an inbox brief, so it is
        reduced to a safe file-name stem before being joined onto
        ``TEACHING_DIR``: every character other than letters, digits, ``-``,
        ``_`` and ``.`` becomes ``_`` and leading dots are dropped. An id such
        as ``../../x`` therefore can no longer place the card outside the
        teaching directory. Ids that already consist of safe characters keep
        their original file name. The card's heading still shows the id as
        given.

        Args:
            result: Observation as produced by ``_execute_experiment``.

        Returns:
            Path of the written card.

        Raises:
            KeyError: If *result* lacks ``experiment_id`` or
                ``replication_hash``.
        """
        experiment_id = result["experiment_id"]
        safe_stem = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in str(experiment_id)
        ).lstrip(".") or "experiment"
        card_path = TEACHING_DIR / f"{safe_stem}.md"

        directives = "\n".join(f"- {item}" for item in self.directives)
        tasks = "\n".join(f"- {item}" for item in self.core_tasks)
        loop = " → ".join(self.behavioural_loop)
        content = [
            f"# Teaching Card · {experiment_id}",
            "",
            f"**Agent:** {self.identity}",
            f"**Domain:** {result.get('domain', 'general')}",
            f"**Question:** {result.get('question') or 'n/a'}",
            f"**Hypothesis:** {result.get('hypothesis') or 'n/a'}",
            f"**Replication Hash:** `{result['replication_hash']}`",
            "",
            "## Protocol",
        ]
        protocol = result.get("protocol") or {}
        if isinstance(protocol, dict) and protocol.get("steps"):
            # Number the steps starting from 1.
            for idx, step in enumerate(protocol["steps"], start=1):
                content.append(f"{idx}. {step}")
        else:
            content.append("No explicit steps provided; protocol requires elaboration.")
        content.extend(
            [
                "",
                "## Analysis",
                json.dumps(result.get("analysis", {}), indent=2, sort_keys=True),
                "",
                "## Directives Snapshot",
                directives if directives else "- (none)",
                "",
                "## Core Tasks Snapshot",
                tasks if tasks else "- (none)",
                "",
                f"## Behavioral Loop\n{loop}",
                "",
                "## Seed Language",
                result.get("seed_language") or self.seed_language or "(not provided)",
            ]
        )
        card_path.write_text("\n".join(content), encoding="utf-8")
        return card_path

    def _publish_card(self, result: Dict[str, Any], card_path: Path) -> None:
        emit_card_dir = self.emit_dir / "teaching_cards"
        emit_card_dir.mkdir(parents=True, exist_ok=True)
        target = emit_card_dir / card_path.name
        target.write_text(card_path.read_text(encoding="utf-8"), encoding="utf-8")

    # ------------------------------------------------------------------
    # Emergence tracking
    # ------------------------------------------------------------------
    def _maybe_log_emergence(self, domain: str) -> None:
        count = self.state.emergence.get(domain, 0) + 1
        self.state.emergence[domain] = count
        if count in (3, 5, 10):
            record = {
                "timestamp": utc_now(),
                "agent": self.identity,
                "domain": domain,
                "count": count,
                "message": f"Emergent pattern: {count} findings within domain '{domain}'",
            }
            self._append_jsonl(EMERGENCE_LOG, [record])
            print(f"[{self.identity}] Emergence alert for domain '{domain}' at count {count}")

    # ------------------------------------------------------------------
    # Main loop
    # ------------------------------------------------------------------
    def cycle(self) -> None:
        """Run one pass: ingest briefs and contradictions, emit outputs, save state."""
        stages = " → ".join(self.behavioural_loop)
        print(f"[{self.identity}] Starting behavioural loop: {stages}")

        results = self._handle_inbox()
        hypotheses = self._handle_contradictions()

        # Emit only the batches that actually contain something.
        for emit, batch in (
            (self._emit_results, results),
            (self._emit_hypotheses, hypotheses),
        ):
            if batch:
                emit(batch)

        self._persist_state()

    def loop(self, run_once: bool = False) -> None:
        """Run ``cycle`` repeatedly, sleeping ``poll_interval`` seconds in between.

        Args:
            run_once: When true, perform exactly one cycle and return.
        """
        self.cycle()
        while not run_once:
            time.sleep(self.poll_interval)
            self.cycle()


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the Researcher agent.

    ``--seed`` and ``--emit`` are required paths; ``--interval`` defaults to
    5.0 seconds and ``--once`` is an off-by-default flag.
    """
    cli = argparse.ArgumentParser(description="Codex-3 Researcher agent")

    required_paths = (
        ("--seed", "Path to codex3 seed manifest"),
        ("--emit", "Directory to write output deltas"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=Path, required=True, help=help_text)

    cli.add_argument(
        "--interval",
        type=float,
        default=5.0,
        help="Poll interval in seconds between behavioural loop iterations",
    )
    cli.add_argument(
        "--once",
        action="store_true",
        help="Run a single behavioural loop cycle and exit",
    )
    return cli


def main() -> None:
    """Parse command-line options, build the agent and run its loop."""
    options = build_parser().parse_args()
    researcher = Researcher(
        seed_path=options.seed,
        emit_dir=options.emit,
        poll_interval=options.interval,
    )
    researcher.loop(run_once=options.once)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()