lucidia-core/mathematician.py

#!/usr/bin/env python3
"""Codex-22 Mathematician agent implementation.

This agent gives structure to Lucidia's mathematical knowledge. It ingests
formula definitions, proof requests, contradiction logs, and dataset manifests,
then emits notation registries, proof sketches, validation reports, and
teaching notes that other agents can use. The workflow mirrors the charter's
behavioural loop:

observe → abstract → formalise → prove → teach → rest

The intent is not to produce fully formal proofs but to illuminate structure and
highlight where deeper verification is required. Outputs are persisted under the
Mathematician state directory so downstream tooling can subscribe to them.
"""

from __future__ import annotations

import argparse
import hashlib
import json
import math
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Sequence, Tuple

import yaml

DEFAULT_STATE_ROOT = Path("/srv/lucidia/mathematician")
FORMULA_LOG_NAME = "formulas.jsonl"
PROOF_REQUEST_LOG_NAME = "proof_requests.jsonl"
STATE_FILENAME = "state.json"
PROOF_ARCHIVE_NAME = "proof_archive.jsonl"
NOTATION_REGISTRY_NAME = "notation_system.json"
VALIDATION_REPORT_NAME = "validation_report.json"
TEACHING_NOTES_NAME = "teaching_notes.md"
DATASET_DIR_NAME = "datasets"
CONTRADICTION_LOG_PATH = Path("/srv/lucidia/state/contradictions.log")


# ---------------------------------------------------------------------------
# Seed parsing helpers
# ---------------------------------------------------------------------------


def _ensure_list(value: Any) -> List[str]:
    if value is None:
        return []
    if isinstance(value, list):
        return [str(item).strip() for item in value if str(item).strip()]
    if isinstance(value, str):
        stripped = value.strip()
        return [stripped] if stripped else []
    raise TypeError(f"Expected list-compatible value, received {type(value)!r}")


@dataclass
class MathematicianSeed:
    """Structured representation of the Mathematician seed manifest."""

    identifier: str
    agent_name: str
    generation: str
    parent: Optional[str]
    siblings: List[str]
    domain: List[str]
    moral_constant: str
    core_principle: str
    purpose: str
    directives: List[str]
    jobs: List[str]
    personality: Mapping[str, Any]
    input_channels: List[str]
    output_channels: List[str]
    behavioural_loop: List[str]
    seed_language: str
    boot_command: str


def load_seed(path: Path) -> MathematicianSeed:
    """Load and validate the Mathematician seed file."""

    with path.open("r", encoding="utf-8") as handle:
        data = yaml.safe_load(handle) or {}

    if not isinstance(data, MutableMapping):
        raise ValueError("Mathematician seed must be a mapping at the top level")

    charter = data.get("system_charter")
    if not isinstance(charter, MutableMapping):
        raise ValueError("Mathematician seed missing 'system_charter' mapping")

    required_charter = [
        "agent_name",
        "generation",
        "domain",
        "moral_constant",
        "core_principle",
    ]
    missing_charter = [field for field in required_charter if field not in charter]
    if missing_charter:
        raise ValueError(
            "Mathematician seed missing charter field(s): " + ", ".join(missing_charter)
        )

    for required in [
        "purpose",
        "directives",
        "jobs",
        "input",
        "output",
        "behavioral_loop",
        "seed_language",
        "boot_command",
    ]:
        if required not in data:
            raise ValueError(f"Mathematician seed missing required field: {required}")

    return MathematicianSeed(
        identifier=str(data.get("id", "codex-22")),
        agent_name=str(charter["agent_name"]),
        generation=str(charter["generation"]),
        parent=str(charter.get("parent")) if charter.get("parent") else None,
        siblings=_ensure_list(charter.get("siblings")),
        domain=_ensure_list(charter.get("domain")),
        moral_constant=str(charter["moral_constant"]),
        core_principle=str(charter["core_principle"]),
        purpose=str(data["purpose"]).strip(),
        directives=_ensure_list(data["directives"]),
        jobs=_ensure_list(data["jobs"]),
        personality=data.get("personality", {}),
        input_channels=_ensure_list(data["input"]),
        output_channels=_ensure_list(data["output"]),
        behavioural_loop=_ensure_list(data["behavioral_loop"]),
        seed_language=str(data["seed_language"]).strip(),
        boot_command=str(data["boot_command"]).strip(),
    )


# ---------------------------------------------------------------------------
# State models
# ---------------------------------------------------------------------------


def utc_now() -> str:
    return datetime.now(timezone.utc).isoformat()


@dataclass
class DatasetStatus:
    path: str
    fingerprint: str
    size_bytes: int
    status: str
    last_checked: str

    def to_dict(self) -> Dict[str, Any]:
        return {
            "path": self.path,
            "fingerprint": self.fingerprint,
            "size_bytes": self.size_bytes,
            "status": self.status,
            "last_checked": self.last_checked,
        }


@dataclass
class MathematicianState:
    cursors: Dict[str, int] = field(
        default_factory=lambda: {"formulas": 0, "proof_requests": 0, "contradictions": 0}
    )
    dataset_fingerprints: Dict[str, str] = field(default_factory=dict)

    @classmethod
    def load(cls, path: Path) -> "MathematicianState":
        if not path.exists():
            return cls()
        with path.open("r", encoding="utf-8") as handle:
            try:
                payload = json.load(handle)
            except json.JSONDecodeError:
                return cls()
        cursors = payload.get("cursors", {})
        dataset_fingerprints = payload.get("dataset_fingerprints", {})
        return cls(cursors=cursors, dataset_fingerprints=dataset_fingerprints)

    def save(self, path: Path) -> None:
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "cursors": self.cursors,
            "dataset_fingerprints": self.dataset_fingerprints,
        }
        with path.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2, sort_keys=True)


@dataclass
class NormalisedFormula:
    record: Dict[str, Any]
    normalised: str
    signature: str
    notation_label: str
    variables: List[str]

    def to_summary(self) -> Dict[str, Any]:
        return {
            "id": self.record.get("id"),
            "notation": self.notation_label,
            "normalised": self.normalised,
            "signature": self.signature,
            "variables": self.variables,
            "source": self.record,
        }


@dataclass
class Observation:
    formulas: List[Dict[str, Any]]
    proof_requests: List[Dict[str, Any]]
    contradictions: List[Dict[str, Any]]
    datasets: List[DatasetStatus]


@dataclass
class ProofPlan:
    request_id: str
    theorem: str
    steps: List[Dict[str, Any]]
    used_notation: List[str]
    contradictions: List[Dict[str, Any]]
    confidence: float

    def to_dict(self) -> Dict[str, Any]:
        return {
            "request_id": self.request_id,
            "theorem": self.theorem,
            "steps": self.steps,
            "used_notation": self.used_notation,
            "contradictions": self.contradictions,
            "confidence": self.confidence,
        }


@dataclass
class ProofRecord:
    plan: ProofPlan
    issued_at: str
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        return {
            **self.plan.to_dict(),
            "issued_at": self.issued_at,
            "summary": self.summary,
        }


# ---------------------------------------------------------------------------
# Mathematician implementation
# ---------------------------------------------------------------------------


class Mathematician:
    """Implements the Codex-22 Mathematician behavioural loop."""

    def __init__(
        self,
        seed_path: Path,
        emit_dir: Optional[Path] = None,
        state_root: Path = DEFAULT_STATE_ROOT,
    ) -> None:
        self.seed = load_seed(seed_path)
        self.identity = self.seed.agent_name
        self.state_root = state_root
        self.state_root.mkdir(parents=True, exist_ok=True)
        self.formula_log = self.state_root / FORMULA_LOG_NAME
        self.proof_request_log = self.state_root / PROOF_REQUEST_LOG_NAME
        self.dataset_dir = self.state_root / DATASET_DIR_NAME
        self.dataset_dir.mkdir(parents=True, exist_ok=True)
        self.proof_archive = self.state_root / PROOF_ARCHIVE_NAME
        self.notation_registry_path = self.state_root / NOTATION_REGISTRY_NAME
        self.validation_report_path = self.state_root / VALIDATION_REPORT_NAME
        self.teaching_notes_path = self.state_root / TEACHING_NOTES_NAME
        self.state_path = self.state_root / STATE_FILENAME
        self.emit_dir = emit_dir
        if self.emit_dir is not None:
            self.emit_dir.mkdir(parents=True, exist_ok=True)
        self.state = MathematicianState.load(self.state_path)
        self.notation_entries: List[Dict[str, Any]] = []
        self._notation_by_signature: Dict[str, Dict[str, Any]] = {}
        self._notation_counter = 1
        self._load_notation_registry()

    # ------------------------------------------------------------------
    # Initialisation helpers
    # ------------------------------------------------------------------
    def _load_notation_registry(self) -> None:
        if not self.notation_registry_path.exists():
            return
        try:
            payload = json.loads(self.notation_registry_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            return
        entries = payload.get("entries", []) if isinstance(payload, dict) else []
        if not isinstance(entries, list):
            return
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            signature = str(entry.get("signature")) if entry.get("signature") else None
            label = str(entry.get("label")) if entry.get("label") else None
            if not signature or not label:
                continue
            self.notation_entries.append(entry)
            self._notation_by_signature[signature] = entry
            try:
                suffix = label.split("-")[-1]
                value = int(suffix)
                self._notation_counter = max(self._notation_counter, value + 1)
            except (ValueError, IndexError):
                continue

    # ------------------------------------------------------------------
    # Observation phase
    # ------------------------------------------------------------------
    def observe(self) -> Observation:
        formulas = self._consume_jsonl(self.formula_log, "formulas")
        proof_requests = self._consume_jsonl(self.proof_request_log, "proof_requests")
        contradictions = self._consume_contradictions(CONTRADICTION_LOG_PATH)
        datasets = self._scan_datasets()
        return Observation(
            formulas=formulas,
            proof_requests=proof_requests,
            contradictions=contradictions,
            datasets=datasets,
        )

    def _consume_jsonl(self, path: Path, cursor_key: str) -> List[Dict[str, Any]]:
        if cursor_key not in self.state.cursors:
            self.state.cursors[cursor_key] = 0
        cursor = self.state.cursors.get(cursor_key, 0)
        records: List[Dict[str, Any]] = []
        if not path.exists():
            return records
        with path.open("r", encoding="utf-8") as handle:
            handle.seek(cursor)
            lines = handle.readlines()
            new_cursor = handle.tell()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                record = {"raw": line, "error": "unparseable_json"}
            records.append(record)
        self.state.cursors[cursor_key] = new_cursor
        return records

    def _consume_contradictions(self, path: Path) -> List[Dict[str, Any]]:
        cursor_key = "contradictions"
        if cursor_key not in self.state.cursors:
            self.state.cursors[cursor_key] = 0
        cursor = self.state.cursors.get(cursor_key, 0)
        entries: List[Dict[str, Any]] = []
        if not path.exists():
            return entries
        with path.open("r", encoding="utf-8") as handle:
            handle.seek(cursor)
            lines = handle.readlines()
            new_cursor = handle.tell()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            try:
                payload = json.loads(line)
                if isinstance(payload, dict):
                    payload.setdefault("source", "guardian")
                    entries.append(payload)
                    continue
            except json.JSONDecodeError:
                pass
            entries.append({"message": line, "source": "guardian", "parsed": False})
        self.state.cursors[cursor_key] = new_cursor
        return entries

    def _scan_datasets(self) -> List[DatasetStatus]:
        statuses: List[DatasetStatus] = []
        now = utc_now()
        if not self.dataset_dir.exists():
            return statuses
        for path in sorted(self.dataset_dir.rglob("*")):
            if not path.is_file():
                continue
            fingerprint = self._hash_file(path)
            size = path.stat().st_size
            previous = self.state.dataset_fingerprints.get(str(path))
            if previous is None:
                status = "new"
            elif previous != fingerprint:
                status = "updated"
            else:
                status = "verified"
            statuses.append(
                DatasetStatus(
                    path=str(path.relative_to(self.state_root)),
                    fingerprint=fingerprint,
                    size_bytes=size,
                    status=status,
                    last_checked=now,
                )
            )
            self.state.dataset_fingerprints[str(path)] = fingerprint
        return statuses

    def _hash_file(self, path: Path, chunk_size: int = 65536) -> str:
        digest = hashlib.sha256()
        with path.open("rb") as handle:
            while True:
                chunk = handle.read(chunk_size)
                if not chunk:
                    break
                digest.update(chunk)
        return digest.hexdigest()

    # ------------------------------------------------------------------
    # Abstract phase
    # ------------------------------------------------------------------
    def abstract(self, observation: Observation) -> List[NormalisedFormula]:
        normalised: List[NormalisedFormula] = []
        for record in observation.formulas:
            expression = self._extract_expression(record)
            if not expression:
                continue
            clean = self._normalise_expression(expression)
            signature = hashlib.sha256(clean.encode("utf-8")).hexdigest()[:16]
            variables = self._resolve_variables(record, clean)
            notation_label = self._ensure_notation_entry(
                signature=signature,
                expression=expression,
                normalised=clean,
                variables=variables,
                source_id=str(record.get("id") or signature),
            )
            normalised.append(
                NormalisedFormula(
                    record=record,
                    normalised=clean,
                    signature=signature,
                    notation_label=notation_label,
                    variables=variables,
                )
            )
        return normalised

    def _extract_expression(self, record: Mapping[str, Any]) -> Optional[str]:
        for key in ("expression", "formula", "statement"):
            value = record.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip()
        return None

    def _normalise_expression(self, expression: str) -> str:
        condensed = " ".join(expression.replace("→", "->").replace("⇔", "<->").split())
        condensed = condensed.replace(" forall", " ∀").replace(" exists", " ∃")
        condensed = condensed.replace("\t", " ")
        condensed = " ".join(condensed.split())
        return condensed

    def _resolve_variables(self, record: Mapping[str, Any], expression: str) -> List[str]:
        if "variables" in record and isinstance(record["variables"], Sequence):
            return [str(item) for item in record["variables"] if str(item).strip()]
        tokens = set()
        current = ""
        for char in expression:
            if char.isalpha():
                current += char
            else:
                if current:
                    tokens.add(current)
                    current = ""
        if current:
            tokens.add(current)
        variables = sorted(tokens)
        return variables[:12]

    def _ensure_notation_entry(
        self,
        *,
        signature: str,
        expression: str,
        normalised: str,
        variables: Sequence[str],
        source_id: str,
    ) -> str:
        existing = self._notation_by_signature.get(signature)
        if existing:
            sources = existing.setdefault("sources", [])
            if source_id not in sources:
                sources.append(source_id)
            return str(existing["label"])
        label = f"Φ22-{self._notation_counter:03d}"
        self._notation_counter += 1
        entry = {
            "label": label,
            "signature": signature,
            "expression": expression,
            "normalised": normalised,
            "variables": list(variables),
            "sources": [source_id],
            "created_at": utc_now(),
        }
        self.notation_entries.append(entry)
        self._notation_by_signature[signature] = entry
        return label

    # ------------------------------------------------------------------
    # Formalise and prove phases
    # ------------------------------------------------------------------
    def formalise(
        self,
        normalised_formulas: List[NormalisedFormula],
        observation: Observation,
    ) -> List[ProofPlan]:
        plans: List[ProofPlan] = []
        for request in observation.proof_requests:
            theorem = str(request.get("theorem") or request.get("statement") or "Unnamed theorem")
            request_id = str(request.get("id") or hashlib.sha256(theorem.encode("utf-8")).hexdigest()[:12])
            used_notation, steps = self._map_request_to_notation(request, normalised_formulas)
            contradiction_refs = observation.contradictions[-5:]
            confidence = self._estimate_confidence(request, used_notation, contradiction_refs)
            plan = ProofPlan(
                request_id=request_id,
                theorem=theorem,
                steps=steps,
                used_notation=used_notation,
                contradictions=contradiction_refs,
                confidence=confidence,
            )
            plans.append(plan)
        return plans

    def _map_request_to_notation(
        self,
        request: Mapping[str, Any],
        normalised_formulas: List[NormalisedFormula],
    ) -> Tuple[List[str], List[Dict[str, Any]]]:
        assumptions = request.get("assumptions")
        if isinstance(assumptions, str):
            assumptions_list = [assumptions]
        elif isinstance(assumptions, Sequence):
            assumptions_list = [str(item) for item in assumptions]
        else:
            assumptions_list = []
        goal = str(request.get("goal") or request.get("conclusion") or "")
        matched: List[str] = []
        steps: List[Dict[str, Any]] = []
        for formula in normalised_formulas:
            score = self._match_score(formula.normalised, assumptions_list, goal)
            if score <= 0:
                continue
            matched.append(formula.notation_label)
            steps.append(
                {
                    "action": "reference",
                    "notation": formula.notation_label,
                    "support": score,
                    "summary": formula.normalised,
                }
            )
        if goal:
            steps.append(
                {
                    "action": "target",
                    "summary": goal,
                }
            )
        if not matched and not steps:
            steps.append({"action": "analysis", "summary": "No existing notation matched; manual proof required."})
        return matched, steps

    def _match_score(self, expression: str, assumptions: List[str], goal: str) -> float:
        score = 0.0
        lowered = expression.lower()
        for assumption in assumptions:
            if not assumption:
                continue
            tokens = assumption.lower().split()
            overlap = sum(1 for token in tokens if token and token in lowered)
            score += overlap / max(len(tokens), 1)
        if goal:
            goal_tokens = goal.lower().split()
            overlap = sum(1 for token in goal_tokens if token and token in lowered)
            score += 0.5 * (overlap / max(len(goal_tokens), 1))
        return round(score, 3)

    def _estimate_confidence(
        self,
        request: Mapping[str, Any],
        used_notation: List[str],
        contradictions: List[Dict[str, Any]],
    ) -> float:
        base = 0.4
        complexity = float(request.get("complexity", 1.0))
        complexity = max(0.2, min(5.0, complexity))
        support = min(len(used_notation) * 0.1, 0.4)
        contradiction_penalty = 0.1 * len(contradictions)
        confidence = base + support - contradiction_penalty
        confidence /= math.sqrt(complexity)
        return round(max(0.05, min(confidence, 0.99)), 3)

    def prove(self, plans: List[ProofPlan]) -> List[ProofRecord]:
        records: List[ProofRecord] = []
        for plan in plans:
            summary = self._summarise_plan(plan)
            issued_at = utc_now()
            records.append(ProofRecord(plan=plan, issued_at=issued_at, summary=summary))
        return records

    def _summarise_plan(self, plan: ProofPlan) -> str:
        if not plan.used_notation:
            return (
                "Sketch pending: no established notation matched this request. "
                "Recommend generating a bespoke proof exploration."
            )
        notation_list = ", ".join(plan.used_notation)
        return (
            f"Proof sketch references {notation_list} with estimated confidence {plan.confidence:.2f}. "
            "Contradictions reviewed: "
            + str(len(plan.contradictions))
        )

    # ------------------------------------------------------------------
    # Teach and rest phases
    # ------------------------------------------------------------------
    def teach(self, records: List[ProofRecord], observation: Observation) -> None:
        if records:
            self._append_jsonl(self.proof_archive, (record.to_dict() for record in records))
            if self.emit_dir is not None:
                for record in records:
                    self._emit_prompt(record)
        self._write_notation_registry()
        self._write_validation_report(observation.datasets)
        self._write_teaching_notes(records, observation)
        self.state.save(self.state_path)

    def _append_jsonl(self, path: Path, payloads: Iterable[Mapping[str, Any]]) -> None:
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as handle:
            for payload in payloads:
                handle.write(json.dumps(payload, sort_keys=True))
                handle.write("\n")

    def _emit_prompt(self, record: ProofRecord) -> None:
        if self.emit_dir is None:
            return
        filename = f"mathematician-{record.plan.request_id}-{record.issued_at.replace(':', '').replace('-', '')}.json"
        path = self.emit_dir / filename
        payload = {
            "agent": self.identity,
            "summary": record.summary,
            "plan": record.plan.to_dict(),
        }
        path.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    def _write_notation_registry(self) -> None:
        payload = {
            "agent": self.identity,
            "updated_at": utc_now(),
            "entries": sorted(self.notation_entries, key=lambda entry: entry.get("label", "")),
        }
        self.notation_registry_path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )

    def _write_validation_report(self, datasets: List[DatasetStatus]) -> None:
        payload = {
            "agent": self.identity,
            "updated_at": utc_now(),
            "datasets": [status.to_dict() for status in datasets],
        }
        self.validation_report_path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )

    def _write_teaching_notes(self, records: List[ProofRecord], observation: Observation) -> None:
        timestamp = utc_now()
        header = f"## {timestamp}\n"
        lines = [header]
        if self.seed.seed_language and not self.teaching_notes_path.exists():
            lines.append(self.seed.seed_language.strip() + "\n\n")
        lines.append(
            f"Observed {len(observation.formulas)} new formulae, {len(observation.proof_requests)} proof requests, "
            f"and {len(observation.contradictions)} contradictions.\n"
        )
        if records:
            lines.append("Proof sketches issued:\n")
            for record in records:
                lines.append(
                    f"- {record.plan.theorem} ({record.plan.request_id}) — confidence {record.plan.confidence:.2f}\n"
                )
        else:
            lines.append("No proof sketches emitted this cycle; awaiting richer notation overlaps.\n")
        lines.append("\n")
        with self.teaching_notes_path.open("a", encoding="utf-8") as handle:
            handle.writelines(lines)

    def rest(self) -> None:
        # Placeholder for potential scheduling hooks. The agent already respects the
        # behavioural loop by persisting state at the end of teach().
        pass

    # ------------------------------------------------------------------
    # Control flow
    # ------------------------------------------------------------------
    def run_cycle(self) -> List[ProofRecord]:
        observation = self.observe()
        normalised = self.abstract(observation)
        plans = self.formalise(normalised, observation)
        records = self.prove(plans)
        self.teach(records, observation)
        self.rest()
        return records


# ---------------------------------------------------------------------------
# CLI entrypoint
# ---------------------------------------------------------------------------


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Run the Codex-22 Mathematician agent")
    parser.add_argument("--seed", required=True, help="Path to the Mathematician seed manifest")
    parser.add_argument("--emit", help="Directory for emitted prompts")
    parser.add_argument(
        "--state-root",
        default=str(DEFAULT_STATE_ROOT),
        help=f"Override the state root directory (default: {DEFAULT_STATE_ROOT})",
    )
    return parser


def main(argv: Optional[Sequence[str]] = None) -> int:
    parser = build_parser()
    args = parser.parse_args(argv)
    seed_path = Path(args.seed)
    emit_dir = Path(args.emit) if args.emit else None
    state_root = Path(args.state_root)
    agent = Mathematician(seed_path=seed_path, emit_dir=emit_dir, state_root=state_root)
    agent.run_cycle()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())