simulation-theory/scrapers/arxiv_scraper.py

"""
arXiv scraper — fetches abstracts for papers related to simulation theory research topics.

Topics covered: simulation hypothesis, Gödel incompleteness, Riemann hypothesis,
quantum computation, SHA-256/cryptographic hash functions, consciousness/integrated
information theory, ternary/qutrit systems.

Usage:
    python arxiv_scraper.py
    python arxiv_scraper.py --query "Riemann hypothesis" --max 5
    python arxiv_scraper.py --output results.json
"""

import argparse
import json
import time
import xml.etree.ElementTree as ET

import requests

ARXIV_API = "https://export.arxiv.org/api/query"

DEFAULT_QUERIES = [
    "simulation hypothesis computational reality",
    "Gödel incompleteness self-reference formal systems",
    "Riemann zeta function trivial zeros",
    "SHA-256 hash chain cryptographic proof",
    "qutrit ternary quantum computation",
    "integrated information theory consciousness",
    "halting problem quantum physics undecidability",
]

NS = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}


def fetch_papers(query: str, max_results: int = 5) -> list[dict]:
    """Return a list of paper dicts for the given arXiv search query."""
    params = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "relevance",
        "sortOrder": "descending",
    }
    resp = requests.get(ARXIV_API, params=params, timeout=30)
    resp.raise_for_status()

    root = ET.fromstring(resp.text)
    papers = []
    for entry in root.findall("atom:entry", NS):
        title_el = entry.find("atom:title", NS)
        summary_el = entry.find("atom:summary", NS)
        id_el = entry.find("atom:id", NS)
        published_el = entry.find("atom:published", NS)
        authors = [
            a.find("atom:name", NS).text
            for a in entry.findall("atom:author", NS)
            if a.find("atom:name", NS) is not None
        ]
        papers.append(
            {
                "title": title_el.text.strip() if title_el is not None else "",
                "authors": authors,
                "published": published_el.text.strip() if published_el is not None else "",
                "abstract": summary_el.text.strip() if summary_el is not None else "",
                "url": id_el.text.strip() if id_el is not None else "",
            }
        )
    return papers


def scrape(queries: list[str], max_per_query: int = 5) -> dict[str, list[dict]]:
    """Scrape arXiv for each query and return results keyed by query string."""
    results = {}
    for query in queries:
        print(f"Fetching: {query!r} …")
        try:
            results[query] = fetch_papers(query, max_results=max_per_query)
        except requests.RequestException as exc:
            print(f"  Error: {exc}")
            results[query] = []
        time.sleep(1)  # be polite to the API
    return results


def main() -> None:
    parser = argparse.ArgumentParser(description="Scrape arXiv for simulation-theory topics.")
    parser.add_argument(
        "--query",
        nargs="*",
        default=DEFAULT_QUERIES,
        help="Search queries (defaults to built-in topic list).",
    )
    parser.add_argument(
        "--max",
        type=int,
        default=5,
        dest="max_results",
        help="Maximum results per query (default: 5).",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Write results to a JSON file instead of stdout.",
    )
    args = parser.parse_args()

    results = scrape(args.query, max_per_query=args.max_results)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as fh:
            json.dump(results, fh, indent=2, ensure_ascii=False)
        print(f"Results written to {args.output}")
    else:
        print(json.dumps(results, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()