Add scrapers for arXiv, Wikipedia, and OEIS

Co-authored-by: blackboxprogramming <118287761+blackboxprogramming@users.noreply.github.com>
2026-03-17 07:57:26 -05:00 · 2026-02-25 18:20:10 +00:00
parent 740aa11699
commit 6879279cdd
6 changed files with 440 additions and 0 deletions
--- a/scrapers/arxiv_scraper.py
+++ b/scrapers/arxiv_scraper.py
@@ -0,0 +1,119 @@
+"""
+arXiv scraper — fetches abstracts for papers related to simulation theory research topics.
+
+Topics covered: simulation hypothesis, Gödel incompleteness, Riemann hypothesis,
+quantum computation, SHA-256/cryptographic hash functions, consciousness/integrated
+information theory, ternary/qutrit systems.
+
+Usage:
+    python arxiv_scraper.py
+    python arxiv_scraper.py --query "Riemann hypothesis" --max 5
+    python arxiv_scraper.py --output results.json
+"""
+
+import argparse
+import json
+import sys
+import time
+import xml.etree.ElementTree as ET
+
+import requests
+
+# Base URL of the arXiv query endpoint; fetch_papers() sends its GET requests here.
+ARXIV_API = "https://export.arxiv.org/api/query"
+
+# Search phrases used when --query is not supplied on the command line.
+# NOTE(review): the Riemann hypothesis concerns the *non-trivial* zeros of the
+# zeta function — confirm that "trivial zeros" is the intended search phrase.
+DEFAULT_QUERIES = [
+    "simulation hypothesis computational reality",
+    "Gödel incompleteness self-reference formal systems",
+    "Riemann zeta function trivial zeros",
+    "SHA-256 hash chain cryptographic proof",
+    "qutrit ternary quantum computation",
+    "integrated information theory consciousness",
+    "halting problem quantum physics undecidability",
+]
+
+# Prefix -> namespace URI map passed to the ElementTree find()/findall() calls.
+# Only the "atom" prefix is referenced by the lookups in this file.
+NS = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
+
+
+def fetch_papers(query: str, max_results: int = 5) -> list[dict]:
+    """Return a list of paper dicts for the given arXiv search query.
+
+    Args:
+        query: Free-text search terms; sent to the API as ``all:<query>``.
+        max_results: Maximum number of entries requested from the API.
+
+    Returns:
+        One dict per Atom ``<entry>`` in the response, with the keys
+        ``title``, ``authors``, ``published``, ``abstract`` and ``url``.
+        A missing or empty element yields ``""``; authors without a usable
+        name are left out of ``authors``.
+
+    Raises:
+        requests.RequestException: On a network failure or non-success status.
+        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
+    """
+    params = {
+        "search_query": f"all:{query}",
+        "start": 0,
+        "max_results": max_results,
+        "sortBy": "relevance",
+        "sortOrder": "descending",
+    }
+    resp = requests.get(ARXIV_API, params=params, timeout=30)
+    resp.raise_for_status()
+
+    # Parse the raw bytes so the XML parser honours the encoding declared in
+    # the document rather than the charset requests guessed for resp.text.
+    root = ET.fromstring(resp.content)
+
+    def text_of(parent: ET.Element, path: str) -> str:
+        # findtext() gives the default for a missing element and "" for an
+        # empty one, so .strip() is always called on a str (never on None).
+        return parent.findtext(path, default="", namespaces=NS).strip()
+
+    papers = []
+    for entry in root.findall("atom:entry", NS):
+        authors = [
+            name
+            for author in entry.findall("atom:author", NS)
+            if (name := text_of(author, "atom:name"))
+        ]
+        papers.append(
+            {
+                "title": text_of(entry, "atom:title"),
+                "authors": authors,
+                "published": text_of(entry, "atom:published"),
+                "abstract": text_of(entry, "atom:summary"),
+                "url": text_of(entry, "atom:id"),
+            }
+        )
+    return papers
+
+
+def scrape(
+    queries: list[str], max_per_query: int = 5, delay: float = 3.0
+) -> dict[str, list[dict]]:
+    """Scrape arXiv for each query and return results keyed by query string.
+
+    Args:
+        queries: Search phrases, fetched one at a time in the given order.
+        max_per_query: Maximum number of papers requested per query.
+        delay: Seconds to wait between consecutive requests. The default
+            follows arXiv's request to leave about three seconds between
+            API calls.
+
+    Returns:
+        A dict mapping each query to its list of paper dicts. A query whose
+        request or XML parsing failed maps to an empty list.
+    """
+    results: dict[str, list[dict]] = {}
+    for index, query in enumerate(queries):
+        # Pause only *between* requests; there is nothing to wait for before
+        # the first one or after the last one.
+        if index:
+            time.sleep(delay)
+        # Progress goes to stderr so stdout stays valid JSON when the caller
+        # prints the results there.
+        print(f"Fetching: {query!r} …", file=sys.stderr)
+        try:
+            results[query] = fetch_papers(query, max_results=max_per_query)
+        except (requests.RequestException, ET.ParseError) as exc:
+            # Best effort: one bad response must not abort the remaining queries.
+            print(f"  Error: {exc}", file=sys.stderr)
+            results[query] = []
+    return results
+
+
+def main() -> None:
+    """Parse the command line, run the scrape, and emit the results as JSON.
+
+    The JSON is written to the file named by ``--output`` when given,
+    otherwise it is printed to stdout. Exits with a usage error when
+    ``--max`` is not a positive integer.
+    """
+    parser = argparse.ArgumentParser(description="Scrape arXiv for simulation-theory topics.")
+    parser.add_argument(
+        "--query",
+        nargs="*",
+        default=DEFAULT_QUERIES,
+        help="Search queries (defaults to built-in topic list).",
+    )
+    parser.add_argument(
+        "--max",
+        type=int,
+        default=5,
+        dest="max_results",
+        help="Maximum results per query (default: 5).",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Write results to a JSON file instead of stdout.",
+    )
+    args = parser.parse_args()
+
+    if args.max_results < 1:
+        parser.error("--max must be a positive integer")
+
+    # With nargs="*" a bare ``--query`` parses to an empty list, which would
+    # silently scrape nothing; fall back to the built-in topics instead.
+    queries = args.query or DEFAULT_QUERIES
+    results = scrape(queries, max_per_query=args.max_results)
+
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as fh:
+            json.dump(results, fh, indent=2, ensure_ascii=False)
+        print(f"Results written to {args.output}")
+    else:
+        print(json.dumps(results, indent=2, ensure_ascii=False))
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == "__main__":
+    main()