mirror of
https://github.com/blackboxprogramming/simulation-theory.git
synced 2026-03-17 07:57:26 -05:00
Add scrapers for arXiv, Wikipedia, and OEIS
Co-authored-by: blackboxprogramming <118287761+blackboxprogramming@users.noreply.github.com>
This commit is contained in:
119
scrapers/arxiv_scraper.py
Normal file
119
scrapers/arxiv_scraper.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""
|
||||
arXiv scraper — fetches abstracts for papers related to simulation theory research topics.
|
||||
|
||||
Topics covered: simulation hypothesis, Gödel incompleteness, Riemann hypothesis,
|
||||
quantum computation, SHA-256/cryptographic hash functions, consciousness/integrated
|
||||
information theory, ternary/qutrit systems.
|
||||
|
||||
Usage:
|
||||
python arxiv_scraper.py
|
||||
python arxiv_scraper.py --query "Riemann hypothesis" --max 5
|
||||
python arxiv_scraper.py --output results.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
import requests
|
||||
|
||||
# Endpoint that every search request is sent to.
ARXIV_API: str = "https://export.arxiv.org/api/query"

# Search phrases used when the command line does not supply --query.
DEFAULT_QUERIES: list[str] = [
    "simulation hypothesis computational reality",
    "Gödel incompleteness self-reference formal systems",
    "Riemann zeta function trivial zeros",
    "SHA-256 hash chain cryptographic proof",
    "qutrit ternary quantum computation",
    "integrated information theory consciousness",
    "halting problem quantum physics undecidability",
]

# Prefix -> namespace URI map passed to ElementTree find()/findall() so that
# paths such as "atom:entry" resolve against the feed's XML namespaces.
NS: dict[str, str] = dict(
    atom="http://www.w3.org/2005/Atom",
    arxiv="http://arxiv.org/schemas/atom",
)
|
||||
|
||||
|
||||
def _entry_text(parent: ET.Element, path: str) -> str:
    """Return the stripped text of the first child of *parent* matching *path*.

    Yields ``""`` both when no such child exists and when the child is an
    empty element (``<title/>``), whose ``.text`` attribute is ``None``.
    """
    node = parent.find(path, NS)
    if node is None or node.text is None:
        return ""
    return node.text.strip()


def fetch_papers(query: str, max_results: int = 5) -> list[dict]:
    """Return a list of paper dicts for the given arXiv search query.

    Args:
        query: Free-text search terms; sent to the API as ``all:<query>``.
        max_results: Upper bound on the number of entries requested.

    Returns:
        One dict per ``<entry>`` in the returned Atom feed, with the keys
        ``title``, ``authors`` (list of names), ``published``, ``abstract``
        and ``url`` (the entry's ``<id>`` text). Missing or empty elements
        yield ``""``; authors without a usable name are skipped.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``).
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    params = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "relevance",
        "sortOrder": "descending",
    }
    resp = requests.get(ARXIV_API, params=params, timeout=30)
    resp.raise_for_status()

    # Hand the parser the raw bytes so it applies the encoding declared by the
    # XML document itself; ``resp.text`` would first be decoded using whatever
    # charset requests inferred from the HTTP headers, which may be wrong.
    root = ET.fromstring(resp.content)

    papers = []
    for entry in root.findall("atom:entry", NS):
        authors = [
            name
            for author in entry.findall("atom:author", NS)
            if (name := _entry_text(author, "atom:name"))
        ]
        papers.append(
            {
                "title": _entry_text(entry, "atom:title"),
                "authors": authors,
                "published": _entry_text(entry, "atom:published"),
                "abstract": _entry_text(entry, "atom:summary"),
                "url": _entry_text(entry, "atom:id"),
            }
        )
    return papers
|
||||
|
||||
|
||||
def scrape(queries: list[str], max_per_query: int = 5) -> dict[str, list[dict]]:
    """Scrape arXiv for each query and return results keyed by query string.

    The run is best-effort: a query whose request fails, or whose response
    cannot be parsed as XML, is reported and mapped to an empty list so the
    remaining queries still execute.

    Args:
        queries: Search strings, processed in order.
        max_per_query: Maximum number of papers requested per query.

    Returns:
        Mapping of each query string to the list of paper dicts produced by
        ``fetch_papers`` (empty on failure).
    """
    import sys  # local import: only needed for the progress stream below

    results: dict[str, list[dict]] = {}
    for index, query in enumerate(queries):
        if index:
            # Pause between consecutive requests to be polite to the API;
            # no delay is needed before the first or after the last one.
            time.sleep(1)
        # Progress goes to stderr so stdout can carry clean JSON output.
        print(f"Fetching: {query!r} …", file=sys.stderr)
        try:
            results[query] = fetch_papers(query, max_results=max_per_query)
        except (requests.RequestException, ET.ParseError) as exc:
            print(f" Error: {exc}", file=sys.stderr)
            results[query] = []
    return results
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse options, scrape, and emit JSON.

    Results are written to the file named by ``--output`` when given;
    otherwise they are printed to stdout.
    """
    cli = argparse.ArgumentParser(
        description="Scrape arXiv for simulation-theory topics.",
    )
    cli.add_argument(
        "--query",
        nargs="*",
        default=DEFAULT_QUERIES,
        help="Search queries (defaults to built-in topic list).",
    )
    cli.add_argument(
        "--max",
        type=int,
        default=5,
        dest="max_results",
        help="Maximum results per query (default: 5).",
    )
    cli.add_argument(
        "--output",
        default=None,
        help="Write results to a JSON file instead of stdout.",
    )
    opts = cli.parse_args()

    collected = scrape(opts.query, max_per_query=opts.max_results)

    # No destination file requested: dump the JSON document to stdout.
    if not opts.output:
        print(json.dumps(collected, indent=2, ensure_ascii=False))
        return

    with open(opts.output, "w", encoding="utf-8") as sink:
        json.dump(collected, sink, indent=2, ensure_ascii=False)
    print(f"Results written to {opts.output}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user