Add scrapers for arXiv, Wikipedia, and OEIS

Co-authored-by: blackboxprogramming <118287761+blackboxprogramming@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-02-25 18:20:10 +00:00
parent 740aa11699
commit 6879279cdd
6 changed files with 440 additions and 0 deletions

100
scrapers/oeis_scraper.py Normal file
View File

@@ -0,0 +1,100 @@
"""
OEIS (On-Line Encyclopedia of Integer Sequences) scraper — fetches sequence
metadata for integer sequences relevant to simulation-theory research.
Sequences of interest: primes, Fibonacci, pi digits, Euler-Mascheroni constant
digits, Pascal's triangle, Catalan numbers, SHA-256 round constants, and others.
Usage:
python oeis_scraper.py
python oeis_scraper.py --ids A000040 A000045
python oeis_scraper.py --output results.json
"""
import argparse
import json
import sys
import time

import requests
# Base URL of the OEIS search endpoint; fetch_sequence() queries it with fmt=json.
OEIS_SEARCH_URL = "https://oeis.org/search"
# Default sequence IDs relevant to the repository topics; used when --ids is
# not given on the command line.
DEFAULT_IDS = [
    "A000040",  # prime numbers
    "A000045",  # Fibonacci numbers
    "A000796",  # decimal expansion of pi
    "A001620",  # decimal expansion of Euler-Mascheroni constant
    "A000108",  # Catalan numbers
    "A000012",  # the all-1s sequence (trivial zero analogue)
    "A000720",  # pi(n): number of primes <= n
    "A006862",  # Euclid numbers: 1 + product of first n primes
    "A000041",  # number of partitions of n
    "A001358",  # semiprimes
]
def fetch_sequence(oeis_id: str) -> dict:
    """Fetch metadata for a single OEIS sequence via the JSON search endpoint.

    Args:
        oeis_id: OEIS A-number, e.g. ``"A000045"``.

    Returns:
        A dict with the keys ``id``, ``name``, ``description``, ``values``
        (at most the first 20 terms, as strings) and ``url``. When the search
        yields no usable match, every field except ``id`` is empty.

    Raises:
        requests.RequestException: On a network failure or a non-2xx status.
        ValueError: Possibly, if the response body is not valid JSON
            (the exact exception class depends on the installed requests
            version).
    """
    params = {"q": f"id:{oeis_id}", "fmt": "json"}
    resp = requests.get(OEIS_SEARCH_URL, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # Accept either an object carrying the matches under "results" or a bare
    # top-level list of matches; anything else (e.g. null) means "no match".
    # Calling .get() unconditionally would raise AttributeError on a list.
    # NOTE(review): the endpoint is believed to have served both shapes over
    # time -- confirm against the current OEIS JSON format documentation.
    if isinstance(data, dict):
        results = data.get("results") or []
    elif isinstance(data, list):
        results = data
    else:
        results = []

    if not results or not isinstance(results[0], dict):
        return {"id": oeis_id, "name": "", "description": "", "values": [], "url": ""}

    seq = results[0]
    comments = seq.get("comment") or []
    raw_terms = seq.get("data") or ""
    return {
        "id": oeis_id,
        "name": seq.get("name", ""),
        "description": comments[0] if comments else "",
        # "".split(",") yields [""], so drop empty pieces before truncating
        # to the first 20 terms.
        "values": [term for term in raw_terms.split(",") if term][:20],
        "url": f"https://oeis.org/{oeis_id}",
    }
def scrape(ids: list[str]) -> list[dict]:
    """Scrape OEIS for each sequence ID.

    Progress and error messages are written to stderr so that stdout stays
    clean for callers that print the results there (e.g. as JSON).

    Args:
        ids: OEIS A-numbers to look up, in the order they should be fetched.

    Returns:
        One record per requested ID, in input order. An ID whose fetch failed
        is represented by a record with every field except ``id`` empty, so a
        single failure does not abort the batch.
    """
    results = []
    for index, oeis_id in enumerate(ids):
        if index:
            # Be polite: pause between consecutive requests only, not after
            # the final one.
            time.sleep(0.5)
        print(f"Fetching: {oeis_id}", file=sys.stderr)
        try:
            record = fetch_sequence(oeis_id)
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a malformed JSON body on requests versions
            # whose decode error is not a RequestException subclass.
            print(f"  Error: {exc}", file=sys.stderr)
            record = {"id": oeis_id, "name": "", "description": "", "values": [], "url": ""}
        results.append(record)
    return results
def main() -> None:
    """Command-line entry point: parse options, scrape, and emit JSON.

    With ``--output`` the results are written to that file and a confirmation
    line is printed; otherwise the JSON document is printed to stdout.
    """
    cli = argparse.ArgumentParser(
        description="Scrape OEIS sequences relevant to simulation-theory research."
    )
    cli.add_argument(
        "--ids",
        nargs="*",
        default=DEFAULT_IDS,
        help="OEIS sequence IDs (e.g. A000040). Defaults to built-in list.",
    )
    cli.add_argument(
        "--output",
        default=None,
        help="Write results to a JSON file instead of stdout.",
    )
    options = cli.parse_args()

    records = scrape(options.ids)

    # No (or empty) output path: dump straight to stdout and finish.
    if not options.output:
        print(json.dumps(records, indent=2, ensure_ascii=False))
        return

    with open(options.output, "w", encoding="utf-8") as sink:
        json.dump(records, sink, indent=2, ensure_ascii=False)
    print(f"Results written to {options.output}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()