mirror of
https://github.com/blackboxprogramming/simulation-theory.git
synced 2026-03-17 05:57:19 -05:00
Add scrapers for arXiv, Wikipedia, and OEIS
Co-authored-by: blackboxprogramming <118287761+blackboxprogramming@users.noreply.github.com>
This commit is contained in:
100
scrapers/oeis_scraper.py
Normal file
100
scrapers/oeis_scraper.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
OEIS (On-Line Encyclopedia of Integer Sequences) scraper — fetches sequence
|
||||
metadata for integer sequences relevant to simulation-theory research.
|
||||
|
||||
Sequences of interest: primes, Fibonacci, pi digits, Euler–Mascheroni constant
|
||||
digits, Pascal's triangle, Catalan numbers, SHA-256 round constants, and others.
|
||||
|
||||
Usage:
|
||||
python oeis_scraper.py
|
||||
python oeis_scraper.py --ids A000040 A000045
|
||||
python oeis_scraper.py --output results.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
# Base URL of the OEIS search endpoint queried for every sequence.
OEIS_SEARCH_URL = "https://oeis.org/search"

# Sequences fetched when the caller does not pass --ids.  Order is preserved
# in the output, so keep new entries appended rather than re-sorted.
DEFAULT_IDS = [
    "A000040",  # prime numbers
    "A000045",  # Fibonacci numbers
    "A000796",  # decimal expansion of pi
    "A001620",  # decimal expansion of the Euler–Mascheroni constant
    "A000108",  # Catalan numbers
    "A000012",  # the all-1s sequence (trivial zero analogue)
    "A000720",  # pi(n): number of primes <= n
    "A006862",  # Euclid numbers: 1 + product of the first n primes
    "A000041",  # number of partitions of n
    "A001358",  # semiprimes
]
|
||||
|
||||
|
||||
def fetch_sequence(oeis_id: str) -> dict:
    """Fetch metadata for a single OEIS sequence via the JSON search endpoint.

    Args:
        oeis_id: OEIS A-number to look up, e.g. ``"A000040"``.

    Returns:
        A dict with the keys ``id``, ``name``, ``description`` (the first
        entry of the record's ``comment`` list, or ``""``), ``values`` (at
        most the first 20 comma-separated terms, as strings) and ``url``.
        When the search yields no usable record, every field except ``id``
        is empty.

    Raises:
        requests.RequestException: On a network failure or a non-2xx status.
        ValueError: If the response body is not valid JSON.
    """
    params = {"q": f"id:{oeis_id}", "fmt": "json"}
    resp = requests.get(OEIS_SEARCH_URL, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # Accept either an object carrying a "results" list or a bare top-level
    # list; anything else (including JSON null) means "no match".  Calling
    # .get() on a list or None would raise AttributeError.
    # NOTE(review): the endpoint is believed to have switched to a bare list;
    # confirm against the current OEIS JSON documentation.
    if isinstance(data, dict):
        results = data.get("results") or []
    elif isinstance(data, list):
        results = data
    else:
        results = []

    if not results or not isinstance(results[0], dict):
        return {"id": oeis_id, "name": "", "description": "", "values": [], "url": ""}

    seq = results[0]
    comments = seq.get("comment") or [""]
    # "".split(",") yields [""], so drop blank terms to keep "values" truly
    # empty when the record carries no data.
    terms = [term for term in (seq.get("data") or "").split(",") if term.strip()]
    return {
        "id": oeis_id,
        "name": seq.get("name") or "",
        "description": comments[0],
        "values": terms[:20],  # first 20 terms
        "url": f"https://oeis.org/{oeis_id}",
    }
|
||||
|
||||
|
||||
def scrape(ids: list[str]) -> list[dict]:
    """Scrape OEIS for each sequence ID.

    Args:
        ids: OEIS A-numbers to fetch, processed in the given order.

    Returns:
        One record per input ID, in input order.  An ID whose fetch fails is
        represented by a placeholder record whose fields other than ``id``
        are empty, so a single failure never aborts the batch.
    """
    results = []
    for index, oeis_id in enumerate(ids):
        if index:
            # Be polite: pause between consecutive requests only, so there is
            # no pointless wait after the final ID.
            time.sleep(0.5)
        print(f"Fetching: {oeis_id} …")
        try:
            record = fetch_sequence(oeis_id)
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a body that is not valid JSON; depending on
            # the installed requests version that error may not derive from
            # RequestException.
            print(f" Error: {exc}")
            record = {"id": oeis_id, "name": "", "description": "", "values": [], "url": ""}
        results.append(record)
    return results
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse options, scrape, and emit JSON.

    The scraped records are printed to stdout as indented JSON unless
    ``--output`` names a file, in which case they are written there (UTF-8)
    and a confirmation line is printed instead.
    """
    cli = argparse.ArgumentParser(
        description="Scrape OEIS sequences relevant to simulation-theory research."
    )
    cli.add_argument(
        "--ids",
        nargs="*",
        default=DEFAULT_IDS,
        help="OEIS sequence IDs (e.g. A000040). Defaults to built-in list.",
    )
    cli.add_argument(
        "--output",
        default=None,
        help="Write results to a JSON file instead of stdout.",
    )
    options = cli.parse_args()

    records = scrape(options.ids)

    # No destination file requested: dump straight to stdout and finish.
    if not options.output:
        print(json.dumps(records, indent=2, ensure_ascii=False))
        return

    with open(options.output, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, indent=2, ensure_ascii=False)
    print(f"Results written to {options.output}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user