Source code for fanoutqa.wiki

"""Utils for working with Wikipedia"""

import functools
import logging
import urllib.parse

import httpx

from .models import Evidence
from .utils import CACHE_DIR, DATASET_EPOCH, markdownify

USER_AGENT = "fanoutqa/1.0.0 (andrz@seas.upenn.edu)"
WIKI_CACHE_DIR = CACHE_DIR / "wikicache"
WIKI_CACHE_DIR.mkdir(exist_ok=True, parents=True)

log = logging.getLogger(__name__)
wikipedia = httpx.Client(
    base_url="https://en.wikipedia.org/w/api.php",
    headers={"User-Agent": USER_AGENT},
    follow_redirects=True,
    timeout=30,
)


class LazyEvidence(Evidence):
    """A subclass of Evidence without a known revision ID; lazily loads it when needed."""

    def __init__(self, title: str, pageid: int, url: str = None):
        self.title = title
        self.pageid = pageid
        self._url = url

    @property
    def url(self):
        if self._url is not None:
            return self._url
        encoded_title = urllib.parse.quote(self.title)
        return f"https://en.wikipedia.org/wiki/{encoded_title}"

    @functools.cached_property
    def revid(self):
        resp = wikipedia.get(
            "",
            params={
                "format": "json",
                "action": "query",
                "prop": "revisions",
                "rvprop": "ids|timestamp",
                "rvlimit": 1,
                "pageids": self.pageid,
                "rvstart": DATASET_EPOCH.isoformat(),
            },
        )
        resp.raise_for_status()
        data = resp.json()
        page = data["query"]["pages"][str(self.pageid)]
        return page["revisions"][0]["revid"]



[docs]
@functools.lru_cache()
def wiki_search(query: str, results=10) -> list[Evidence]:
    """Return a list of Evidence documents given the search query."""
    # get the list of articles that match the query
    resp = wikipedia.get(
        "", params={"format": "json", "action": "query", "list": "search", "srsearch": query, "srlimit": results}
    )
    resp.raise_for_status()
    data = resp.json()

    # and return a LazyEvidence for each
    return [LazyEvidence(title=d["title"], pageid=d["pageid"]) for d in data["query"]["search"]]




[docs]
def wiki_content(doc: Evidence) -> str:
    """Get the page content in markdown, including tables and infoboxes, appropriate for displaying to an LLM."""
    # get the cached content, if available
    cache_filename = WIKI_CACHE_DIR / f"{doc.pageid}-dated.md"
    if cache_filename.exists():
        try:
            return cache_filename.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            pass

    # otherwise retrieve it from Wikipedia
    resp = wikipedia.get("", params={"format": "json", "action": "parse", "oldid": doc.revid, "prop": "text"})
    resp.raise_for_status()
    data = resp.json()
    try:
        html = data["parse"]["text"]["*"]
    except KeyError:
        log.warning(f"Could not find dated revision of {doc.title} - maybe the page did not exist yet?")
        html = ""

    # MD it, cache it, and return
    text = markdownify(html)
    cache_filename.write_text(text, encoding="utf-8")
    return text