from __future__ import annotations import re import time from pathlib import Path from urllib.error import HTTPError from urllib.request import urlopen __all__ = ["DOWNLOAD_DIR", "retrieve_file", "output_file", "urls_from_file"] NAME_REMOVE = ("http://", "https://", "github.com/", "/raw/") DOWNLOAD_DIR = Path(__file__).parent # ---------------------------------------------------------------------- # Please update ./preload.py accordingly when modifying this file # ---------------------------------------------------------------------- def output_file(url: str, download_dir: Path = DOWNLOAD_DIR) -> Path: file_name = url.strip() for part in NAME_REMOVE: file_name = file_name.replace(part, '').strip().strip('/:').strip() return Path(download_dir, re.sub(r"[^\-_\.\w\d]+", "_", file_name)) def retrieve_file(url: str, download_dir: Path = DOWNLOAD_DIR, wait: float = 5) -> Path: path = output_file(url, download_dir) if path.exists(): print(f"Skipping {url} (already exists: {path})") else: download_dir.mkdir(exist_ok=True, parents=True) print(f"Downloading {url} to {path}") try: download(url, path) except HTTPError: time.sleep(wait) # wait a few seconds and try again. download(url, path) return path def urls_from_file(list_file: Path) -> list[str]: """``list_file`` should be a text file where each line corresponds to a URL to download. """ print(f"file: {list_file}") content = list_file.read_text(encoding="utf-8") return [url for url in content.splitlines() if not url.startswith("#")] def download(url: str, dest: Path): with urlopen(url) as f: data = f.read() with open(dest, "wb") as f: f.write(data) assert Path(dest).exists()