|
| 1 | +import abc |
1 | 2 | import base64
|
| 3 | +import collections |
2 | 4 | import contextlib
|
3 | 5 | import dataclasses
|
| 6 | +import html.parser |
4 | 7 | import itertools
|
5 | 8 | import os.path
|
6 | 9 | import typing
|
|
9 | 12 | import aiofiles.os
|
10 | 13 | import httpx
|
11 | 14 |
|
| 15 | +from pypi_browser import packaging |
| 16 | + |
| 17 | + |
class PythonRepository(abc.ABC):
    """Abstract interface for a source of Python package files.

    Concrete repositories implement :meth:`files_for_package` to list the
    files available for a single package.
    """

    @abc.abstractmethod
    async def files_for_package(self, package_name: str) -> dict[str, str]:
        """Return mapping from filename to file URL for files in a package.

        Args:
            package_name: Name of the package to look up.

        Returns:
            A dict whose keys are filenames and whose values are the URLs
            those files can be fetched from.
        """
| 23 | + |
| 24 | + |
| 25 | +class HTMLAnchorParser(html.parser.HTMLParser): |
| 26 | + anchors: set[str] |
| 27 | + |
| 28 | + def __init__(self) -> None: |
| 29 | + super().__init__() |
| 30 | + self.anchors = set() |
| 31 | + |
| 32 | + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: |
| 33 | + if tag == 'a': |
| 34 | + if href := dict(attrs).get('href'): |
| 35 | + self.anchors.add(href) |
| 36 | + |
12 | 37 |
|
@dataclasses.dataclass(frozen=True)
class SimpleRepository(PythonRepository):
    """Old-style "simple" PyPI registry serving HTML files."""
    # TODO: Also handle PEP 691 JSON simple repositories.
    pypi_url: str

    async def files_for_package(self, package_name: str) -> dict[str, str]:
        """Return mapping from filename to file URL for files in a package.

        Fetches ``{pypi_url}/{package_name}`` (following redirects) and
        collects every anchor ``href`` found in the returned HTML.

        Args:
            package_name: Name of the package to look up.

        Returns:
            A dict mapping the last path segment of each link (the filename)
            to the absolute URL of that link with any fragment removed.

        Raises:
            PackageDoesNotExist: If the repository responds with a 404.
            httpx.HTTPStatusError: If the repository responds with any other
                error status.
        """
        async with httpx.AsyncClient() as client:
            resp = await client.get(
                f'{self.pypi_url}/{package_name}',
                follow_redirects=True,
            )
            if resp.status_code == 404:
                raise PackageDoesNotExist(package_name)
            # Fail loudly on other error responses instead of parsing an
            # error page as if it were the package index.
            resp.raise_for_status()

            parser = HTMLAnchorParser()
            parser.feed(resp.text)

            # Relative hrefs are resolved against the final URL, i.e. the
            # one reached after any redirects.
            base_url = str(resp.url)

            def clean_url(url: str) -> str:
                """Make *url* absolute and drop its fragment."""
                parsed = urllib.parse.urlparse(urllib.parse.urljoin(base_url, url))
                return parsed._replace(fragment='').geturl()

            return {
                urllib.parse.urlparse(url).path.split('/')[-1]: clean_url(url)
                for url in parser.anchors
            }
22 | 63 |
|
23 |
| -async def package_metadata( |
24 |
| - config: PyPIConfig, |
25 |
| - client: httpx.AsyncClient, |
26 |
| - package: str, |
27 |
| -) -> typing.Dict[typing.Any, typing.Any]: |
28 |
| - resp = await client.get(f'{config.pypi_url}/pypi/{package}/json') |
29 |
| - if resp.status_code == 404: |
30 |
| - raise PackageDoesNotExist(package) |
31 |
| - resp.raise_for_status() |
32 |
| - return resp.json() |
33 | 64 |
|
@dataclasses.dataclass(frozen=True)
class LegacyJsonRepository(PythonRepository):
    """Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints."""
    pypi_url: str

    async def files_for_package(self, package_name: str) -> dict[str, str]:
        """Return mapping from filename to file URL for files in a package.

        Fetches ``{pypi_url}/pypi/{package_name}/json`` (following redirects)
        and flattens the files listed under every entry of ``releases``.

        Raises:
            PackageDoesNotExist: If the repository responds with a 404.
            httpx.HTTPStatusError: If the repository responds with any other
                error status.
        """
        endpoint = f'{self.pypi_url}/pypi/{package_name}/json'
        async with httpx.AsyncClient() as client:
            resp = await client.get(endpoint, follow_redirects=True)
            if resp.status_code == 404:
                raise PackageDoesNotExist(package_name)
            resp.raise_for_status()

            url_by_filename: dict[str, str] = {}
            for release_files in resp.json()['releases'].values():
                for file_ in release_files:
                    name = file_['filename']
                    # File URLs may be relative; resolve them against the
                    # final (post-redirect) response URL.
                    url_by_filename[name] = urllib.parse.urljoin(str(resp.url), file_['url'])
            return url_by_filename
38 | 83 |
|
39 |
| - return { |
40 |
| - version: {file_['filename'] for file_ in files} |
41 |
| - for version, files in metadata['releases'].items() |
42 |
| - if len(files) > 0 |
43 |
| - } |
| 84 | + |
@dataclasses.dataclass(frozen=True)
class PyPIConfig:
    """Immutable configuration shared by the package lookup helpers."""
    # Repository backend that is queried for a package's files.
    repo: PythonRepository
    # Filesystem path used for caching; how it is laid out is decided by
    # code outside this definition.
    cache_path: str
| 89 | + |
| 90 | + |
class PackageDoesNotExist(Exception):
    """Raised with the package name when a repository returns 404 for it."""
| 93 | + |
| 94 | + |
async def files_by_version(config: PyPIConfig, package: str) -> dict[str | None, set[str]]:
    """Group a package's filenames by the version guessed from each name.

    Args:
        config: Configuration whose ``repo`` is asked for the package files.
        package: Name of the package to look up.

    Returns:
        Mapping from guessed version to the set of filenames with that
        version. Filenames whose version cannot be guessed are omitted.
    """
    filenames = await config.repo.files_for_package(package)
    grouped = collections.defaultdict(set)
    for name in filenames:
        try:
            guessed = packaging.guess_version_from_filename(name)
        except ValueError:
            # Possible with some very poorly-formed packages that used to be
            # allowed on PyPI. Just skip them when this happens.
            continue
        grouped[guessed].add(name)
    return grouped
44 | 107 |
|
45 | 108 |
|
46 | 109 | class CannotFindFileError(Exception):
|
@@ -81,21 +144,15 @@ async def downloaded_file_path(config: PyPIConfig, package: str, filename: str)
|
81 | 144 | if await aiofiles.os.path.exists(stored_path):
|
82 | 145 | return stored_path
|
83 | 146 |
|
84 |
| - async with httpx.AsyncClient() as client: |
85 |
| - metadata = await package_metadata(config, client, package) |
86 |
| - |
87 |
| - # Parsing versions from non-wheel Python packages isn't perfectly |
88 |
| - # reliable, so just search through all releases until we find a |
89 |
| - # matching file. |
90 |
| - for file_ in itertools.chain.from_iterable(metadata['releases'].values()): |
91 |
| - if file_['filename'] == filename: |
92 |
| - url = urllib.parse.urljoin(config.pypi_url, file_['url']) |
93 |
| - break |
94 |
| - else: |
95 |
| - raise CannotFindFileError(package, filename) |
| 147 | + filename_to_url = await config.repo.files_for_package(package) |
| 148 | + try: |
| 149 | + url = filename_to_url[filename] |
| 150 | + except KeyError: |
| 151 | + raise CannotFindFileError(package, filename) |
96 | 152 |
|
97 |
| - await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True) |
| 153 | + await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True) |
98 | 154 |
|
| 155 | + async with httpx.AsyncClient() as client: |
99 | 156 | async with _atomic_file(stored_path) as f:
|
100 | 157 | async with client.stream('GET', url) as resp:
|
101 | 158 | resp.raise_for_status()
|
|
0 commit comments