Skip to content

Commit d3ecc67

Browse files
authored
Merge pull request #10 from chriskuehl/support-simple-html
Support traditional "simple" HTML registries
2 parents 3c1da1b + 2aaea3e commit d3ecc67

File tree

6 files changed

+197
-48
lines changed

6 files changed

+197
-48
lines changed

.pre-commit-config.yaml

+2-3
Original file line number | Diff line number | Diff line change
@@ -14,17 +14,16 @@ repos:
1414
rev: v3.12.0
1515
hooks:
1616
- id: reorder-python-imports
17-
args: ['--py38-plus']
17+
args: ['--py39-plus']
1818
- repo: https://github.com/asottile/add-trailing-comma
1919
rev: v3.1.0
2020
hooks:
2121
- id: add-trailing-comma
22-
args: ['--py36-plus']
2322
- repo: https://github.com/asottile/pyupgrade
2423
rev: v3.15.2
2524
hooks:
2625
- id: pyupgrade
27-
args: ['--py38-plus']
26+
args: ['--py310-plus']
2827
- repo: https://github.com/hhatto/autopep8
2928
rev: v2.1.0
3029
hooks:

README.md

+13
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,17 @@ You can set these environment variables to configure the server:
5656

5757
* `PYPI_BROWSER_PYPI_URL`: URL for the PyPI server to use (defaults to
5858
`https://pypi.org`)
59+
60+
If your registry supports the pypi.org-compatible JSON API (e.g.
61+
`{registry}/pypi/{package}/json`), specify your base registry URL without
62+
appending `/simple` (e.g. `https://my-registry`).
63+
64+
If your registry only supports the traditional HTML "simple" index, specify
65+
the registry URL with `/simple` at the end (e.g.
66+
`https://my-registry/simple`).
67+
68+
Note that the [PEP 691][pep691] JSON-based "simple" API is not yet supported.
69+
5970
* `PYPI_BROWSER_PACKAGE_CACHE_PATH`: Filesystem path to use for caching
6071
downloaded files. This will grow forever (the app does not clean it up) so
6172
you may want to use `tmpreaper` or similar to manage its size.
@@ -77,3 +88,5 @@ $ make start-dev
7788
```
7889

7990
to run a copy of the application locally with hot reloading enabled.
91+
92+
[pep691]: https://peps.python.org/pep-0691/

pypi_browser/app.py

+19-3
Original file line number | Diff line number | Diff line change
@@ -74,9 +74,16 @@ async def dispatch(
7474

7575

7676
config = starlette.config.Config()
77+
pypi_url = config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org').rstrip('/')
78+
repo: pypi.PythonRepository
79+
if pypi_url.endswith('/simple'):
80+
repo = pypi.SimpleRepository(pypi_url)
81+
else:
82+
repo = pypi.LegacyJsonRepository(pypi_url)
83+
7784
pypi_config = pypi.PyPIConfig(
85+
repo=repo,
7886
cache_path=config('PYPI_BROWSER_PACKAGE_CACHE_PATH', default='/tmp'),
79-
pypi_url=config('PYPI_BROWSER_PYPI_URL', default='https://pypi.org'),
8087
)
8188

8289
templates = Jinja2Templates(
@@ -115,16 +122,25 @@ async def package(request: Request) -> Response:
115122
return RedirectResponse(request.url_for('package', package=normalized_package_name))
116123

117124
try:
118-
version_to_files = await pypi.files_for_package(pypi_config, package_name)
125+
version_to_files = await pypi.files_by_version(pypi_config, package_name)
119126
except pypi.PackageDoesNotExist:
120127
return PlainTextResponse(
121128
f'Package {package_name!r} does not exist on PyPI.',
122129
status_code=404,
123130
)
124131
else:
132+
def _version_sort_key(version: str | None) -> packaging.version.Version:
133+
if version is not None:
134+
try:
135+
return packaging.version.parse(version)
136+
except packaging.version.InvalidVersion:
137+
pass
138+
# Not really correct, but just throw everything we can't parse at the bottom.
139+
return packaging.version.Version('0.0.0')
140+
125141
version_to_files_sorted = sorted(
126142
version_to_files.items(),
127-
key=lambda item: packaging.version.parse(item[0]),
143+
key=lambda item: _version_sort_key(item[0]),
128144
reverse=True,
129145
)
130146
return templates.TemplateResponse(

pypi_browser/packaging.py

+62-6
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,66 @@
1212
from types import TracebackType
1313

1414

15+
# Copied from distlib/wheel.py
16+
WHEEL_FILENAME_RE = re.compile(
17+
r'''
18+
(?P<nm>[^-]+)
19+
-(?P<vn>\d+[^-]*)
20+
(-(?P<bn>\d+[^-]*))?
21+
-(?P<py>\w+\d+(\.\w+\d+)*)
22+
-(?P<bi>\w+)
23+
-(?P<ar>\w+(\.\w+)*)
24+
\.whl$
25+
''', re.IGNORECASE | re.VERBOSE,
26+
)
27+
28+
1529
def pep426_normalize(package_name: str) -> str:
1630
return re.sub(r'[-_.]+', '-', package_name.strip()).lower()
1731

1832

33+
def _remove_extension(name: str) -> str:
34+
if name.endswith(('gz', 'bz2')):
35+
name, _ = name.rsplit('.', 1)
36+
name, _ = name.rsplit('.', 1)
37+
return name
38+
39+
40+
def guess_version_from_filename(filename: str) -> str | None:
41+
# Inspired by https://github.com/chriskuehl/dumb-pypi/blob/a71c3cfeba6/dumb_pypi/main.py#L56
42+
if filename.endswith('.whl'):
43+
# TODO: Switch to packaging.utils.parse_wheel_filename which enforces
44+
# PEP440 versions for wheels.
45+
m = WHEEL_FILENAME_RE.match(filename)
46+
if m is not None:
47+
return m.group('vn')
48+
else:
49+
raise ValueError(f'Invalid package name: {filename}')
50+
else:
51+
# These don't have a well-defined format like wheels do, so they are
52+
# sort of "best effort", with lots of tests to back them up.
53+
# The most important thing is to correctly parse the name.
54+
name = _remove_extension(filename)
55+
version = None
56+
57+
if '-' in name:
58+
if name.count('-') == 1:
59+
name, version = name.split('-')
60+
else:
61+
parts = name.split('-')
62+
for i in range(len(parts) - 1, 0, -1):
63+
part = parts[i]
64+
if '.' in part and re.search('[0-9]', part):
65+
name, version = '-'.join(parts[0:i]), '-'.join(parts[i:])
66+
67+
# Possible with poorly-named files.
68+
if len(name) <= 0:
69+
raise ValueError(f'Invalid package name: {filename}')
70+
71+
assert version is None or len(version) > 0, version
72+
return version
73+
74+
1975
class UnsupportedPackageType(Exception):
2076
pass
2177

@@ -38,7 +94,7 @@ class PackageEntry:
3894
size: int
3995

4096

41-
def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]:
97+
def _package_entries_from_zipfile(path: str) -> set[PackageEntry]:
4298
with zipfile.ZipFile(path) as zf:
4399
return {
44100
PackageEntry(
@@ -51,7 +107,7 @@ def _package_entries_from_zipfile(path: str) -> typing.Set[PackageEntry]:
51107
}
52108

53109

54-
def _package_entries_from_tarball(path: str) -> typing.Set[PackageEntry]:
110+
def _package_entries_from_tarball(path: str) -> set[PackageEntry]:
55111
with tarfile.open(path) as tf:
56112
return {
57113
PackageEntry(
@@ -76,9 +132,9 @@ async def __aenter__(self) -> 'AsyncArchiveFile':
76132

77133
async def __aexit__(
78134
self,
79-
exc_t: typing.Optional[typing.Type[BaseException]],
80-
exc_v: typing.Optional[BaseException],
81-
exc_tb: typing.Optional[TracebackType],
135+
exc_t: type[BaseException] | None,
136+
exc_v: BaseException | None,
137+
exc_tb: TracebackType | None,
82138
) -> None:
83139
await asyncio.to_thread(self.file_.close)
84140

@@ -117,7 +173,7 @@ def from_path(cls, path: str) -> 'Package':
117173
path=path,
118174
)
119175

120-
async def entries(self) -> typing.Set[PackageEntry]:
176+
async def entries(self) -> set[PackageEntry]:
121177
if self.package_format is PackageFormat.ZIPFILE:
122178
return await asyncio.to_thread(_package_entries_from_zipfile, self.path)
123179
elif self.package_format is PackageFormat.TARBALL:

pypi_browser/pypi.py

+92-35
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
1+
import abc
12
import base64
3+
import collections
24
import contextlib
35
import dataclasses
6+
import html.parser
47
import itertools
58
import os.path
69
import typing
@@ -9,38 +12,98 @@
912
import aiofiles.os
1013
import httpx
1114

15+
from pypi_browser import packaging
16+
17+
18+
class PythonRepository(abc.ABC):
19+
20+
@abc.abstractmethod
21+
async def files_for_package(self, package_name: str) -> dict[str, str]:
22+
"""Return mapping from filename to file URL for files in a package."""
23+
24+
25+
class HTMLAnchorParser(html.parser.HTMLParser):
26+
anchors: set[str]
27+
28+
def __init__(self) -> None:
29+
super().__init__()
30+
self.anchors = set()
31+
32+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
33+
if tag == 'a':
34+
if href := dict(attrs).get('href'):
35+
self.anchors.add(href)
36+
1237

1338
@dataclasses.dataclass(frozen=True)
14-
class PyPIConfig:
15-
cache_path: str
39+
class SimpleRepository(PythonRepository):
40+
"""Old-style "simple" PyPI registry serving HTML files."""
41+
# TODO: Also handle PEP691 JSON simple repositories.
1642
pypi_url: str
1743

44+
async def files_for_package(self, package_name: str) -> dict[str, str]:
45+
async with httpx.AsyncClient() as client:
46+
resp = await client.get(
47+
f'{self.pypi_url}/{package_name}',
48+
follow_redirects=True,
49+
)
50+
if resp.status_code == 404:
51+
raise PackageDoesNotExist(package_name)
52+
parser = HTMLAnchorParser()
53+
parser.feed(resp.text)
1854

19-
class PackageDoesNotExist(Exception):
20-
pass
55+
def clean_url(url: str) -> str:
56+
parsed = urllib.parse.urlparse(urllib.parse.urljoin(str(resp.url), url))
57+
return parsed._replace(fragment='').geturl()
2158

59+
return {
60+
(urllib.parse.urlparse(url).path).split('/')[-1]: clean_url(url)
61+
for url in parser.anchors
62+
}
2263

23-
async def package_metadata(
24-
config: PyPIConfig,
25-
client: httpx.AsyncClient,
26-
package: str,
27-
) -> typing.Dict[typing.Any, typing.Any]:
28-
resp = await client.get(f'{config.pypi_url}/pypi/{package}/json')
29-
if resp.status_code == 404:
30-
raise PackageDoesNotExist(package)
31-
resp.raise_for_status()
32-
return resp.json()
3364

65+
@dataclasses.dataclass(frozen=True)
66+
class LegacyJsonRepository(PythonRepository):
67+
"""Non-standardized JSON API compatible with pypi.org's /pypi/*/json endpoints."""
68+
pypi_url: str
3469

35-
async def files_for_package(config: PyPIConfig, package: str) -> typing.Dict[str, typing.Set[str]]:
36-
async with httpx.AsyncClient() as client:
37-
metadata = await package_metadata(config, client, package)
70+
async def files_for_package(self, package_name: str) -> dict[str, str]:
71+
async with httpx.AsyncClient() as client:
72+
resp = await client.get(
73+
f'{self.pypi_url}/pypi/{package_name}/json',
74+
follow_redirects=True,
75+
)
76+
if resp.status_code == 404:
77+
raise PackageDoesNotExist(package_name)
78+
resp.raise_for_status()
79+
return {
80+
file_['filename']: urllib.parse.urljoin(str(resp.url), file_['url'])
81+
for file_ in itertools.chain.from_iterable(resp.json()['releases'].values())
82+
}
3883

39-
return {
40-
version: {file_['filename'] for file_ in files}
41-
for version, files in metadata['releases'].items()
42-
if len(files) > 0
43-
}
84+
85+
@dataclasses.dataclass(frozen=True)
86+
class PyPIConfig:
87+
repo: PythonRepository
88+
cache_path: str
89+
90+
91+
class PackageDoesNotExist(Exception):
92+
pass
93+
94+
95+
async def files_by_version(config: PyPIConfig, package: str) -> dict[str | None, set[str]]:
96+
ret = collections.defaultdict(set)
97+
for filename in await config.repo.files_for_package(package):
98+
try:
99+
version = packaging.guess_version_from_filename(filename)
100+
except ValueError:
101+
# Possible with some very poorly-formed packages that used to be
102+
# allowed on PyPI. Just skip them when this happens.
103+
pass
104+
else:
105+
ret[version].add(filename)
106+
return ret
44107

45108

46109
class CannotFindFileError(Exception):
@@ -81,21 +144,15 @@ async def downloaded_file_path(config: PyPIConfig, package: str, filename: str)
81144
if await aiofiles.os.path.exists(stored_path):
82145
return stored_path
83146

84-
async with httpx.AsyncClient() as client:
85-
metadata = await package_metadata(config, client, package)
86-
87-
# Parsing versions from non-wheel Python packages isn't perfectly
88-
# reliable, so just search through all releases until we find a
89-
# matching file.
90-
for file_ in itertools.chain.from_iterable(metadata['releases'].values()):
91-
if file_['filename'] == filename:
92-
url = urllib.parse.urljoin(config.pypi_url, file_['url'])
93-
break
94-
else:
95-
raise CannotFindFileError(package, filename)
147+
filename_to_url = await config.repo.files_for_package(package)
148+
try:
149+
url = filename_to_url[filename]
150+
except KeyError:
151+
raise CannotFindFileError(package, filename)
96152

97-
await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)
153+
await aiofiles.os.makedirs(os.path.dirname(stored_path), exist_ok=True)
98154

155+
async with httpx.AsyncClient() as client:
99156
async with _atomic_file(stored_path) as f:
100157
async with client.stream('GET', url) as resp:
101158
resp.raise_for_status()

pypi_browser/templates/package.html

+9-1
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,15 @@ <h1 class="font-monospace">{{package}}</h1>
2121
</p>
2222
{% for version, files in version_to_files %}
2323
<div class="card bg-light mb-3">
24-
<div class="card-header"><h5 class="mb-0">{{version}}</h5></div>
24+
<div class="card-header">
25+
<h5 class="mb-0">
26+
{% if version is not none %}
27+
{{version}}
28+
{% else %}
29+
(unparseable version)
30+
{% endif %}
31+
</h5>
32+
</div>
2533
<div class="list-group list-group-flush">
2634
{% for file in files|sort %}
2735
<a class="list-group-item list-group-item-action" href="{{url_for('package_file', package=package, filename=file)}}">

0 commit comments

Comments
 (0)