Skip to content

Commit 2b96010

Browse files
committed
replace all usage of pooch with urllib
1 parent fb82d31 commit 2b96010

File tree

2 files changed

+154
-78
lines changed

2 files changed

+154
-78
lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ dependencies = [
4848
'numpy',
4949
'matplotlib',
5050
'yt>=4.0.2',
51-
'pooch',
5251
"importlib_resources;python_version<'3.9'"
5352
]
5453

src/python/pygrackle/utilities/grdata.py

Lines changed: 154 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,31 @@
11
#!/usr/bin/env python3
22

3-
# A tool for managing grackle data files. This should be usable as
4-
# -> a standalone command line tool (when pygrackle isn't installed)
5-
# -> as a part of pygrackle.
3+
# A tool for managing grackle data files. (More details provided after imports)
64
#
7-
# There is 1 snag to converting this to file to a standalone command-line-tool
8-
# (that can be used without installing any packages):
5+
# This file should be usable as both (i) a part of pygrackle and (ii) a standalone
6+
# command line tool (when pygrackle IS NOT installed)
97
#
10-
# -> Currently, we are using the external pooch package.
11-
# -> I originaly thought it would be a great tool for helping us manage
12-
# datafiles. But our custom deduplication strategy required me to
13-
# implement a bunch of functionality that makes a lot of pooch's features
14-
# unnecessary.
15-
# -> We have 2 choices:
16-
# 1. We could totally remove pooch and use urllib instead
17-
# 2. We could package this program as a
18-
# `zipapp <https://docs.python.org/3/library/zipapp.html#module-zipapp>`_.
19-
8+
# To support scenario 1, this CAN ONLY use python's built in modules.
209

2110
import argparse
22-
from contextlib import ExitStack
11+
from contextlib import contextmanager, ExitStack
2312
import filecmp
2413
import hashlib
2514
import io
15+
from math import log10
2616
import os
2717
import re
2818
import shutil
2919
import stat
3020
import sys
3121
import traceback
3222
from typing import IO, NamedTuple, Union
23+
import urllib.request
24+
from urllib.error import URLError, HTTPError
3325
import warnings
3426

35-
import pooch # external import
36-
3727
if (sys.version_info.major, sys.version_info.minor) < (3, 6, 1):
28+
# 3.6.0 doesn't support all NamedTuple features
3829
raise RuntimeError("python 3.6.1 or newer is required")
3930

4031
# Down below, we provide a detailed description that serves 3 purposes
@@ -270,20 +261,20 @@
270261
# -> See the end of this file for what that entails
271262

272263

273-
def _use_progress_bar():
274-
try:
275-
import tqdm # noqa: F401
276-
277-
return True
278-
except ImportError:
279-
return False
264+
# define some constants
265+
# =====================
280266

267+
# default chunksize used for file operations
268+
_CHUNKSIZE = 4096
281269

282-
_PROGRESSBAR = _use_progress_bar()
283-
270+
# the name of the subdirectory in a data-store where we handle deduplication
271+
_OBJECT_STORE_SUBDIR = "object-store"
284272

273+
# Alternative to `None` for specifying that a value wasn't specified. This is primarily
274+
# used as a default value for an optional command line flag that requires an argument.
275+
# In that case, a value of _UNSPECIFIED means that the flag wasn't specified while a
276+
# value of `None` means that the flag doesn't have an associated value.
285277
_UNSPECIFIED = object()
286-
_OBJECT_STORE_SUBDIR = "object-store"
287278

288279

289280
class GenericToolError(RuntimeError):
@@ -306,6 +297,100 @@ def _ensure_all_removed(fnames):
306297
continue
307298

308299

300+
_MAX_BAR = 160 * "="
301+
302+
303+
@contextmanager
304+
def _progress_bar(ncols, total_bytes, *, use_dummy=False):
305+
"""
306+
ContextManager that provides a function used for drawing/updating progress bars
307+
308+
If the program wasn't excuted from a shell, or the caller want to draw
309+
too few columns, the returned function does nothing.
310+
"""
311+
# the main template is '[<progress-bar>] <size>/<size> <unit>'
312+
# -> <progress-bar> is some fraction of _FULL_BAR and empty space
313+
# -> <size> describes the current/total download size (takes up to 6 characters)
314+
# -> <unit> is 1 or 2 characters
315+
# -> thus, we need 19 characters for everything other than <progress-bar>
316+
bar_len = min(len(_MAX_BAR), ncols - 19)
317+
318+
if use_dummy or (total_bytes <= 0) or (bar_len <= 0) or not sys.stdout.isatty():
319+
use_dummy = True
320+
321+
def _update(size):
322+
return None
323+
else:
324+
power_div_3 = int(log10(total_bytes)) // 3
325+
factor, unit = 1000.0**power_div_3, ("B", "KB", "MB", "GB")[power_div_3]
326+
fmt = "\r[{bar:{len}.{nfill}}] {size:.2f}" + f"/{total_bytes/factor:.2f} {unit}"
327+
328+
def _update(size):
329+
nfill = bar_len * int(size / total_bytes)
330+
val = fmt.format(bar=_MAX_BAR, len=bar_len, nfill=nfill, size=size / factor)
331+
print(val, end="", flush=True)
332+
333+
try:
334+
yield _update
335+
finally:
336+
# always execute this clause when exiting the context manager. If an exception
337+
# caused the exit, it will be re-raised after this clause
338+
if not use_dummy:
339+
print(flush=True)
340+
341+
342+
def _retrieve_url(url, dest, *, use_progress_bar=True, chunksize=_CHUNKSIZE):
343+
"""
344+
download the file from url to dest
345+
346+
Note
347+
----
348+
Online discussion about calling `response.read(chunksize)`, where
349+
`response` is the context manager object produced by
350+
`url.request.urlopen`, seems to strongly imply that this limits the
351+
amount of data read from the http request into memory at a given
352+
point in time. However, the documentation seems vague on this point.
353+
354+
This is unlikely to ever be a problem (the biggest file we need is
355+
currently under 10 Megabytes). However, if it does become a problem,
356+
we have 2 options:
357+
1. we could conditionally fall back to ``curl`` (cli tool) or
358+
``Requests`` (python package) if they are present on the system
359+
2. we could craft custom http requests
360+
"""
361+
ncols = shutil.get_terminal_size()[0] - 1
362+
req = urllib.request.Request(url)
363+
try:
364+
with ExitStack() as stack:
365+
# enter context managers for http-response, progress-bar, & output-file
366+
response = stack.enter_context(urllib.request.urlopen(req))
367+
total_bytes = int(response.headers.get("Content-Length", -1))
368+
update_progress = stack.enter_context(
369+
_progress_bar(ncols, total_bytes, use_dummy=not use_progress_bar)
370+
)
371+
out_file = stack.enter_context(open(dest, "wb"))
372+
373+
# write downloaded data to a file
374+
downloaded_bytes = 0
375+
while True:
376+
update_progress(downloaded_bytes)
377+
block = response.read(chunksize)
378+
if not block:
379+
break
380+
downloaded_bytes += len(block)
381+
out_file.write(block)
382+
except HTTPError as e:
383+
raise GenericToolError(
384+
f"The server couldn't fulfill the request for retrieving {fname} from "
385+
f"{url}.\nError code: {e.code}"
386+
)
387+
except URLError as e:
388+
raise GenericToolError(
389+
f"The server couldn't be reached while trying to retrieve {fname} from "
390+
f"{url}.\nError code: {e.code}"
391+
)
392+
393+
309394
class Fetcher(NamedTuple):
310395
"""Encodes information for fetching data files
311396
@@ -342,51 +427,41 @@ def __call__(self, fname, checksum, checksum_kind, dest_dir):
342427
343428
Notes
344429
-----
345-
It doesn't look too difficult to swap out the ``pooch.retrieve``
346-
functionality for custom logic that calls methods from python's
347-
builtin urllib module. This would simplify the process of shipping
348-
this functionality as a portable standalone script.
349-
350-
If we ever move away from pooch (e.g. to make this functionality
351-
easier to use in a portable standalone script), we need to ensure
352-
that the our new approach implements a similar procedure that
353-
they adopt where:
354-
1. any downloaded file is first put in a temporary location
355-
2. and then, only after we verify that the checksum is correct,
356-
we move the file to the downloads directory.
430+
We follow the following procedure (inspired by pooch):
431+
1. downloads the file to a temporary location
432+
2. verifies that the checksum is correct
433+
3. move the file to the appropriate destination
434+
435+
This provides robust behavior if the program is interupted. In
436+
principle, we could combine steps 1 and 2. But, there may be some
437+
minor benefits to keeping our procedure like this (theoretically,
438+
we may be more likely to catch corruption from harddrive hardware
439+
errors).
357440
"""
441+
src = os.path.join(self.base_path, fname)
442+
dst = os.path.join(dest_dir, fname)
443+
_pretty_log(f"-> fetching `{fname}`", indent_all=True)
444+
tmp_name = os.path.join(dest_dir, "_tempfile")
445+
# tmp_name can safely be removed if it exists (it only exists if this logic
446+
# previously crashed or was interupted by SIGKILL)
447+
_ensure_all_removed([tmp_name])
358448

359-
if self.holds_url:
360-
return pooch.retrieve(
361-
url=f"{self.base_path}/{fname}",
362-
fname=fname,
363-
known_hash=f"{checksum_kind}:{checksum}",
364-
path=dest_dir,
365-
progressbar=_PROGRESSBAR,
366-
)
367-
else:
368-
tmp_name = os.path.join(dest_dir, "_tempfile")
369-
# tmp_name can safely be removed if it exists (it only exists if this logic
370-
# previously crashed or was interupted by SIGKILL)
371-
_ensure_all_removed([tmp_name])
372-
try:
373-
src = os.path.join(self.base_path, fname)
374-
dst = os.path.join(dest_dir, fname)
375-
_pretty_log(
376-
f"retrieving {fname}:\n" f"-> from: {src}\n" f"-> to: {dst}"
377-
)
449+
try:
450+
if self.holds_url:
451+
_retrieve_url(src, tmp_name)
452+
else:
378453
# copy the file
379454
shutil.copyfile(src, tmp_name)
380-
if not matches_checksum(tmp_name, checksum_kind, checksum):
381-
if matches_checksum(src, checksum_kind, checksum):
382-
raise GenericToolError(
383-
f"while copying from {src}, data may have been corrupted"
384-
)
385-
raise GenericToolError(f"{src} does't have the expected checksum")
386-
os.rename(tmp_name, dst)
455+
if not matches_checksum(tmp_name, checksum_kind, checksum):
456+
if matches_checksum(src, checksum_kind, checksum):
457+
raise GenericToolError(
458+
f"while copying from {src}, data may have been corrupted"
459+
)
460+
raise GenericToolError(f"{src} does't have the expected checksum")
461+
os.rename(tmp_name, dst)
387462

388-
finally:
389-
_ensure_all_removed([tmp_name])
463+
finally:
464+
_ensure_all_removed([tmp_name])
390465

391466

392467
class DataStoreConfig(NamedTuple):
@@ -597,7 +672,7 @@ def standard_lockfile(data_store_config):
597672
return LockFileContext(os.path.join(data_store_config.data_dir, "lockfile"))
598673

599674

600-
def calc_checksum(fname, alg_name, *, chunksize=4096):
675+
def calc_checksum(fname, alg_name, *, chunksize=_CHUNKSIZE):
601676
"""Calculate the checksum for a given fname"""
602677
# construct the object to track intermediate state of the checksum
603678
# calculation as we stream through the data
@@ -619,13 +694,14 @@ def matches_checksum(fname, alg_name, checksum):
619694
return checksum == calc_checksum(fname, alg_name)
620695

621696

622-
def _pretty_log(arg):
697+
def _pretty_log(arg, *, indent_all=False):
623698
"""indent messages so it's clear when multiline messages are a single thought"""
624699
lines = arg.splitlines()
625-
if len(lines):
626-
print("\n".join([f"-- {lines[0]}"] + [f" {e}" for e in lines[1:]]))
700+
if len(lines) and not indent_all:
701+
formatted = [f"-- {lines[0]}"] + [f" {e}" for e in lines[1:]]
627702
else:
628-
print("")
703+
formatted = [f" {e}" for e in lines]
704+
print(*formatted, sep="\n")
629705

630706

631707
def _ensure_exists(path, content_description):
@@ -975,14 +1051,15 @@ def fetch_all(self, registry):
9751051

9761052
with lockfile_ctx:
9771053
num_fetched = 0
1054+
_pretty_log(f"preparing to fetch files from: {self.fetcher.base_path}")
9781055
for fname, full_checksum_str in registry.items():
9791056
any_work, _ = self._fetch_file(
9801057
fname, full_checksum_str, lockfile_ctx=lockfile_ctx
9811058
)
9821059
num_fetched += any_work
9831060

9841061
if num_fetched == 0:
985-
_pretty_log("no files needed to be downloaded")
1062+
_pretty_log("-> no files needed to be retrieved", indent_all=True)
9861063

9871064

9881065
def fetch_command(args, tool_config, data_store_config):
@@ -1467,14 +1544,14 @@ def make_config_objects(grackle_version, file_registry_file):
14671544
return tool_config, data_store_config
14681545

14691546

1547+
# Here, we define machinery employed when used as a standalone program
1548+
# ====================================================================
1549+
14701550
# to support installing this file as a standalone program, we will need to introduce the
14711551
# following procedure to the build-system:
14721552
# - treat this file as a template-file and configure it with CMake's
14731553
# ``configure_file`` command (or invoke ``configure_file.py`` under the classic
14741554
# build system) in order to substitute the names enclosed by the @ symbols
1475-
# - if we are still using pooch (or some other external package) we'll need to
1476-
# introduce logic to convert this into a zipapp (this is a special zip file that
1477-
# contains all dependencies that the python interpretter knows how to execute)
14781555
# - make resulting file executable (and maybe drop the .py suffix)
14791556
# - install it into the bin directory alongside the grackle libraries
14801557

0 commit comments

Comments
 (0)