1
1
#!/usr/bin/env python3
2
2
3
- # A tool for managing grackle data files. This should be usable as
4
- # -> a standalone command line tool (when pygrackle isn't installed)
5
- # -> as a part of pygrackle.
3
+ # A tool for managing grackle data files. (More details provided after imports)
6
4
#
7
- # There is 1 snag to converting this to file to a standalone command-line-tool
8
- # (that can be used without installing any packages):
5
+ # This file should be usable as both (i) a part of pygrackle and (ii) a standalone
6
+ # command line tool (when pygrackle IS NOT installed)
9
7
#
10
- # -> Currently, we are using the external pooch package.
11
- # -> I originaly thought it would be a great tool for helping us manage
12
- # datafiles. But our custom deduplication strategy required me to
13
- # implement a bunch of functionality that makes a lot of pooch's features
14
- # unnecessary.
15
- # -> We have 2 choices:
16
- # 1. We could totally remove pooch and use urllib instead
17
- # 2. We could package this program as a
18
- # `zipapp <https://docs.python.org/3/library/zipapp.html#module-zipapp>`_.
19
-
8
+ # To support scenario 1, this CAN ONLY use python's built in modules.
20
9
21
10
import argparse
22
- from contextlib import ExitStack
11
+ from contextlib import contextmanager , ExitStack
23
12
import filecmp
24
13
import hashlib
25
14
import io
15
+ from math import log10
26
16
import os
27
17
import re
28
18
import shutil
29
19
import stat
30
20
import sys
31
21
import traceback
32
22
from typing import IO , NamedTuple , Union
23
+ import urllib .request
24
+ from urllib .error import URLError , HTTPError
33
25
import warnings
34
26
35
- import pooch # external import
36
-
37
27
# Require python 3.6.1+ (3.6.0 doesn't support all NamedTuple features).
# NOTE: we must compare the first *three* version components. The previous
# comparison, (major, minor) < (3, 6, 1), compared a 2-tuple against a
# 3-tuple; a tuple prefix always compares less than the longer tuple, so
# every 3.6.x release (including 3.6.1 itself) was wrongly rejected.
if sys.version_info[:3] < (3, 6, 1):
    raise RuntimeError("python 3.6.1 or newer is required")
39
30
40
31
# Down below, we provide a detailed description that serves 3 purposes
270
261
# -> See the end of this file for what that entails
271
262
272
263
273
- def _use_progress_bar ():
274
- try :
275
- import tqdm # noqa: F401
276
-
277
- return True
278
- except ImportError :
279
- return False
264
+ # define some constants
265
+ # =====================
280
266
267
+ # default chunksize used for file operations
268
+ _CHUNKSIZE = 4096
281
269
282
- _PROGRESSBAR = _use_progress_bar ()
283
-
270
+ # the name of the subdirectory in a data-store where we handle deduplication
271
+ _OBJECT_STORE_SUBDIR = "object-store"
284
272
273
+ # Alternative to `None` for specifying that a value wasn't specified. This is primarily
274
+ # used as a default value for an optional command line flag that requires an argument.
275
+ # In that case, a value of _UNSPECIFIED means that the flag wasn't specified while a
276
+ # value of `None` means that the flag doesn't have an associated value.
285
277
_UNSPECIFIED = object ()
286
- _OBJECT_STORE_SUBDIR = "object-store"
287
278
288
279
289
280
class GenericToolError (RuntimeError ):
@@ -306,6 +297,100 @@ def _ensure_all_removed(fnames):
306
297
continue
307
298
308
299
300
+ _MAX_BAR = 160 * "="
301
+
302
+
303
+ @contextmanager
304
+ def _progress_bar (ncols , total_bytes , * , use_dummy = False ):
305
+ """
306
+ ContextManager that provides a function used for drawing/updating progress bars
307
+
308
+ If the program wasn't excuted from a shell, or the caller want to draw
309
+ too few columns, the returned function does nothing.
310
+ """
311
+ # the main template is '[<progress-bar>] <size>/<size> <unit>'
312
+ # -> <progress-bar> is some fraction of _FULL_BAR and empty space
313
+ # -> <size> describes the current/total download size (takes up to 6 characters)
314
+ # -> <unit> is 1 or 2 characters
315
+ # -> thus, we need 19 characters for everything other than <progress-bar>
316
+ bar_len = min (len (_MAX_BAR ), ncols - 19 )
317
+
318
+ if use_dummy or (total_bytes <= 0 ) or (bar_len <= 0 ) or not sys .stdout .isatty ():
319
+ use_dummy = True
320
+
321
+ def _update (size ):
322
+ return None
323
+ else :
324
+ power_div_3 = int (log10 (total_bytes )) // 3
325
+ factor , unit = 1000.0 ** power_div_3 , ("B" , "KB" , "MB" , "GB" )[power_div_3 ]
326
+ fmt = "\r [{bar:{len}.{nfill}}] {size:.2f}" + f"/{ total_bytes / factor :.2f} { unit } "
327
+
328
+ def _update (size ):
329
+ nfill = bar_len * int (size / total_bytes )
330
+ val = fmt .format (bar = _MAX_BAR , len = bar_len , nfill = nfill , size = size / factor )
331
+ print (val , end = "" , flush = True )
332
+
333
+ try :
334
+ yield _update
335
+ finally :
336
+ # always execute this clause when exiting the context manager. If an exception
337
+ # caused the exit, it will be re-raised after this clause
338
+ if not use_dummy :
339
+ print (flush = True )
340
+
341
+
342
def _retrieve_url(url, dest, *, use_progress_bar=True, chunksize=_CHUNKSIZE):
    """
    download the file from url to dest

    Parameters
    ----------
    url : str
        The location of the file to download.
    dest : str
        Path where the downloaded file is written.
    use_progress_bar : bool
        Whether to draw a progress bar on stdout (a dummy is used regardless
        when stdout isn't a tty or the download size is unknown).
    chunksize : int
        Number of bytes read from the http response at a time.

    Raises
    ------
    GenericToolError
        When the server can't fulfill the request or can't be reached.

    Note
    ----
    Online discussion about calling `response.read(chunksize)`, where
    `response` is the context manager object produced by
    `url.request.urlopen`, seems to strongly imply that this limits the
    amount of data read from the http request into memory at a given
    point in time. However, the documentation seems vague on this point.

    This is unlikely to ever be a problem (the biggest file we need is
    currently under 10 Megabytes). However, if it does become a problem,
    we have 2 options:
      1. we could conditionally fall back to ``curl`` (cli tool) or
         ``Requests`` (python package) if they are present on the system
      2. we could craft custom http requests
    """
    # name used in error messages (the old code referenced an undefined
    # `fname` variable, which raised NameError inside the except clauses)
    fname = os.path.basename(dest)
    ncols = shutil.get_terminal_size()[0] - 1
    req = urllib.request.Request(url)
    try:
        with ExitStack() as stack:
            # enter context managers for http-response, progress-bar, & output-file
            response = stack.enter_context(urllib.request.urlopen(req))
            total_bytes = int(response.headers.get("Content-Length", -1))
            update_progress = stack.enter_context(
                _progress_bar(ncols, total_bytes, use_dummy=not use_progress_bar)
            )
            out_file = stack.enter_context(open(dest, "wb"))

            # stream the downloaded data to the output file
            downloaded_bytes = 0
            while True:
                update_progress(downloaded_bytes)
                block = response.read(chunksize)
                if not block:
                    break
                downloaded_bytes += len(block)
                out_file.write(block)
    except HTTPError as e:
        raise GenericToolError(
            f"The server couldn't fulfill the request for retrieving {fname} from "
            f"{url}.\nError code: {e.code}"
        ) from e
    except URLError as e:
        # BUGFIX: URLError has no `code` attribute (that's only on its
        # HTTPError subclass); report `reason` instead
        raise GenericToolError(
            f"The server couldn't be reached while trying to retrieve {fname} from "
            f"{url}.\nReason: {e.reason}"
        ) from e
392
+
393
+
309
394
class Fetcher (NamedTuple ):
310
395
"""Encodes information for fetching data files
311
396
@@ -342,51 +427,41 @@ def __call__(self, fname, checksum, checksum_kind, dest_dir):
342
427
343
428
Notes
344
429
-----
345
- It doesn't look too difficult to swap out the ``pooch.retrieve``
346
- functionality for custom logic that calls methods from python's
347
- builtin urllib module. This would simplify the process of shipping
348
- this functionality as a portable standalone script.
349
-
350
- If we ever move away from pooch (e.g. to make this functionality
351
- easier to use in a portable standalone script), we need to ensure
352
- that the our new approach implements a similar procedure that
353
- they adopt where:
354
- 1. any downloaded file is first put in a temporary location
355
- 2. and then, only after we verify that the checksum is correct,
356
- we move the file to the downloads directory.
430
+ We follow the following procedure (inspired by pooch):
431
+ 1. downloads the file to a temporary location
432
+ 2. verifies that the checksum is correct
433
+ 3. move the file to the appropriate destination
434
+
435
+ This provides robust behavior if the program is interrupted. In
436
+ principle, we could combine steps 1 and 2. But, there may be some
437
+ minor benefits to keeping our procedure like this (theoretically,
438
+ we may be more likely to catch corruption from harddrive hardware
439
+ errors).
357
440
"""
441
+ src = os .path .join (self .base_path , fname )
442
+ dst = os .path .join (dest_dir , fname )
443
+ _pretty_log (f"-> fetching `{ fname } `" , indent_all = True )
444
+ tmp_name = os .path .join (dest_dir , "_tempfile" )
445
+ # tmp_name can safely be removed if it exists (it only exists if this logic
446
+ # previously crashed or was interrupted by SIGKILL)
447
+ _ensure_all_removed ([tmp_name ])
358
448
359
- if self .holds_url :
360
- return pooch .retrieve (
361
- url = f"{ self .base_path } /{ fname } " ,
362
- fname = fname ,
363
- known_hash = f"{ checksum_kind } :{ checksum } " ,
364
- path = dest_dir ,
365
- progressbar = _PROGRESSBAR ,
366
- )
367
- else :
368
- tmp_name = os .path .join (dest_dir , "_tempfile" )
369
- # tmp_name can safely be removed if it exists (it only exists if this logic
370
- # previously crashed or was interupted by SIGKILL)
371
- _ensure_all_removed ([tmp_name ])
372
- try :
373
- src = os .path .join (self .base_path , fname )
374
- dst = os .path .join (dest_dir , fname )
375
- _pretty_log (
376
- f"retrieving { fname } :\n " f"-> from: { src } \n " f"-> to: { dst } "
377
- )
449
+ try :
450
+ if self .holds_url :
451
+ _retrieve_url (src , tmp_name )
452
+ else :
378
453
# copy the file
379
454
shutil .copyfile (src , tmp_name )
380
- if not matches_checksum (tmp_name , checksum_kind , checksum ):
381
- if matches_checksum (src , checksum_kind , checksum ):
382
- raise GenericToolError (
383
- f"while copying from { src } , data may have been corrupted"
384
- )
385
- raise GenericToolError (f"{ src } does't have the expected checksum" )
386
- os .rename (tmp_name , dst )
455
+ if not matches_checksum (tmp_name , checksum_kind , checksum ):
456
+ if matches_checksum (src , checksum_kind , checksum ):
457
+ raise GenericToolError (
458
+ f"while copying from { src } , data may have been corrupted"
459
+ )
460
+ raise GenericToolError (f"{ src } does't have the expected checksum" )
461
+ os .rename (tmp_name , dst )
387
462
388
- finally :
389
- _ensure_all_removed ([tmp_name ])
463
+ finally :
464
+ _ensure_all_removed ([tmp_name ])
390
465
391
466
392
467
class DataStoreConfig (NamedTuple ):
@@ -597,7 +672,7 @@ def standard_lockfile(data_store_config):
597
672
return LockFileContext (os .path .join (data_store_config .data_dir , "lockfile" ))
598
673
599
674
600
- def calc_checksum (fname , alg_name , * , chunksize = 4096 ):
675
+ def calc_checksum (fname , alg_name , * , chunksize = _CHUNKSIZE ):
601
676
"""Calculate the checksum for a given fname"""
602
677
# construct the object to track intermediate state of the checksum
603
678
# calculation as we stream through the data
@@ -619,13 +694,14 @@ def matches_checksum(fname, alg_name, checksum):
619
694
return checksum == calc_checksum (fname , alg_name )
620
695
621
696
622
def _pretty_log(arg, *, indent_all=False):
    """indent messages so it's clear when multiline messages are a single thought

    Parameters
    ----------
    arg : str
        The (possibly multiline) message to print.
    indent_all : bool
        When True, indent every line uniformly instead of marking the first
        line with a leading ``-- ``.
    """
    lines = arg.splitlines()
    if len(lines) and not indent_all:
        # mark the first line; indent continuation lines to line up under it
        formatted = [f"-- {lines[0]}"] + [f"   {e}" for e in lines[1:]]
    else:
        # NOTE(review): an empty `arg` falls through here and prints a single
        # blank line (formatted == []), matching the pre-diff behavior
        formatted = [f"   {e}" for e in lines]
    print(*formatted, sep="\n")
629
705
630
706
631
707
def _ensure_exists (path , content_description ):
@@ -975,14 +1051,15 @@ def fetch_all(self, registry):
975
1051
976
1052
with lockfile_ctx :
977
1053
num_fetched = 0
1054
+ _pretty_log (f"preparing to fetch files from: { self .fetcher .base_path } " )
978
1055
for fname , full_checksum_str in registry .items ():
979
1056
any_work , _ = self ._fetch_file (
980
1057
fname , full_checksum_str , lockfile_ctx = lockfile_ctx
981
1058
)
982
1059
num_fetched += any_work
983
1060
984
1061
if num_fetched == 0 :
985
- _pretty_log ("no files needed to be downloaded" )
1062
+ _pretty_log ("-> no files needed to be retrieved" , indent_all = True )
986
1063
987
1064
988
1065
def fetch_command (args , tool_config , data_store_config ):
@@ -1467,14 +1544,14 @@ def make_config_objects(grackle_version, file_registry_file):
1467
1544
return tool_config , data_store_config
1468
1545
1469
1546
1547
+ # Here, we define machinery employed when used as a standalone program
1548
+ # ====================================================================
1549
+
1470
1550
# to support installing this file as a standalone program, we will need to introduce the
1471
1551
# following procedure to the build-system:
1472
1552
# - treat this file as a template-file and configure it with CMake's
1473
1553
# ``configure_file`` command (or invoke ``configure_file.py`` under the classic
1474
1554
# build system) in order to substitute the names enclosed by the @ symbols
1475
- # - if we are still using pooch (or some other external package) we'll need to
1476
- # introduce logic to convert this into a zipapp (this is a special zip file that
1477
- # contains all dependencies that the python interpretter knows how to execute)
1478
1555
# - make resulting file executable (and maybe drop the .py suffix)
1479
1556
# - install it into the bin directory alongside the grackle libraries
1480
1557
0 commit comments