Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local zipfile support #57

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -639,15 +639,16 @@ def swap_references(
yield item


def start_parser(filename, zip_file=None) -> Generator:
    """
    Lazily yield ijson ``(prefix, event, value)`` parse events for an MRF.

    :param filename: path or URL handed to ``JSONOpen``; when ``zip_file``
        is given, the name of the member inside that archive.
    :param zip_file: optional ZIP archive handed to ``JSONOpen``. Defaults
        to None so existing one-argument callers keep working.
    :return: generator over the events produced by ``ijson.parse``.
    """
    with JSONOpen(filename, zip_file) as f:
        # use_float=True asks ijson for floats instead of Decimal values
        yield from ijson.parse(f, use_float=True)


def in_network_file_to_csv(
url: str,
out_dir: str,
file: str | None = None,
zip_file: str | None = None,
code_filter: set | None = None,
npi_filter: set | None = None,
) -> None:
Expand All @@ -673,7 +674,7 @@ def in_network_file_to_csv(
ref_map = None

metadata = ijson.ObjectBuilder()
parser = start_parser(file)
parser = start_parser(file, zip_file)

file_row = file_row_from_url(url)
file_row['url'] = url
Expand All @@ -695,7 +696,7 @@ def in_network_file_to_csv(
except StopIteration:
if completed: break
if ref_map is None: ref_map = {}
parser = start_parser(file)
parser = start_parser(file, zip_file)
ffwd(parser, to_prefix='', to_value='in_network')
prefix, event, value = ('', 'map_key', 'in_network')
prepend(('', 'map_key', 'in_network'), parser)
Expand Down
96 changes: 56 additions & 40 deletions transparency-in-coverage/python/mrfutils/src/mrfutils/helpers.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import csv
import gzip
import hashlib
import io
import json
import logging
import os
import zipfile
from itertools import chain
from pathlib import Path
from urllib.parse import urlparse

import requests

from mrfutils.exceptions import InvalidMRF

log = logging.getLogger('mrfutils')
log.setLevel(logging.INFO)
log = logging.getLogger('flatteners')
# log.setLevel(logging.DEBUG)


def prepend(value, iterator):
Expand All @@ -37,65 +38,80 @@ def peek(iterator):

class JSONOpen:
"""
Context manager for opening JSON(.gz) MRFs.
Context manager for opening JSON(.gz/.zip) MRFs.
Usage:
>>> with JSONOpen('localfile.json') as f:
or
>>> with JSONOpen(some_json_url) as f:
including both zipped and unzipped files.
"""

def __init__(self, filename, zip_file=None):
    """
    Record the target and work out how it has to be opened.

    :param filename: local path or http(s) URL of the MRF; when
        ``zip_file`` is given, the member name inside that archive.
    :param zip_file: optional ZIP archive that holds ``filename``.
    :raises InvalidMRF: if no ``zip_file`` is given and the suffix of
        ``filename`` does not end in .json, .json.gz or .zip.
    """
    self.filename = filename
    self.zip_file = zip_file
    self.f = None
    self.r = None
    # Initialised like the other handles so the attribute always exists
    self.s = None
    self.is_remote = None

    if self.zip_file:
        # Member of an explicitly supplied archive: nothing to sniff
        self.suffix = ".zip"
        self.is_remote = False
        return

    parsed_url = urlparse(self.filename)
    self.suffix = ''.join(Path(parsed_url.path).suffixes)
    if not self.suffix:
        # Fall back to the query string when the path carries no suffix
        self.suffix = ''.join(Path(parsed_url.query).suffixes)

    # endswith (not ==) tolerates names with extra dots such as
    # insurer.stuff.json.gz
    if not self.suffix.endswith(('.json.gz', '.json', '.zip')):
        raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}')

    self.is_remote = parsed_url.scheme in ('http', 'https')

def __enter__(self):
    """
    Open the MRF and return a binary file-like object over its JSON.

    Dispatch is on the suffix computed in ``__init__``:

    * ``.zip``, remote: the archive is downloaded fully into memory and
      its first member is opened.
    * ``.zip``, local with ``zip_file`` set: member ``filename`` of that
      archive is opened.
    * ``.zip``, local without ``zip_file``: ``filename`` is itself the
      archive and its first member is opened.
    * ``.json.gz`` / ``.json``: streamed over HTTP when remote, otherwise
      opened from disk.

    :return: the opened file object (also stored on ``self.f``).
    :raises InvalidMRF: on an unsupported suffix or an empty ZIP archive.
    """
    if self.suffix.endswith('.zip'):
        if self.is_remote:
            # Download the zip file and store it in memory
            response = requests.get(self.filename)
            response.raise_for_status()
            archive_source = io.BytesIO(response.content)
        else:
            # A bare local ``*.zip`` path arrives with zip_file unset;
            # in that case the filename is the archive itself.
            archive_source = self.zip_file or self.filename

        # NOTE(review): the member handle is used after this ``with``
        # closes the ZipFile object -- confirm this is supported on the
        # targeted Python versions.
        with zipfile.ZipFile(archive_source) as z:
            if self.zip_file:
                inner_filename = self.filename
            else:
                # No member was named: open the first file in the zip
                members = z.namelist()
                if not members:
                    raise InvalidMRF(f'ZIP archive is empty: {self.filename=}')
                inner_filename = members[0]
            self.f = z.open(inner_filename)

    # endswith is used to protect against the case where the filename
    # contains lots of dots, e.g. insurer.stuff.json.gz
    elif self.suffix.endswith('.json.gz'):
        if self.is_remote:
            self.s = requests.Session()
            self.r = self.s.get(self.filename, stream=True)
            self.f = gzip.GzipFile(fileobj=self.r.raw)
        else:
            self.f = gzip.open(self.filename, 'rb')

    elif self.suffix.endswith('.json'):
        if self.is_remote:
            self.s = requests.Session()
            self.r = self.s.get(self.filename, stream=True)
            # Let urllib3 undo any transfer-level content encoding
            self.r.raw.decode_content = True
            self.f = self.r.raw
        else:
            self.f = open(self.filename, 'rb')

    else:
        raise InvalidMRF(f'Suffix not JSON or ZIP: {self.filename=} {self.suffix=}')

    log.info(f'Opened file: {self.filename}')
    return self.f


def __exit__(self, exc_type, exc_val, exc_tb):
if self.is_remote:
# ZIP files do not use sessions and are thus not closable
if self.is_remote and not self.suffix.endswith('.zip'):
self.s.close()
self.r.close()

Expand Down