# pipelines/introduction-to-generic-pipelines/load_data.py

#
# Copyright 2018-2023 Elyra Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import tarfile
from pathlib import Path
from urllib.parse import urlparse

import requests


def _is_tar_content_type(content_type):
    """Return True if a Content-Type header value denotes a tar archive."""
    # The header may carry parameters (e.g. '; charset=binary') and media
    # types are case-insensitive, so compare only the normalized media type.
    media_type = content_type.split(';', 1)[0].strip().lower()
    return media_type == 'application/x-tar'


def _extract_tar_archive(archive_path, target_dir):
    """Extract a tar archive into target_dir, rejecting unsafe members.

    The archive comes from the network, so members must not be allowed to
    write outside of target_dir (absolute paths, '..' components).
    """
    with tarfile.open(archive_path, 'r') as tar:
        if hasattr(tarfile, 'data_filter'):
            # Interpreter supports extraction filters: let the standard
            # library reject or sanitize dangerous members.
            tar.extractall(target_dir, filter='data')
        else:
            # Older interpreter: minimal check that every member path stays
            # below the target directory before extracting anything.
            root = Path(target_dir).resolve()
            for member in tar.getmembers():
                destination = (root / member.name).resolve()
                if destination != root and root not in destination.parents:
                    raise RuntimeError(
                        'Refusing to extract "{}": path is outside of "{}"'
                        .format(member.name, target_dir))
            tar.extractall(target_dir)


def download_from_public_url(url, data_dir_name='data', timeout=60):
    """Download a file from a public URL and store it in a local directory.

    The file name is taken from the last component of the URL path. If the
    server declares the payload as 'application/x-tar', the archive is
    extracted into the directory and the downloaded archive file is removed.

    :param url: location of the file to download
    :param data_dir_name: directory in which the file is stored; it is
        created if it does not exist yet
    :param timeout: timeout in seconds passed to ``requests.get`` so that an
        unresponsive server cannot block the script forever
    :raises RuntimeError: if no file name can be derived from the URL, the
        server does not respond with HTTP status code 200, or (on
        interpreters without tarfile extraction filters) an archive member
        would be extracted outside of the data directory
    """
    # extract data set file name from URL; fail early if there is none,
    # because the data directory itself would otherwise be opened for writing
    data_file_name = Path(urlparse(url).path).name
    if not data_file_name:
        raise RuntimeError(
            'Could not derive a file name from URL "{}"'.format(url))

    print('Downloading data file {} ...'.format(url))
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise RuntimeError('Could not fetch {}: HTTP status code {}'
                           .format(url, r.status_code))

    # create the directory where the downloaded file will be stored
    data_dir = Path(data_dir_name)
    data_dir.mkdir(parents=True, exist_ok=True)
    downloaded_data_file = data_dir / data_file_name

    print('Saving downloaded file "{}" as "{}" ...'
          .format(data_file_name, downloaded_data_file))
    with open(downloaded_data_file, 'wb') as downloaded_file:
        downloaded_file.write(r.content)

    # The Content-Type header is optional; treat a missing header as
    # "not an archive" instead of failing with a KeyError.
    if _is_tar_content_type(r.headers.get('content-type', '')):
        print('Extracting downloaded file in directory "{}" ...'
              .format(data_dir))
        _extract_tar_archive(downloaded_data_file, data_dir)
        print('Removing downloaded file ...')
        downloaded_data_file.unlink()


if __name__ == "__main__":

    # This script downloads a data set file from a public location,
    # e.g. http://server/path/to/archive, and extracts it if the server
    # declares it to be a tar archive.
    # The location must be specified using the DATASET_URL environment
    # variable: DATASET_URL=http://server/path/to/archive

    # initialize download URL from environment variable; surrounding
    # whitespace is not part of a valid URL and is dropped
    dataset_url = os.environ.get('DATASET_URL', '').strip()

    # No data set URL was provided (variable unset, empty, or blank).
    if not dataset_url:
        raise RuntimeError(
            'Cannot run script. A data set URL must be provided as input.')

    # Try to process the URL
    download_from_public_url(dataset_url)