
Commit 16747f3

feat: introduce Gitpod
0 parents  commit 16747f3

Note: large commits have some content hidden by default, so not all 57 changed files are shown below.

57 files changed: +22,445 −0 lines

.gitignore

Lines changed: 125 additions & 0 deletions

# Result files
output
spark-warehouse
artifacts
metastore_db/

exercises/resources/flights
exercises/target
exercises/resources/as_parquet
fhvhv_tripdata_2020-05
# Editor extensions
.idea

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

Pipfile.lock
videos
cloud9/terraform
cloud9/python
cloud9/*.yaml
cloud9/.gitignore

.gitpod.Dockerfile

Lines changed: 24 additions & 0 deletions

FROM gitpod/workspace-python:2022-02-04-06-25-23

ENV DEBIAN_FRONTEND=noninteractive
ENV SPARK_LOCAL_IP=0.0.0.0
# needed for master

USER root
# Install apt packages and clean up cached files
RUN apt-get update && \
    apt-get install -y openjdk-8-jdk python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Install the AWS CLI and clean up tmp files
RUN wget https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O ./awscliv2.zip && \
    unzip awscliv2.zip && \
    ./aws/install && \
    rm -rf ./aws awscliv2.zip

USER gitpod

# For vscode
EXPOSE 3000
# for spark
EXPOSE 4040

.gitpod.yml

Lines changed: 33 additions & 0 deletions

github:
  prebuilds:
    master: true
    branches: true
    pullRequests: false
    pullRequestsFromForks: false
    addCheck: false
    addComment: false
    addBadge: false

image:
  file: .gitpod.Dockerfile

ports:
  - port: 4040 # pyspark UI
    onOpen: notify

tasks:
  - name: setup
    init: |
      python -m venv .venv
      source .venv/bin/activate
      python -m pip install -r requirements.txt
      echo "source $(pwd)/.venv/bin/activate" >> ~/.bashrc
      echo "export PYTHONPATH=$(pwd)" >> ~/.bashrc
      clear
    command: |
      source .venv/bin/activate
      export PYTHONPATH=$(pwd)

vscode:
  extensions:
    - ms-python.python

README.md

Lines changed: 149 additions & 0 deletions

# Better Data Engineering with PySpark

📚 A course brought to you by the [Data Minded Academy].

## Context

These are the exercises used in the course *Better Data Engineering with
PySpark*, developed by instructors at Data Minded. The exercises are meant
to be completed in the order determined by the lexicographical order of
their parent folders. That is, exercises inside the folder `b_foo` should be
completed before those in `c_bar`, but both should come after those of
`a_foo_bar`.

## Getting started

While you can clone the repo locally, we do not offer support for setting up
your coding environment. Instead, we recommend you [tackle the exercises
using Gitpod][this gitpod].

[![Open in Gitpod][gitpod logo]][this gitpod]

⚠ IMPORTANT: Create a new branch and periodically push your work to the remote.
After 30 minutes of inactivity this environment shuts down and you will lose
unsaved progress.

## Course objectives

- Introduce good data engineering practices.
- Illustrate modular and easily testable data transformation pipelines using
  PySpark.
- Illustrate PySpark concepts such as lazy evaluation, caching & partitioning
  (not limited to these three).

## Intended audience

- People working with (Py)Spark or soon to be working with it.
- Familiar with Python functions, variables and the container data types
  `list`, `tuple`, `dict`, and `set`.

## Approach

The lecturer first sets the foundations right for Python development and
gradually builds up to PySpark data pipelines.

A high degree of participation is expected from the students: they will need
to write code themselves and reason about the topics, so that they retain the
knowledge better.

Participants are recommended to work on a branch for any changes they make,
as the instructors may choose to release an update to the current branch;
working on a branch avoids conflicts (otherwise the onus is on the
participant).

Note: this course is not about writing the best pipelines possible. There are
many ways to skin a cat; in this course we show one (or sometimes a few),
which should be suitable for the level of the participants.

## Exercises

### Warm-up: thinking critically about tests

Glance at the file [./exercises/b_unit_test_demo/distance_metrics.py]. Then
complete [./tests/test_distance_metrics.py] by writing at least two useful
tests, one of which should prove that the code, as it is, is wrong.
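
For orientation, such tests could look roughly like the sketch below. The imported function name and its signature are hypothetical assumptions made for illustration (the real API is in the exercise file), so treat this as a shape, not as the expected solution.

```python
# Hedged sketch only: the function name, signature, and units are assumptions;
# check the actual exercise file for the real API.
import pytest

from exercises.b_unit_test_demo.distance_metrics import great_circle_distance  # hypothetical name


def test_distance_from_a_point_to_itself_is_zero():
    # Whatever formula is used, a point is at distance zero from itself.
    assert great_circle_distance(50.85, 4.35, 50.85, 4.35) == pytest.approx(0.0)


def test_quarter_meridian_is_roughly_ten_thousand_km():
    # Pole to equator is about 10 007 km on a spherical Earth; a result far off
    # from that would prove the implementation wrong.
    assert great_circle_distance(90.0, 0.0, 0.0, 0.0) == pytest.approx(10_007, rel=0.01)
```
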
### Adding derived columns

Check out [exercises/c_labellers/dates.py] and implement the pure Python
function `is_belgian_holiday`. Verify that your implementation is correct by
running the test `test_pure_python_function` from [tests/test_labellers.py].
You could do this from the command line with
`pytest tests/test_labellers.py::test_pure_python_function`.

With that implemented, it's time to take a step back and think about how one
would compare data that might be distributed over different machines.
Implement `assert_frames_functionally_equivalent` from [tests/comparers.py].
Validate that your implementation is correct by running the test suite at
[tests/test_comparers.py]. You will use this function in a few subsequent
exercises.

Return to [exercises/c_labellers/dates.py] and implement `label_weekend`.
Again, run the related test from [tests/test_labellers.py]. It might be more
useful to you if you read the test first.

Finally, implement `label_holidays` from [exercises/c_labellers/dates.py].
As before, run the relevant test to verify a few easy cases (keep in mind that
few tests are exhaustive: it's typically easier to prove something is wrong
than that something is right).

If you're making great speed, try to think of an alternative implementation
of `label_holidays` and discuss its pros and cons.
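
To make the shape of these labellers concrete, here is a minimal sketch of what a `label_weekend`-style function could look like. The column names and the exact signature are assumptions; the course's own solution may differ.

```python
# Minimal sketch, not the course solution; column names are assumptions.
from pyspark.sql import DataFrame
from pyspark.sql import functions as psf


def label_weekend(frame: DataFrame, colname: str = "date", new_colname: str = "is_weekend") -> DataFrame:
    # In Spark SQL, dayofweek() returns 1 for Sunday and 7 for Saturday.
    return frame.withColumn(new_colname, psf.dayofweek(psf.col(colname)).isin(1, 7))
```
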
### (Optional) Get in the habit of writing tests

Have a look at [exercises/d_laziness/date_helper.py]. Explain the intent of
the author. Which two key aspects of Spark's processing did the author forget?
If you can't answer this, run `test_date_helper_doesnt_work_as_intended` from
[exercises/d_laziness/test_laziness.py]. Now write an alternative to the
`convert_date` function that does do what the author intended.
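
The exercise file is not part of this diff, so the snippet below is only a generic illustration (an assumption, not the actual `date_helper.py`) of two aspects of Spark processing that are easy to forget:

```python
# Generic illustration, not the contents of date_helper.py.
from pyspark.sql import SparkSession
from pyspark.sql import functions as psf

spark = SparkSession.builder.getOrCreate()
frame = spark.createDataFrame([("2020-05-01",)], schema=("datestr",))

# 1. DataFrames are immutable: transformations return a new frame rather than
#    modifying the one they are called on, so the result must be kept.
frame.withColumn("date", psf.to_date("datestr"))          # result silently discarded
frame = frame.withColumn("date", psf.to_date("datestr"))  # result kept

# 2. Transformations are lazy: nothing is computed until an action runs.
frame.show()  # show() is an action and triggers execution
```
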
### Common business case 1: cleaning data

Using the information seen in the videos, prepare a sizeable dataset for
storage in "the clean zone" of a data lake by implementing the `clean`
function of [exercises/h_cleansers/clean_flights_starter.py].
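
As a rough idea of the kind of steps such a `clean` function performs (the column names below are invented for illustration and will not match the real flights schema):

```python
# Illustrative sketch; column names are assumptions, not the real flights schema.
from pyspark.sql import DataFrame
from pyspark.sql import functions as psf
from pyspark.sql.types import IntegerType


def clean(frame: DataFrame) -> DataFrame:
    return (
        frame
        .withColumnRenamed("DEP_DELAY", "departure_delay")  # consistent snake_case names
        .withColumn("departure_delay", psf.col("departure_delay").cast(IntegerType()))
        .withColumn("flight_date", psf.to_date("FL_DATE", "yyyy-MM-dd"))
        .dropDuplicates()
    )
```
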
### Cataloging your datasets

To prevent your code from having links to datasets hardcoded everywhere,
create a simple catalog and a convenience function to load data by
referencing this catalog. You have a template in
[exercises/i_catalog/catalog_starter.py].

Once done, revisit [exercises/h_cleansers/clean_flights_starter.py] and
replace the call that loads the dataset with your new catalog helpers.

Adapt the `import` statements in [exercises/h_cleansers/clean_airports.py]
and [exercises/h_cleansers/clean_carriers.py] and execute these files with the
Python interpreter. Pay attention to where the data is being stored.
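
One possible shape for such a catalog, sketched here as a plain dict keyed by dataset name; the real template in the repo may be structured differently, and the entries (names, paths, formats) are assumptions.

```python
# Hedged sketch of a dataset catalog and loader; entries are illustrative.
from pathlib import Path

from pyspark.sql import DataFrame, SparkSession

RESOURCES = Path(__file__).parent / "resources"

catalog = {
    "raw_flights": {"path": str(RESOURCES / "flights"), "format": "csv", "options": {"header": "true"}},
    "clean_flights": {"path": str(RESOURCES / "clean" / "flights"), "format": "parquet", "options": {}},
}


def load_frame_from_catalog(spark: SparkSession, dataset_name: str) -> DataFrame:
    # Look up where and how a dataset is stored, so callers never hardcode paths.
    entry = catalog[dataset_name]
    return spark.read.format(entry["format"]).options(**entry["options"]).load(entry["path"])
```
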
### Peer review

In groups, discuss the improvements one could make to
[exercises/l_code_review/bingewatching.py].

### Common business case 2: report generation

Create a complete view of the flights data in which you combine the airline
carriers (a dimension table), the airport names (another dimension table) and
the flights table (a fact table).

Your manager wants to know how many flights were operated by American Airlines
in 2011.

How many of those flights arrived with at most 10 minutes of delay?

A data scientist is looking for correlations between departure delays and
dates. In particular, they think that, relatively speaking, more flights
depart with a delay on Fridays than on any other day of the week. Verify
their claim.

Out of the 5 categories of delay sources, which one appeared most often in
2011? In other words, in which category should we invest more time to improve?
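
A hedged sketch of how this report could be wired up; all table and column names here are assumptions made for illustration, since the real schemas come from the datasets cleaned in the earlier exercises.

```python
# Sketch only; column names ("carrier_code", "flight_date", ...) are assumptions.
from pyspark.sql import DataFrame
from pyspark.sql import functions as psf


def build_master_flights(flights: DataFrame, carriers: DataFrame, airports: DataFrame) -> DataFrame:
    # Enrich the fact table with both dimension tables.
    return (
        flights
        .join(carriers, on="carrier_code", how="left")
        .join(airports, on="origin_airport_id", how="left")
    )


def american_airlines_2011(master: DataFrame) -> DataFrame:
    return master.filter(
        (psf.col("carrier_name") == "American Airlines Inc.")
        & (psf.year("flight_date") == 2011)
    )


# Example questions answered with simple filters and aggregations:
# american_airlines_2011(master).count()
# american_airlines_2011(master).filter(psf.col("arrival_delay") <= 10).count()
# master.groupBy(psf.dayofweek("flight_date")).agg(
#     psf.avg((psf.col("departure_delay") > 0).cast("int")).alias("fraction_delayed")
# )
```
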

[this gitpod]: https://gitpod.io/#https://github.com/oliverw1/summerschoolsept
[gitpod logo]: https://gitpod.io/button/open-in-gitpod.svg
[Data Minded Academy]: https://www.dataminded.academy/

bootstrap_repo.sh

Lines changed: 20 additions & 0 deletions

rm -rf .git
git init
git add .
git rm --cached \
    tests/test_distance_metrics_solution.py \
    tests/comparers_solution.py \
    exercises/b_unit_test_demo/distance_metrics_corrected.py \
    exercises/c_labellers/dates_solution.py \
    exercises/d_laziness/test_improved.py \
    exercises/d_laziness/improved_date_helper.py \
    exercises/d_laziness/test_laziness.py \
    exercises/h_cleansers/clean_airports.py \
    exercises/h_cleansers/clean_carriers.py \
    exercises/h_cleansers/clean_flights.py \
    exercises/h_cleansers/test_clean_flights.py \
    exercises/h_cleansers/cleaning_villo_stations_solution.py \
    exercises/i_catalog/catalog.py \
    exercises/m_business/master_flights.py \
    exercises/m_business/num_flights.py \
    exercises/resources/fhvhv_tripdata_2020-05.csv

exercises/__init__.py

Whitespace-only changes.

exercises/a_spark_demo/__init__.py

Whitespace-only changes.
Lines changed: 36 additions & 0 deletions

"""
Illustrate several ways to create small, toy-example dataframes.
This is incredibly useful in tests.
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()

# The verbose way
fields = [
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=True),
]
users = spark.createDataFrame(
    data=[
        ("Wim", 1),
        (None, 2),
    ],
    schema=StructType(fields),
)

# A shorter way, with implicit assumptions: Spark will attempt to infer the datatypes.
# They will typically be chosen overly large.
currencies = spark.createDataFrame(
    data=[
        ("Euro", 1.0, 1),
        ("USD", 1.2, 1),
    ],
    schema=("currency", "value", "random"),
)

for frame in (users, currencies):
    frame.show()  # An action.
    frame.printSchema()  # Not an action.
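
Beyond the two approaches in this file, a third, middle-ground option (not part of the commit; shown only as an additional illustration) is to pass a DDL-style schema string, which keeps the types explicit without the `StructType` boilerplate:

```python
# Not in the committed file; an extra illustration reusing the spark session above.
prices = spark.createDataFrame(
    data=[("Euro", 1.0), ("USD", 1.2)],
    schema="currency string, value double",  # DDL-style schema string
)
prices.printSchema()
```
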
