From 1c4f826f52bf79ec41c7d0b0e6fee86c6c3a9d0e Mon Sep 17 00:00:00 2001
From: wmkouw
Date: Wed, 8 Feb 2023 14:42:07 +0100
Subject: [PATCH] Added 2022-2023 PP assignment solution

---
 ...c Programming - Assignment [Solution].html | 15424 ++++++++++++++++
 ... Programming - Assignment [Solution].ipynb |  2332 +++
 2 files changed, 17756 insertions(+)
 create mode 100644 lessons/assignment/Probabilistic Programming - Assignment [Solution].html
 create mode 100644 lessons/assignment/Probabilistic Programming - Assignment [Solution].ipynb

diff --git a/lessons/assignment/Probabilistic Programming - Assignment [Solution].html b/lessons/assignment/Probabilistic Programming - Assignment [Solution].html
new file mode 100644
index 0000000..2b51af5
--- /dev/null
+++ b/lessons/assignment/Probabilistic Programming - Assignment [Solution].html
@@ -0,0 +1,15424 @@

# [5SSD0] Probabilistic Programming - Assignment [SOLUTION]

Year: 2022-2023

In [ ]:

# Enter your name and student ID
name = 
ID   = 

In this assignment, we will go through the Bayesian model design cycle:

[figure: the Bayesian model design cycle]

In questions 1 and 2, you will build a simple model, fit it to data and evaluate its performance on future data. You will find that its performance is not great. In question 3, you will improve the model in multiple ways. Finally, in question 4, you will do model selection based on free energy.

The final questions will require knowledge from the last probabilistic programming session. But questions 1 and 2 can be done relatively early in the course.


+ +
+
+
+
+
+
In [ ]:
+
+
+
using Pkg
Pkg.activate(".")
Pkg.instantiate();

In [3]:

using CSV
using DataFrames
using LinearAlgebra
using ProgressMeter
using RxInfer
using Plots
default(label="",
        grid=false,
        linewidth=3,
        markersize=4,
        guidefontsize=12,
        margins=15Plots.pt)

## Problem: Forecasting Air Quality

Many Europeans suspect that the air quality in their city is declining. A [recent study](https://doi.org/10.1016/j.snb.2007.09.060) measured the air quality of a major city in North Italy using an electronic nose. The measurements were made in the middle of the city and reflect urban activity. We will inspect the specific chemical concentrations found and build a model to accurately predict CO for future time points.


+ +
+
+
+
+
+
+

https://www.theguardian.com/environment/2020/apr/07/air-pollution-linked-to-far-higher-covid-19-death-rates-study-finds

+

Photograph taken by Claudio Furlan/LaPresse/Zuma Press/Rex/Shutterstock (link)

### Data

The data can be found here: https://archive.ics.uci.edu/ml/datasets/Air+Quality. I've done some pre-processing and selected the most important features. In this assignment we will infer parameters in a model of the data and predict air quality in the future. For that purpose, the data has been split into past and future.


+ +
+
+
+
+
+
In [4]:
+
+
+
# Load training data
past_data = DataFrame(CSV.File("data/airquality_past.csv"))

Out[4]:

200×2 DataFrame (175 rows omitted)

 Row │ time                 CO
     │ DateTime             Float64
─────┼──────────────────────────────
   1 │ 2004-03-10T18:00:00   1360.0
   2 │ 2004-03-10T19:00:00   1292.0
   3 │ 2004-03-10T20:00:00   1402.0
   4 │ 2004-03-10T21:00:00   1376.0
   5 │ 2004-03-10T22:00:00   1272.0
   6 │ 2004-03-10T23:00:00   1197.0
   7 │ 2004-03-11T00:00:00   1185.0
   8 │ 2004-03-11T01:00:00   1136.0
   9 │ 2004-03-11T02:00:00   1094.0
  10 │ 2004-03-11T03:00:00   1010.0
  11 │ 2004-03-11T04:00:00   1011.0
  12 │ 2004-03-11T05:00:00   1066.0
  13 │ 2004-03-11T06:00:00   1052.0
  ⋮  │          ⋮               ⋮
 189 │ 2004-03-18T14:00:00   1379.0
 190 │ 2004-03-18T15:00:00   1322.0
 191 │ 2004-03-18T16:00:00   1496.0
 192 │ 2004-03-18T17:00:00   1409.0
 193 │ 2004-03-18T18:00:00   1513.0
 194 │ 2004-03-18T19:00:00   1667.0
 195 │ 2004-03-18T20:00:00   1667.0
 196 │ 2004-03-18T21:00:00   1430.0
 197 │ 2004-03-18T22:00:00   1333.0
 198 │ 2004-03-18T23:00:00   1262.0
 199 │ 2004-03-19T00:00:00   1287.0
 200 │ 2004-03-19T01:00:00   1134.0

Let's visualize the carbon monoxide measurements over time.

In [5]:

scatter(past_data[:,1],
        past_data[:,2],
        size=(900,300),
        color="black",
        xlabel="time",
        ylabel="CO (ppm)",
        ylims=[400,2000])

Out[5]:

[plot: scatter of the CO (ppm) measurements against time]

## 1. Model specification & parameter estimation

We suspect that there is a temporal dependence in this dataset. In other words, the data changes relatively slowly over time and neighbouring data points end up being highly correlated. To exploit this correlation, we will build an _auto-regressive model_ of the form:

$$ y_k = \theta y_{k-1} + \epsilon_k \, , $$

where the noise $\epsilon_k$ is drawn from a zero-mean Gaussian with precision parameter $\tau$:

$$ \epsilon_k \sim \mathcal{N}(0, \tau^{-1}) \, .$$

Tasks:

- [1pt] Specify the above equation as a probabilistic model in RxInfer, using $\tau = 1.0$.
- [1pt] Specify and execute an inference procedure to infer a posterior distribution for $\theta$.
- [1pt] Plot the inferred distribution over the interval $[0,\ 2]$.

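Aside: because the prior over $\theta$ and the likelihood are both Gaussian and the noise precision is fixed at $\tau = 1$, this model is conjugate, so the exact posterior can be written down and used to check the inferred result. Writing $m_0, v_0$ for the prior mean and variance (our notation) and $x_k = y_{k-1}$ for the regressors:

$$ p(\theta \mid \mathcal{D}) = \mathcal{N}\!\left( \frac{m_0/v_0 + \sum_k x_k y_k}{1/v_0 + \sum_k x_k^2} \, , \; \Big(\frac{1}{v_0} + \sum_k x_k^2\Big)^{-1} \right) \, . $$
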
In [6]:

### YOUR CODE HERE

# Number of data points
T = size(past_data,1);

# Prepare data
y_k    = [past_data[i,2] for i in 2:T]
y_prev = [past_data[i,2] for i in 1:(T-1)];

# Specify parameters of prior distributions
prior_params = Dict(:θ => (1.0, 100.0))

@model function auto_regression1(prior_params; num_samples=1)

    x = datavar(Float64,num_samples)
    y = datavar(Float64,num_samples)

    # Prior over the AR coefficient
    θ ~ Normal(mean = prior_params[:θ][1], variance = prior_params[:θ][2])

    for i = 1:num_samples

        # Likelihood with fixed noise precision τ = 1.0
        y[i] ~ Normal(mean = θ*x[i], precision = 1.0)

    end
    return y, x, θ
end

results = inference(
    model         = auto_regression1(prior_params, num_samples=T-1),
    data          = (x = y_prev, y = y_k,),
    free_energy   = true,
)

# Plot posterior
plot(0.0:1e-3:2.0, x -> pdf(results.posteriors[:θ], x), xlabel="θ", ylabel="p(θ|y)", size=(900,300))
####

Out[6]:

[plot: posterior density p(θ|y) on the interval [0, 2]]

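As a quick check on the plot above (not part of the graded solution), the conjugate closed form from the aside in Section 1 can be evaluated directly on the arrays `y_prev`, `y_k` and `prior_params` defined in the solution cell. A minimal sketch:

    # Closed-form posterior for θ under the prior N(m0, v0) and unit noise precision
    m0, v0 = prior_params[:θ]
    w_post = 1/v0 + sum(abs2, y_prev)            # posterior precision
    m_post = (m0/v0 + y_prev' * y_k) / w_post    # posterior mean
    println("closed-form posterior: N(", m_post, ", ", 1/w_post, ")")
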

## 2. Predictions & model evaluation

We want to evaluate the parameters inferred under the model. For now, we will do this by visually inspecting the 1-step ahead predictions on our data set. Later, we will use free energy as a metric.

The posterior predictive distribution for the next time step is:

$$ p(y_{t+1} \mid y_{t}, \mathcal{D}) = \int p(y_{t+1} \mid \theta, y_{t}) \, p(\theta \mid \mathcal{D}) \, \mathrm{d}\theta \, , $$

where $\mathcal{D}$ refers to the data used to infer the posterior distribution. To make 1-step ahead predictions, you will have to loop over the data (i.e., `for t in 1:T`), plug in the current data point and compute the parameters of the posterior predictive distribution for the next data point. You may start from $t=2$, using $y_1$ as the initial "previous observation".

Tasks:

- [1pt] Derive the parameters of the posterior predictive and compute the 1-step ahead predictions on the data set.
- [1pt] Plot the first 10 predictions (posterior predictive variance in `ribbon=`) along with $y_{2:11}$ (scatterplot).

---

Note that if you failed to infer a posterior distribution in the previous question, you can still answer this question using a standard normal, $p(\theta) = \mathcal{N}(0,1)$.

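For the derivation in the first task: with a Gaussian posterior $q(\theta) = \mathcal{N}(m_\theta, v_\theta)$ (our notation for its mean and variance), the integral above is a convolution of Gaussians, and the laws of total expectation and total variance give the predictive moments used in the solution below:

$$ p(y_{t+1} \mid y_t, \mathcal{D}) = \mathcal{N}\big( m_\theta \, y_t \, , \; y_t^2 \, v_\theta + \tau^{-1} \big) \, , \qquad \text{with } \tau = 1 \, . $$
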
In [7]:

### YOUR CODE HERE

# Preallocate predictions
sim_mean = zeros(T)
sim_vars = zeros(T)

# Initialize recursive previous observation
y_prev = past_data[1,2]

# Extract parameters of posterior q(θ)
mθ, vθ = mean_var(results.posteriors[:θ])

for t = 2:T

    # Compute parameters of the posterior predictive for y_t
    sim_mean[t] = mθ*y_prev
    sim_vars[t] = y_prev*vθ*y_prev + 1.0

    # Update previous observation
    y_prev = past_data[t,2]

end

scatter(past_data[2:11,1], past_data[2:11,2], color="black", label="data", xlabel="time", ylabel="CO")
plot!(past_data[2:11,1], sim_mean[2:11], ribbon=sim_vars[2:11], color="blue", size=(900,300), label="predictions")

###

Out[7]:

[plot: data (scatter) and 1-step ahead predictions with predictive-variance ribbon, t = 2 to 11]


## 3. Model critiquing & improvement

From the results of the previous question, you may conclude that our initial model isn't great: it only considers extremely short-term changes, which are highly affected by noise.

The model can be improved in two ways:

1. We set the noise level $\tau$ to $1.0$, but that was based on convenience, not on domain expertise or data. You are going to put a prior probability distribution over $\tau$ and infer a posterior $p(\tau \mid \mathcal{D})$ simultaneously.
2. We can consider changes over longer periods of time by increasing the model order of the auto-regressive model. That corresponds to:

$$ y_k = \sum_{m=1}^{M} \theta_m y_{k-m} + \epsilon_k \, ,$$

where $M$ refers to the model order.

Tasks:

- [1pt] Specify a higher-order AR model with an order parameter $M$, and with the noise precision as a random variable.
- [1pt] Infer the approximate posteriors for $\theta$ and $\tau$, for model order $M=3$.
- [1pt] Visualize the posterior for the noise distribution $q(\tau)$ and the 1-step ahead predictions (mean and variance) on the data.

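The posterior predictive derivation from Section 2 carries over to the vector case. Writing $\mathbf{x}_t = (y_t, \dots, y_{t-M+1})^\top$ for the lag vector, $q(\theta) = \mathcal{N}(m_\theta, S_\theta)$ for the approximate coefficient posterior, and substituting a point estimate $\hat{\tau}$ for the noise precision (one possible choice; the solution below uses the mode of $q(\tau)$), the 1-step ahead predictive is approximately

$$ p(y_{t+1} \mid \mathbf{x}_t, \mathcal{D}) \approx \mathcal{N}\big( m_\theta^\top \mathbf{x}_t \, , \; \mathbf{x}_t^\top S_\theta \, \mathbf{x}_t + \hat{\tau}^{-1} \big) \, . $$
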
In [8]:

# Number of iterations of variational inference
n_iters = 10;

# Model order
M = 3;

In [9]:

### YOUR CODE HERE

# Number of data points
T = size(past_data,1);

# Prepare data: targets and vectors of the M previous observations
y_k    = [past_data[i,2] for i in (M+1):T]
y_prev = [past_data[i-1:-1:i-M,2] for i in (M+1):T]

# Specify parameters of prior distributions
prior_params = Dict(:θ => (exp.(-(1:M)), diagm(ones(M))),
                    :τ => (1.0, 1e3))

@model function auto_regression2(prior_params; model_order=2, num_samples=1)

    x = datavar(Vector{Float64}, num_samples)
    y = datavar(Float64, num_samples)

    # Prior over the AR coefficients
    θ ~ MvNormal(mean = prior_params[:θ][1], covariance = prior_params[:θ][2])

    # Prior over the noise precision
    τ ~ Gamma(shape = prior_params[:τ][1], rate = prior_params[:τ][2])

    for i = 1:num_samples

        # Likelihood
        y[i] ~ Normal(mean = dot(θ,x[i]), precision = τ)

    end
    return y, x, θ, τ
end

# Mean-field factorization between coefficients and noise precision
constraints = @constraints begin
    q(θ,τ) = q(θ)q(τ)
end

results = inference(
    model         = auto_regression2(prior_params, model_order=M, num_samples=T-M),
    data          = (x = y_prev, y = y_k,),
    constraints   = constraints,
    initmarginals = (θ = MvNormal(prior_params[:θ]...), τ = Gamma(prior_params[:τ]...)),
    returnvars    = (θ = KeepLast(), τ = KeepLast(),),
    iterations    = n_iters,
    free_energy   = true,
    showprogress  = true,
)

# Posterior for noise precision
p1 = plot(1e-5:1e-6:5e-4, x -> exp.(logpdf(results.posteriors[:τ], x)))

# Preallocate predictions
sim_mean = zeros(T)
sim_vars = zeros(T)

# Initialize recursive previous observations (most recent first)
y_prev = past_data[M:-1:1,2]

# Extract parameters of posteriors
mθ, Sθ = mean_cov(results.posteriors[:θ])
mτ = mode(results.posteriors[:τ])

for t = M:T

    # Compute parameters of the posterior predictive
    sim_mean[t] = mθ'*y_prev
    sim_vars[t] = y_prev'*Sθ*y_prev + inv(mτ)

    # Update recursion
    y_prev = past_data[t:-1:t-M+1,2]

end

p2 = scatter(past_data[M:T,1], past_data[M:T,2], color="black", label="data", xlabel="time", ylabel="CO (ppm)")
plot!(past_data[M:T,1], sim_mean[M:T], ribbon=sqrt.(sim_vars[M:T]), color="blue", size=(900,300), label="predicted")

plot(p1, p2, layout=(2,1), size=(900,600))

####

Progress: 100%|█████████████████████████████████████████| Time: 0:00:05

Out[9]:

[plots: posterior q(τ) over the noise precision (top); data with 1-step ahead predictions and predictive-std ribbon (bottom)]


## 4. Model selection & deployment

We now essentially have a different model for each value of $M$. Which is the best?

Tasks:

- [1pt] Compute the free energies for a given range of model orders and report the best performing one.
- [1pt] Deploy the best model (i.e., compute 1-step ahead predictions) on future data.

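Recall why minimizing free energy doubles as model selection: for a model with order $M$, the free energy decomposes as

$$ F[q] = \mathrm{KL}\big[ q(\theta, \tau) \,\big\|\, p(\theta, \tau \mid \mathcal{D}, M) \big] - \log p(\mathcal{D} \mid M) \; \geq \; -\log p(\mathcal{D} \mid M) \, , $$

so the converged free energy upper-bounds the negative log model evidence, and the model order with the lowest final $F[q]$ is (approximately) the one with the highest evidence.
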
In [10]:

# Model order range
model_orders = [2,4,8,16,32];

In [11]:

##### YOUR CODE HERE

# Preallocate free energy (one row per model order, one column per VI iteration)
FE = zeros(length(model_orders), n_iters)

# Preallocate list of posteriors
posts = Vector(undef, length(model_orders))

# Loop over model orders
@showprogress for (j,m) in enumerate(model_orders)

    # Prepare data
    y_k    = [past_data[i,2] for i in (m+1):T]
    y_prev = [past_data[i-1:-1:i-m,2] for i = (m+1):T]

    # Specify parameters of prior distributions
    prior_params = Dict(:θ => (exp.(-(1:m)), diagm(ones(m))),
                        :τ => (1.0, 1e3))

    results = inference(
        model         = auto_regression2(prior_params, model_order=m, num_samples=T-m),
        data          = (x = y_prev, y = y_k,),
        constraints   = constraints,
        initmarginals = (θ = MvNormal(prior_params[:θ]...), τ = Gamma(prior_params[:τ]...)),
        returnvars    = (θ = KeepLast(), τ = KeepLast(),),
        iterations    = n_iters,
        free_energy   = true,
        showprogress  = true,
    )

    # Store free energy per iteration
    FE[j,:] = results.free_energy

    # Save posteriors
    posts[j] = results.posteriors
end

# Extract best model and report
M_best = model_orders[argmin(FE[:,end])]
println("FE indicates M = "*string(M_best)*" is optimal.")

# Visualize free energies
plot(1:n_iters,
     FE',
     xlabel="Iterations of variational inference",
     ylabel="Free energy [F(q)]",
     labels = cat([["M = $m"] for m in model_orders]..., dims=2),
     size=(900,300))

#####

Progress: 100%|█████████████████████████████████████████| Time: 0:00:00
Progress: 100%|█████████████████████████████████████████| Time: 0:00:01

FE indicates M = 32 is optimal.

Out[11]:

[plot: free energy F(q) per iteration of variational inference, one curve per model order M]

In [12]:

# Load test data
future_data = DataFrame(CSV.File("data/airquality_future.csv"))

Out[12]:

300×2 DataFrame (275 rows omitted)

 Row │ time                 CO
     │ DateTime             Float64
─────┼──────────────────────────────
   1 │ 2004-03-19T02:00:00    999.0
   2 │ 2004-03-19T03:00:00    961.0
   3 │ 2004-03-19T04:00:00    934.0
   4 │ 2004-03-19T05:00:00    913.0
   5 │ 2004-03-19T06:00:00    969.0
   6 │ 2004-03-19T07:00:00   1182.0
   7 │ 2004-03-19T08:00:00   1740.0
   8 │ 2004-03-19T09:00:00   1819.0
   9 │ 2004-03-19T10:00:00   1427.0
  10 │ 2004-03-19T11:00:00   1390.0
  11 │ 2004-03-19T12:00:00   1283.0
  12 │ 2004-03-19T13:00:00   1304.0
  13 │ 2004-03-19T14:00:00   1364.0
  ⋮  │          ⋮               ⋮
 289 │ 2004-03-31T02:00:00    972.0
 290 │ 2004-03-31T03:00:00    832.0
 291 │ 2004-03-31T04:00:00    851.0
 292 │ 2004-03-31T05:00:00    909.0
 293 │ 2004-03-31T06:00:00   1038.0
 294 │ 2004-03-31T07:00:00   1636.0
 295 │ 2004-03-31T08:00:00   1580.0
 296 │ 2004-03-31T09:00:00   1262.0
 297 │ 2004-03-31T10:00:00   1197.0
 298 │ 2004-03-31T11:00:00   1277.0
 299 │ 2004-03-31T12:00:00   1430.0
 300 │ 2004-03-31T13:00:00   1242.0

In [13]:

### YOUR CODE HERE

# Select posteriors of the best model
best_post = posts[argmin(FE[:,end])]

# Extract parameters of selected posteriors
mθ, Sθ = mean_cov(best_post[:θ])
mτ = mode(best_post[:τ])

T = size(future_data,1)

# Preallocate predictions
sim_mean = zeros(T)
sim_vars = zeros(T)

# Initialize recursive previous observations (most recent first)
y_prev = future_data[M_best:-1:1,2]

for t = (M_best+1):T

    # Compute parameters of the posterior predictive
    sim_mean[t] = mθ'*y_prev
    sim_vars[t] = y_prev'*Sθ*y_prev + inv(mτ)

    # Update previous observations
    y_prev = future_data[t-1:-1:t-M_best,2]

end

ix = (M_best+1):300
scatter(future_data[ix,1], future_data[ix,2], color="black", label="data", xlabel="time", ylabel="CO")
plot!(future_data[ix,1], sim_mean[ix], ribbon=sqrt.(sim_vars[ix]), color="blue", size=(900,300), label="predictions")

###

Out[13]:

[plot: future data (scatter) and 1-step ahead predictions of the selected model with predictive-std ribbon]

diff --git a/lessons/assignment/Probabilistic Programming - Assignment [Solution].ipynb b/lessons/assignment/Probabilistic Programming - Assignment [Solution].ipynb
new file mode 100644
index 0000000..87a5164
--- /dev/null
+++ b/lessons/assignment/Probabilistic Programming - Assignment [Solution].ipynb
@@ -0,0 +1,2332 @@
{
 "cells": [
  […notebook cells: the same markdown, Julia code, and outputs as the HTML export above…]
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 1.8.4",
   "language": "julia",
   "name": "julia-1.8"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.8.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}