This Bite Size Data Science looks at linear regression from three different perspectives: "traditional" statistical inference, machine learning and Bayesian inference. The three are different in approach, methodology and interpretation, and understanding each of them will teach you a lot about regression problems in general. The whole notebook also serves as a tutorial on linear regression, using three tools: statsmodels, scikit-learn and PyMC3. We will get into the differences in philosophy between the three and explain their different interfaces along the way. Enjoy!
First, as always, the necessary imports:
# Imports
# Packages are also imported as a whole, so that the print below can list all dependencies used in this notebook
import sys
import os
import numpy as np
import pandas as pd
# Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Statistics and machine learning
import statsmodels.api as sm
import sklearn
from sklearn.linear_model import LinearRegression
# Bayesian statistics
import pymc3 as pm
import arviz as az
# Some plotting styles
%config InlineBackend.figure_format = 'retina'
az.style.use('arviz-darkgrid')
print("This notebook uses the following packages (and versions):")
print("---------------------------------------------------------")
print("python", sys.version[:5])
print('\n'.join(f'{m.__name__} {m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))
This notebook uses the following packages (and versions):
---------------------------------------------------------
python 3.8.5
numpy 1.19.2
pandas 1.2.4
matplotlib 3.3.2
seaborn 0.11.1
statsmodels.api 0.11.1
sklearn 0.23.2
pymc3 3.11.2
arviz 0.11.2
Note that on Windows, installation of PyMC3 can be non-trivial. Please follow the instructions on the PyMC3 GitHub wiki.
Let us first see what we mean when talking about linear regression. Here is a simple, univariate example: we have a data set with $x$ and $y$ values and are interested in their linear relationship: $y = a x + b$. The numbers $a$ and $b$ will tell us something we’re interested in, so we want to learn their values from our data set. This is called inference. This linear relationship is a model, often a simplification of reality. With the model we could make predictions for future new values of $y$, given new values of $x$, using the $a$ and $b$ obtained from the inference.
We create a fake, and rather uninspiring, data set on which to perform linear regression. The data is indeed quite suitable for it: a linear relation between x and y with Gaussian scatter, so that after the regression the recovered parameters can be compared to the original ones:
# Parameters about the data:
size = 20 # number of data points
true_intercept = 4
true_slope = 6
# Create the independent variable
x = np.linspace(0, 1, size)
# Create the dependent variable y = a + b*x
true_regression_line = true_intercept + true_slope * x
# add noise, drawn from a Gaussian (from numpy) with mean 0 and a standard deviation of 0.5
np.random.seed(123)
y = true_regression_line + np.random.normal(scale=.5, size=size)
# Here's what it looks like (plotted the matplotlib interactive way)
plt.figure(figsize=(5,5))
plt.scatter(x, y, label='Data')
plt.plot(x, true_regression_line, color='red', linewidth=3, label='Input relation')
plt.legend()
plt.xlabel('x'); plt.ylabel('y');
The way to do inference that you learn in (most) school(s) is to make a Maximum Likelihood Estimate (MLE), for example through Ordinary Least Squares (OLS). Let's do such a maximum likelihood point estimate of the slope and intercept using a popular package: statsmodels. Statsmodels follows the nomenclature of statistics classes in which machine learning is not in scope. In linear regression you have the choice to fit with or without an intercept; in statsmodels, fitting without one is the default.
In the code below, we specify an independent variable X, which contains our data values x from above, plus a constant (a single constant: it has the same value for all x, but that value is as yet unknown). Then we perform an OLS regression of y on X. Note the somewhat archaic terminology of endogenous and exogenous variables. As in scikit-learn (see below), an OLS object is first initialized and then run (the line calling the .fit() method does the actual work). After fitting, we can get a report with the fit results.
# Statsmodels requires one to specify the extra constant that needs to be fitted and can then fit using Ordinary Least Squares
X = sm.add_constant(x, prepend=True)
smreg = sm.OLS(y, X)
smres = smreg.fit()
# After fitting, an extensive summary of fit results can be produced.
print(smres.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.929
Model:                            OLS   Adj. R-squared:                  0.925
Method:                 Least Squares   F-statistic:                     234.4
Date:                Fri, 16 Apr 2021   Prob (F-statistic):           9.16e-12
Time:                        15:21:00   Log-Likelihood:                -17.074
No. Observations:                  20   AIC:                             38.15
Df Residuals:                      18   BIC:                             40.14
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.6792      0.258     14.254      0.000       3.137       4.221
x1             6.7560      0.441     15.309      0.000       5.829       7.683
==============================================================================
Omnibus:                        1.416   Durbin-Watson:                   2.247
Prob(Omnibus):                  0.493   Jarque-Bera (JB):                0.922
Skew:                           0.150   Prob(JB):                        0.631
Kurtosis:                       1.991   Cond. No.                         4.18
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
This summary tells you a lot about your data, and about the regression. The top left column is an overview of what has been performed, when, and on what kind of data. The top right column gives you a bunch of goodness-of-fit statistics. These are helpful in judging whether you should trust this model at all, and they can be used to pick one model over another. In particular, let's briefly focus on R-squared. This number describes the fraction of the variance of the data (in $y$) that is explained by the model. It compares the variance in $y$ with the variance of the residuals: $$ R^2 = 1- \frac{\rm RSS} {\rm TSS} $$ where RSS and TSS are the residual sum of squares and the total sum of squares, respectively. This number is 1 for a perfect model, close to but below one for a good linear regression, and close to zero for a linear regression that does not estimate $y$ any better than just taking the average of all $y_i$ of the data points. A value below 0 is strictly possible, but that means the model somehow managed to increase the variance, and you might want to reconsider your modeling approach. A rule of thumb for decent models is that $R^2 \gtrsim 0.7$ is acceptable, and higher than 0.9 is a good model. A value lower than 0.7 should raise eyebrows, and values below 0.5 indicate that another algorithm is probably better suited to the problem. The 'adjusted' variant is useful when comparing models with different numbers of degrees of freedom ('Df' in the top left).
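As a quick illustration of this formula (a small sketch, not part of the original notebook output), we can compute $R^2$ by hand from the statsmodels residuals and compare it to the reported value:
# Compute R^2 by hand from the residuals and compare with the statsmodels report
rss = np.sum(smres.resid**2)             # residual sum of squares
tss = np.sum((y - y.mean())**2)          # total sum of squares
print(f"R^2 by hand:        {1 - rss/tss:.3f}")
print(f"R^2 by statsmodels: {smres.rsquared:.3f}")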
The F-statistic, Log-Likelihood, AIC and BIC are other measures that might feature in a future episode of Bite Size Data Science. They are numerical measures of how well your model is constrained by the data and, when there are multiple independent variables, they can be used to select the most relevant regressors.
The lower part of the table shows you the actual fit results, starting with the two most relevant numbers: the intercept and the slope. As you can see, the input ('true') values are well recovered and the uncertainty (shown as a standard error, a P-value and a 95% confidence interval) is small. The remaining part of the table shows some additional statistics about the residuals. A value of zero for 'Omnibus' would indicate perfectly normally distributed residuals (which is in fact what we used to generate the data, but the data set isn't huge). The Omnibus value combines the Skew (a measure of asymmetry in the distribution, 0 being perfectly symmetric) and the Kurtosis ('peakedness', with high values indicating heavy tails and thus more outliers) of the distribution of residuals.
The 'Durbin-Watson' statistic checks for autocorrelation in the residuals: a value close to 2 means no autocorrelation, while values well below or above 2 indicate positive or negative autocorrelation, respectively (so at 2.25 we are doing fine). 'Jarque-Bera' is another way of combining skew and kurtosis into a normality measure like Omnibus. Finally, the 'Condition Number' can be used to assess multicollinearity when two or more independent variables are used.
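If you want these diagnostics outside of the summary table, they can also be computed directly. A minimal sketch, using statsmodels' stattools applied to the residuals of our fit:
from statsmodels.stats.stattools import durbin_watson, jarque_bera
# Durbin-Watson statistic of the residuals
print(f"Durbin-Watson: {durbin_watson(smres.resid):.3f}")
# Jarque-Bera test statistic, its p-value, and the skew and kurtosis it is built from
jb, jb_pval, skew, kurtosis = jarque_bera(smres.resid)
print(f"Jarque-Bera: {jb:.3f} (p = {jb_pval:.3f}), skew: {skew:.3f}, kurtosis: {kurtosis:.3f}")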
Note that this is a very brief summary of these results, and you are advised to look into these topics in greater detail!
Let's now turn to the question: how did statsmodels arrive at this best possible fit?
When there is only one independent variable ($x$) for dependent variable $y$, all data points can be written as $$ y_i = a x_i + b + \epsilon_i $$ where $\epsilon_i$ is the error term, denoting the distance from the ideal relation to the data points. Under the assumption that all $\epsilon_i$ are normally distributed, the Ordinary Least Squares problem has an exact solution with a relatively simple form, and the problem is aptly called 'simple linear regression'. Models with more independent variables (e.g. $y_i = a_1 x_{1, i} + a_2 x_{2, i} + b + \epsilon_i$, with uncorrelated $x_1$ and $x_2$) can also be solved in closed form with some matrix algebra, see wikipedia, but we will stick to the univariate example.
In this simple linear regression case, the least squares estimates for $a$ and $b$ ($\hat{a}$ and $\hat{b}$) are given by $$ \hat{a} = \frac{ \sum x_i y_i - \frac1{n}\sum x_i \sum y_i}{\sum (x_i)^2 - \frac1{n} (\sum x_i)^2} = \frac{ {\rm Cov}[x, y]}{ {\rm Var}[x]} \\ \hat{b} = \bar{y} - \hat{a}\bar{x} $$ That is to say, OLS is completely deterministic: there is no optimization of a randomized process involved, unlike in many other machine learning models. Two different implementations of OLS had better give you the same results!
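As a sanity check (a minimal sketch using numpy only, not part of the original fit), we can evaluate these formulas directly on our data and compare with the statsmodels estimates:
# Closed-form OLS estimates for the simple linear regression
a_hat = np.cov(x, y, ddof=0)[0, 1] / np.var(x)   # Cov[x, y] / Var[x]
b_hat = y.mean() - a_hat * x.mean()
print(f"Closed form:  slope {a_hat:.4f}, intercept {b_hat:.4f}")
print(f"statsmodels:  slope {smres.params[1]:.4f}, intercept {smres.params[0]:.4f}")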
Note that this analytic result is no longer valid if it is not the squared error that we are minimizing, but something else. Other popular choices for this so-called loss function are, for example, the mean absolute error and the Huber loss, both of which are less sensitive to outliers.
The jungle of loss functions and how they influence the outcome of your regression problem is the topic of a future Bite Size Data Science!
1. Use the OLS results from statsmodels above to plot the best-fit line, together with the data and the input relation.
2. Plot 20 alternative regression lines, drawing the slope and intercept from normal distributions centered on the best-fit values, with standard deviations equal to the standard errors reported by statsmodels. Note that these fits are consistent with the data within the uncertainties!
# %load solutions/statsmodels_results.py
# 1
best_fit = smres.predict(X)
plt.figure(figsize=(5,5))
plt.scatter(x, y, label='Data')
plt.plot(x, true_regression_line, color='red', linewidth=3, label='Input relation')
plt.plot(x, best_fit, color='black', linewidth=3, label='OLS result')
plt.legend()
plt.xlabel('x'); plt.ylabel('y');
# 2
for i in range(20):
    # Sample a Gaussian with the mean at the best fit and a standard deviation
    # equal to the standard error of the fit results
    intercept = np.random.normal(loc=smres.params[0], scale=smres.bse[0])
    slope = np.random.normal(loc=smres.params[1], scale=smres.bse[1])
    # Draw it
    this_fit = intercept + x*slope
    plt.plot(x, this_fit, color='black', linewidth=1, alpha=0.2)
# Let's overplot the original once more, for visibility
# This could be done only once here, but some might choose to skip exercise 2
plt.plot(x, true_regression_line, color='red', linewidth=3);
It is clear that there is some freedom in picking a regression line! Especially for extrapolation beyond the range of $x \in [0,1]$, this can have serious implications. A general rule of course would be to not predict too far out of the range of original values.
Besides just taking the average of your data, linear regression is one of the simplest machine learning algorithms, but it is a very popular and often-used one, for good reason! Supervised machine learning is the umbrella term for all machine learning where we are trying to predict a target variable, of which we know the real value in our training data. If this target variable is a category/label/string, we call it classification. If the target is a continuous variable (a number), then we call it regression. Regression can be done with a plethora of algorithms, many of which will feature in future episodes of Bite Size Data Science!
One of the powerhouses of machine learning in Python is scikit-learn. Besides many convenience functions for data preparation, model investigation and building full analysis pipelines, scikit-learn comes with many algorithms for classification and regression. The nice thing about the package (and arguably the biggest reason for its success) is the very neat interface that makes working with many different models easy. The general workflow of training a model and making predictions is uniform across algorithms and looks schematically like this:
from sklearn.<module> import <regression_object>
reg = <regression_object>(hyperparameter1=value1, ...)
reg.fit(X_train, y_train)
Rsquared = reg.score(X_test, y_test)
new_y = reg.predict(new_X)
Let's run through, line by line:
1. The import line loads the regression class you want from the relevant scikit-learn module.
2. The regression object is initialized, with values for its hyperparameters.
3. The .fit() method does the actual model fitting (in machine learning often called training). We assume that you have split your historical data set into a train and a test set (for training the model and for evaluating its predictive power on an independent data set, e.g. using sklearn.model_selection.train_test_split).
4. The model is evaluated with the .score() method, which by default returns the $R^2$ of the regression that we have seen above.
5. Predictions of y can be made on a new data set new_X, which needs to have the same properties as the original X_train.

We are working with a relatively small data set, so we won't go into splitting into a train and test set here, and will use the full data set for training and evaluation of the model. Note that this is, in general, bad practice, and we will say some more about this below!
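For completeness, here is what such a split would look like (a minimal sketch; we will not use the split in the rest of this notebook):
from sklearn.model_selection import train_test_split
# Keep 25% of the points aside as a test set; the reshape is explained in the exercise below
X_train, X_test, y_train, y_test = train_test_split(x.reshape(-1, 1), y, test_size=0.25, random_state=0)
print(X_train.shape, X_test.shape)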
1. Use the LinearRegression object from scikit-learn's linear_model module to perform the same linear regression as above. Hint: the feature matrix x should be two dimensional, as scikit-learn expects two-dimensional inputs (observations in the rows, features in the columns), even if we have only one feature. You can do this using numpy: x.reshape(-1, 1). This makes it 2-dimensional with length 1 in the second dimension (1 column). The -1 for the first dimension tells numpy to figure out how big that dimension needs to be to fit all the data in. You can also write x.reshape(len(x), 1) to obtain the same thing.
2. Find the fitted slope and intercept among the attributes of the trained regressor (hint: inspect the fitted object, for example with dir()).
3. Use the predict method to find the predicted values for the target, and plot data and prediction like before.
# %load solutions/sklearn_results.py
# 1
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
x_in = x.reshape(-1,1)
regr.fit(x_in, y)
# 2
print(dir(regr)) # This can also be obtained by typing "regr." and then hitting tab (twice)
print()
print(f'Slope {regr.coef_[0]:3.2f} and intercept {regr.intercept_:3.2f}')
# 3
ypred = regr.predict(x_in)
plt.figure(figsize=(5,5))
plt.scatter(x, y, label='Data')
plt.plot(x, true_regression_line, color='red', linewidth=3, label='Input relation')
plt.plot(x, ypred, color='black', linewidth=3, label='Prediction')
plt.legend()
plt.xlabel('x'); plt.ylabel('y');
['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_n_features', '_decision_function', '_estimator_type', '_get_param_names', '_get_tags', '_more_tags', '_preprocess_data', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_residues', '_set_intercept', '_validate_data', 'coef_', 'copy_X', 'fit', 'fit_intercept', 'get_params', 'intercept_', 'n_features_in_', 'n_jobs', 'normalize', 'predict', 'rank_', 'score', 'set_params', 'singular_']

Slope 6.76 and intercept 3.68
As can be seen, and as expected, the recovered slope and intercept are similar to the input values and exactly the same as the statsmodels results: linear regression is deterministic! Statsmodels by default gives you a lot of statistics as well; in sklearn that takes some more effort (e.g. the uncertainty on the slope and intercept is not calculated for you). You can obtain many more metrics about the fit using the metrics and model_selection modules of scikit-learn. Remember that machine learning is typically not after information on the parameters, but after future predictions.
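As a small illustration (a sketch, not part of the original exercise), the metrics module can reproduce the $R^2$ we saw before and add, for example, the mean squared error:
from sklearn.metrics import mean_squared_error, r2_score
# Compare the predictions from the fitted regressor with the observed targets
print(f"R^2:                {r2_score(y, ypred):.3f}")
print(f"Mean squared error: {mean_squared_error(y, ypred):.3f}")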
The very nice thing about the interface of scikit-learn is that you can easily swap out one algorithm for another, with very uniform syntax. Just import the other one, make the regressor an instance of that new class, and the rest of the code still works!
In a future Bite Size Data Science we will discuss in depth why I think most statistics in data science should be Bayesian. If you can't wait, I wrote a blog post about it, too. Here, we will skip over these philosophical reasons and get practical right away.
In order to do the same exercise as above in a fully Bayesian way, we employ the package PyMC3: the Bayesian workhorse for Python. In that future Bite Size Data Science we will go into such methods in a lot more detail, but for now there are a couple of things you need to know about this package for probabilistic programming:
- You specify the full data generating process of your model: prior probability distributions for the unknown parameters, and a likelihood that connects them to the observed data.
- Model specifications in PyMC3 are wrapped in a Python context manager (a with block).
- Inference is done by drawing samples from the posterior distribution using Markov Chain Monte Carlo (MCMC).

In other words: after the sampling of the posterior, we have a full probability density function (PDF) for all parameters we want to infer. In our case that means we get a PDF for the slope and intercept (and for the standard deviation of the data, as we will see).
Sampling, you say? I thought linear regression was fully deterministic and simple, why do we need that? In fact, we don't. But it is generalizable to any other type of regression. On top of that, as we will see, we will be pleasantly surprised by a result we didn't see from the above two methods!
Let's see how this works in PyMC3:
with pm.Model() as linreg:  # model specifications in PyMC3 are wrapped in a context
    # Define priors
    # These priors are pretty uninformative: wide distributions without any peaks, the stochastic variables can be anywhere
    # They are defined using PyMC3's probability distributions (and corresponding parameters) and they get a name
    # The intercept and the slope can both be positive or negative, and we don't know their values yet
    intercept = pm.Normal('Intercept', 0, sigma=20)
    x_coeff = pm.Normal('Slope', 0, sigma=20)
    # The data generating process is such that we have the linear relationship with scatter, so
    # the scatter also needs to be part of the model. We model it as Gaussian, with zero mean
    # and a positive standard deviation that is not yet well constrained
    sigma = pm.HalfCauchy('sigma', beta=10)
    # Define a likelihood: a PDF as well, which depends on the stochastic variables for which you defined a prior
    # The observed= keyword specifies that this is the observed data being described by the model
    # The likelihood is Gaussian, as we said above, with a mean given by the linear relation we are after
    likelihood = pm.Normal('y', mu=intercept + x_coeff * x,
                           sigma=sigma, observed=y)
    # The line below does the inference! The posterior PDF will be sampled
    trace = pm.sample(2000, cores=2, tune=2000, return_inferencedata=True)  # draw 2000 posterior samples per chain using NUTS sampling
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, Slope, Intercept]
Sampling 2 chains for 2_000 tune and 2_000 draw iterations (4_000 + 4_000 draws total) took 7 seconds.
The acceptance probability does not match the target. It is 0.8834170920972357, but should be close to 0.8. Try to increase the number of tuning steps.
Running the cell above took a lot longer than the inference you were used to from statsmodels and scikit-learn. This is because two chains of Markov Chain Monte Carlo (MCMC) were sampling the posterior probability density function. You can tell there are two chains from the message that PyMC3 prints below the cell once you run it ("Sampling 2 chains ..."). They are run on two separate cores, in parallel, as specified by you using pm.sample(..., cores=2, ...). These are two independent estimators of the posterior, and if all is right, they should agree to a fair degree. They first ran for 2000 tuning steps, which gives the walkers the chance to find the region of high posterior probability density. Then another 2000 steps were taken by each walker in that region to properly sample the complete posterior.
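If you want to verify that the two chains indeed agree, ArviZ offers convergence diagnostics. A quick sketch (values of r_hat close to 1 indicate that the chains sample the same distribution):
# The Gelman-Rubin statistic (r_hat) compares the variance between and within chains
print(az.rhat(trace))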
The trace now contains all the information on what these walkers have done, and can be used to investigate the posterior, which describes the slope and intercept parameters that we are after! Some operations (like drawing new samples from the model) require going back into the model context, but for inspecting the trace itself that is not needed. Here are two examples of ways to get information on the posterior:
# We can get basic statistics of the posterior like this:
print(pm.summary(trace)) # A summary of the results for all stochastic variables
# Plotting functionality for traces is provided by the package Arviz
az.plot_trace(trace, figsize=(7, 7));
            mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  ess_tail  r_hat
Intercept  3.680  0.295   3.164    4.250      0.007    0.005    1569.0    1887.0    1.0
Slope      6.752  0.504   5.784    7.659      0.013    0.009    1583.0    1657.0    1.0
sigma      0.649  0.122   0.447    0.883      0.003    0.002    1849.0    1809.0    1.0
On the left, you see the marginal posterior PDFs for all stochastic variables. The two lines are for the two chains (two independent Markov Chains were used to sample the posterior). To the right you see the "traces", which describe how the value of the variables jumped up and down while sampling. The reason to plot a trace is that you want it to be flat and fuzzy (like a caterpillar), not bending up or down, and without "flat" regions where it isn't varying, because all such features would indicate that there was an issue while sampling.
Note that, rather than just the slope and the intercept, we get back three variables! The standard deviation of the normal distribution describing the errors in the data is also there.
One great advantage of this method is that you see the full shape of the posterior. Statsmodels would give you a mean and standard deviation of the estimates, under the assumption that everything is nicely Gaussian. Here, you get to see the actual shape of the distribution (of course, still under the assumptions defined in your model).
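To look at those marginal shapes more directly, together with their highest-density intervals, ArviZ provides a convenient plot (a sketch; hdi_prob sets the width of the reported interval):
# Marginal posteriors with their 94% highest-density intervals
az.plot_posterior(trace, hdi_prob=0.94);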
You can use samples from the posterior (points along the trace) to in fact "predict" values, like you would when doing machine learning. So when you want to draw several fit lines that fall within the uncertainties of the inference, you can do so by picking samples from the trace and using those posterior samples to construct the corresponding regression lines.
# We can sample the posterior to plot a bunch of reasonable fits, according to the posterior PDF
plt.figure(figsize=(7, 7))
plt.scatter(x, y, label='data')
# This plotting functionality is kindly provided by arviz, (for now) called from pymc3:
pm.plot_posterior_predictive_glm(trace, samples=20, lm=(lambda x, sample: sample['Intercept'] + sample['Slope'] * x),
label='posterior predictive regression lines')
plt.plot(x, true_regression_line, label='true regression line', lw=3., c='red')
plt.title('Posterior predictive regression lines')
plt.legend(loc=0)
plt.xlabel('x')
plt.ylabel('y');
/home/marcel/anaconda3/envs/bayes/lib/python3.8/site-packages/pymc3/plots/posteriorplot.py:59: DeprecationWarning: The `plot_posterior_predictive_glm` function will migrate to Arviz in a future release. Keep up to date with `ArviZ <https://arviz-devs.github.io/arviz/>`_ for future updates.
One might say that this method of doing regression is very (numerically) expensive: the MCMC sampling takes a lot longer than the statsmodels and scikit-learn fits above, and that is true. But besides this framework being much more general (meaning you can fit a whole slew of very complicated models in basically the same way), you also get some extra results for free that you don't get from the frequentist methods, as you will see for yourself in the next exercise.
1. Use the plot_pair function from ArviZ to make a "seaborn-style" pairplot of the posteriors of the three random variables.
2. Make a seaborn jointplot of the slope and intercept, using the posterior samples stored in the trace.
# %load solutions/bayesian_results.py
# 1
az.plot_pair(trace, kind='hexbin', figsize=(7,7));
# The top plot shows the anti-correlation. We can also show only this part and the marginal PDFs alongside with seaborn:
# 2
df = trace.posterior.to_dataframe()
sns.jointplot(x='Slope', y='Intercept', data=df, height=6);
When doing a linear fit on a data set like this, there is an anti-correlation between the slope and the intercept. This is easy to understand: if you pick your intercept "too high", you would need a shallower slope to still fit your data. Plotting the joint posterior PDFs makes that point clear immediately!
The marginal PDFs look nicely Gaussian, and their standard deviations are in fact very similar to those reported by statsmodels. Nevertheless, without knowing about this joint PDF, sampling from both independently would result in linear fits that do not occur in this posterior (and it can be argued that taking only results from this posterior is what you want)! Are there realizations in your solutions to the first set of exercises that would not occur in the Bayesian posterior predictive check? Most likely!
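The strength of this anti-correlation can also be read off directly from the posterior samples (a small sketch, reusing the dataframe df constructed in the solution above):
# Correlation matrix of the posterior samples of slope and intercept
print(df[['Slope', 'Intercept']].corr())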
Knowing the full joint posterior PDF of all of your random variables is one of the great advantages of using a Bayesian approach. Modeling the data generating process makes you think about your data, how it was generated, and how you should do inference. It generalizes to much more complex problems, too, as we will see in future episodes of Bite Size Data Science.
Obviously, the "best fit" (taking the mean or median from the posterior; the place of the highest posterior probability density is called maximum a posteriori, or MAP) is going to be very similar to the frequentist approach. This isn't always the case, as we might see in a later episode of Bite Size Data Sciene!
We have seen three different ways of performing a linear regression on a simple data set. Overall, the results agree, but the amount of information returned to you, and the way in which this information is presented, is very different. In the case of scikit-learn, the focus is clearly on future prediction, as opposed to statsmodels and PyMC3, which are more focused on the inference. Linear regression is a deterministic process and can be done fully analytically, but when using PyMC3, we sample a posterior rather than performing the matrix inversion that underlies the methods of statsmodels and scikit-learn. This is more expensive, and a little more work, but as an added bonus you get the anti-correlation between the inferred slope and intercept rubbed in your face so obviously that you can't miss it. Using a probabilistic programming language like PyMC3 for this inference problem seems like overkill, but the generalization to more difficult problems may make it worth putting this technique in your arsenal.
Marcel Haas, April, 2021
Thanks for supporting Bite Size Data Science! I hope you enjoyed this episode. Your opinion, suggestions and feedback are always welcome at datascience@marcelhaas.com.