If you are viewing this file in preview mode, some links won't work. Find the fully featured Jupyter Notebook file on the website of Prof. Jens Flemming at Zwickau University of Applied Sciences. This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.

Overfitting and regularization

Whenever we try to fit a model to a finite data set we have to find a compromise between two competing aims: on the one hand, the hypothesis should fit the given data as well as possible; on the other hand, it should capture the essential properties of the underlying truth, that is, it should generalize beyond the data at hand.

One problem is that data usually contains noise and thus does not provide arbitrarily precise information about the underlying truth. On the other hand, in most applications there is no single underlying truth. If some relevant features are not contained in the data set, then even a complete data set does not allow us to recover the underlying truth.

An example of the second issue is the prediction of prices, say house prices. The price depends on many features, which cannot be recorded completely. Thus, the data set might contain the same feature vector twice, but with different target values (prices). Which of the two is the better one, that is, which one contains more truth?

Fitting the data as well as possible is quite easy. The hard part is to avoid overfitting. By overfitting we mean neglecting the second aim: the hypothesis fits the data very well, but does not represent essential properties of the underlying truth.

Example

Let's have a look at an illustrative example. We consider data with only one feature, so we can plot everything and see the problem.

First some standard imports and initialization of the random number generator.
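The corresponding code cell is not reproduced here. A minimal sketch of such a setup, with assumed module choices and an arbitrarily fixed seed, could look like this:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# fix the seed to make the simulated data reproducible (seed value is arbitrary)
rng = np.random.default_rng(0)
```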

To investigate overfitting we choose an underlying truth and simulate data based on this truth. This way we have access to the truth, which is unknown in practice, and can compare predictions to it.

To simulate data we generate uniformly distributed arguments, calculate the corresponding true function values, and add some noise. We model the noise as normally distributed, which is by far the most common assumption.
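A sketch of the simulation, assuming the example truth $x\sin x$ on $[0, 10]$, 30 samples, and noise with standard deviation 1 (the actual choices in the lecture may differ):

```python
# underlying truth (assumed example function)
def f_true(x):
    return x * np.sin(x)

n = 30                                   # number of samples (assumption)
x = rng.uniform(0, 10, n)                # uniformly distributed arguments
y = f_true(x) + rng.normal(0, 1, n)      # true values plus normally distributed noise

# plot data and truth
x_plot = np.linspace(0, 10, 200)
plt.plot(x, y, 'ob', label='data')
plt.plot(x_plot, f_true(x_plot), '-g', label='truth')
plt.legend()
plt.show()
```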

We use polynomial regression to obtain a model explaining our data. Different degrees of the polynomial will yield very different results (try 1, 2, 5, 10, 15, 20, 25).
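A sketch of the fitting and plotting step, implementing polynomial regression as linear regression on polynomial features (plotting details are assumptions):

```python
degree = 5    # try 1, 2, 5, 10, 15, 20, 25

# polynomial regression = linear regression on polynomial features
poly = PolynomialFeatures(degree)
X = poly.fit_transform(x.reshape(-1, 1))
model = LinearRegression().fit(X, y)

# plot data, truth, and hypothesis
X_plot = poly.transform(x_plot.reshape(-1, 1))
plt.plot(x, y, 'ob', label='data')
plt.plot(x_plot, f_true(x_plot), '-g', label='truth')
plt.plot(x_plot, model.predict(X_plot), '-r', label='hypothesis')
plt.ylim(-12, 12)
plt.legend()
plt.show()
```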

Obviously, there is an optimal degree, say 4 or 5 or 6. The degree in polynomial regression is a hyperparameter. Techniques for choosing optimal hyperparameters will be discussed in a subsequent lecture.

For lower degrees our model is not versatile enough to grasp the truth's structure. For higher degrees we observe overfitting: the model adapts very well to the data points, but tends to oscillate to reach as many data points as possible. These oscillations are an artifact and not a characteristic of the underlying truth.

Before we discuss how to avoid overfitting, we have to think about a different issue: how to detect overfitting? In our illustrative example we know the underlying truth and can compare the hypothesis to the truth. But in practice we do not know the truth!

Detecting overfitting

We split our data set into two subsets: one for fitting the model and one for detecting overfitting. If our model is close to the (unknown) truth, then the error on both subsets should be almost identical. In case of overfitting the error on the subset used for fitting the model will be much smaller than on the withheld subset.

Here, the error is the mean squared distance between predictions and target values, known as the mean squared error (MSE): \begin{equation*} \frac{1}{n}\,\sum_{k=1}^n\bigl(f_{\mathrm{approx}}(x_k)-y_k\bigr)^2, \end{equation*} where $(x_1,y_1),\ldots,(x_n,y_n)$ are the samples from the considered subset.

Let's test this with the above example. First we split the data set.
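A simple sketch of the split, assuming half of the samples for training and half for validation (Scikit-Learn's train_test_split from the model_selection module would do the same job):

```python
# the arguments were generated in random order, so a plain index split is fine
n_train = n // 2
x_train, y_train = x[:n_train], y[:n_train]
x_val, y_val = x[n_train:], y[n_train:]
```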

Now we fit models for different degrees and plot corresponding errors on the training set and on the validation set.
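A sketch of the corresponding loop; the range of degrees is an arbitrary choice:

```python
degrees = range(1, 21)
train_errors, val_errors = [], []

for degree in degrees:
    poly = PolynomialFeatures(degree)
    X_train = poly.fit_transform(x_train.reshape(-1, 1))
    X_val = poly.transform(x_val.reshape(-1, 1))
    model = LinearRegression().fit(X_train, y_train)
    train_errors.append(mean_squared_error(y_train, model.predict(X_train)))
    val_errors.append(mean_squared_error(y_val, model.predict(X_val)))

plt.semilogy(degrees, train_errors, '-ob', label='training error')
plt.semilogy(degrees, val_errors, '-or', label='validation error')
plt.xlabel('degree')
plt.ylabel('MSE')
plt.legend()
plt.show()
```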

The higher the degree, the smaller the error on the training set, whereas the error on the validation set eventually grows again. Starting at degree 9 the gap between both errors widens. This shows that the small error on the training set is not the result of a well approximated truth, but stems from overfitting.

Here we also see that the error on the validation set is slightly larger than on the training set, because the hypothesis has been fitted to the training data.

Avoiding overfitting

Overfitting almost always goes along with very large parameter values after fitting the model. Thus, penalizing large parameter values should be a good idea.

Let's have a look at the parameters in our illustrative example for both cases: a good fit and overfitting.
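A sketch of such an inspection, assuming degree 5 as the good fit and degree 20 as the overfitting case:

```python
for degree in (5, 20):
    poly = PolynomialFeatures(degree)
    X_train = poly.fit_transform(x_train.reshape(-1, 1))
    model = LinearRegression().fit(X_train, y_train)
    # overfitting models typically show coefficients that are orders of magnitude larger
    print(f'degree {degree}: max |coefficient| = {np.abs(model.coef_).max():.2e}')
```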

In linear regression and most other methods one minimizes a loss function expressing the distance between the hypothesis $f_{\mathrm{approx}}$ and the targets in the training data: \begin{equation*} \frac{1}{n}\,\sum_{l=1}^n\bigl(f_{\mathrm{approx}}(x_l)-y_l\bigr)^2\to\min_{a_1,\ldots,a_\mu}, \end{equation*} where $a_1,\ldots,a_\mu$ are the parameters of the model. If we add the squares of the parameters to this function, then we not only force the hypothesis to be close to the data, but also ensure that the parameters cannot become too large. As mentioned above, large parameters correlate with overfitting. Modifying a minimization problem in this way is known as regularization.

To control the trade-off between data fitting and regularization, we introduce a regularization parameter $\alpha\geq 0$: \begin{equation*} \frac{1}{n}\,\sum_{l=1}^n\bigl(f_{\mathrm{approx}}(x_l)-y_l\bigr)^2 +\alpha\,\frac{1}{\mu}\,\sum_{\kappa=1}^\mu a_\kappa^2\to\min_{a_1,\ldots,a_\mu}. \end{equation*} The regularization parameter $\alpha$ is an additional hyperparameter of the model. How to choose hyperparameters will be described in a subsequent lecture.

There are several other penalty terms, which will be discussed below. Adding squares of the model parameters is the simplest version from the point of view of computational efficiency. Linear regression regularized this way is also known as Ridge regression.

Scikit-Learn implements Ridge regression in the linear_model module: Ridge.
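A sketch of regularized fitting with Ridge; note that Scikit-Learn minimizes the sum of squared residuals plus $\alpha$ times the sum of squared coefficients (without the factors $\frac{1}{n}$ and $\frac{1}{\mu}$, and without penalizing the intercept), so concrete values of $\alpha$ are not directly comparable to the formula above. Degree and $\alpha$ below are assumptions:

```python
degree = 20     # deliberately too high, so that regularization is needed
alpha = 0.1     # regularization parameter (assumed value)

poly = PolynomialFeatures(degree)
X_train = poly.fit_transform(x_train.reshape(-1, 1))
model = Ridge(alpha=alpha).fit(X_train, y_train)

# plot training data, truth, and regularized hypothesis
X_plot = poly.transform(x_plot.reshape(-1, 1))
plt.plot(x_train, y_train, 'ob', label='training data')
plt.plot(x_plot, f_true(x_plot), '-g', label='truth')
plt.plot(x_plot, model.predict(X_plot), '-r', label='regularized hypothesis')
plt.ylim(-12, 12)
plt.legend()
plt.show()
```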

In our example we know the underlying truth. Thus, we may compare predictions from the regularized model to the truth for different regularization parameters.
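A sketch of this comparison over a logarithmic grid of regularization parameters (the grid and the degree 20 features from above are assumptions):

```python
alphas = np.logspace(-6, 2, 20)
truth_errors = []

for alpha in alphas:
    model = Ridge(alpha=alpha).fit(X_train, y_train)
    # error with respect to the known truth, evaluated on a fine grid
    truth_errors.append(mean_squared_error(f_true(x_plot), model.predict(X_plot)))

plt.loglog(alphas, truth_errors, '-ob')
plt.xlabel('alpha')
plt.ylabel('MSE w.r.t. truth')
plt.show()
```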

For $\alpha$ close to zero overfitting leads to large errors. For large $\alpha$ model parameters are close to zero, which leads to very bad data fitting and, thus, to large errors, too (overregularization). Between both ends there is a local minimum, yielding the optimal $\alpha$.

In practice we do not know the truth. But analogously to detecting overfitting we may find values of $\alpha$ for which overfitting vanishes. We simply start with a very small $\alpha$ leading to overfitting. Then we increase $\alpha$ until training and validation data yield similar mean squared errors with respect to the hypothesis.
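A sketch of this procedure, reusing the grid of $\alpha$ values and the degree 20 features from above:

```python
train_errors, val_errors = [], []
X_val = poly.transform(x_val.reshape(-1, 1))

for alpha in alphas:
    model = Ridge(alpha=alpha).fit(X_train, y_train)
    train_errors.append(mean_squared_error(y_train, model.predict(X_train)))
    val_errors.append(mean_squared_error(y_val, model.predict(X_val)))

plt.loglog(alphas, train_errors, '-ob', label='training error')
plt.loglog(alphas, val_errors, '-or', label='validation error')
plt.xlabel('alpha')
plt.ylabel('MSE')
plt.legend()
plt.show()
```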

Note that in this way only values of $\alpha$ leading to overfitting can be detected. Overregularization does not lead to large differences between the errors. Thus, overregularization is indistinguishable from a good fit if only errors on training and validation sets are compared.

Besides adding squares of the model parameters there are several other choices for the penalty. Here we only consider two of them: the LASSO penalty, which uses the sum of the absolute values of the parameters, and the Elastic Net penalty, which combines squared and absolute values.

Regularized linear regression with LASSO and Elastic Net penalties is available in Scikit-Learn's linear_model module: Lasso, ElasticNet.
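A sketch of using both estimators, with $\alpha$, l1_ratio, and max_iter chosen arbitrarily (the high-degree polynomial features from above may trigger convergence warnings, which is harmless for this illustration):

```python
# LASSO: penalizes the sum of absolute coefficient values
lasso = Lasso(alpha=0.1, max_iter=100000).fit(X_train, y_train)

# Elastic Net: mixes squared and absolute penalties, weighted by l1_ratio
enet = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=100000).fit(X_train, y_train)

# compare both regularized hypotheses to data and truth
plt.plot(x_train, y_train, 'ob', label='training data')
plt.plot(x_plot, f_true(x_plot), '-g', label='truth')
plt.plot(x_plot, lasso.predict(X_plot), '-r', label='LASSO')
plt.plot(x_plot, enet.predict(X_plot), '-m', label='Elastic Net')
plt.ylim(-12, 12)
plt.legend()
plt.show()
```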