Assume we observe data that look as shown below.
import matplotlib.pyplot as plt
import numpy as np
# This is the true data generating process.
# We do not have access to this information in a typical machine learning problem
inputs = np.linspace(-5,5,50)
outputs = inputs*0.5+3
labels = outputs + np.random.randn(inputs.shape[0])
plt.plot(inputs,labels,'bo')
plt.plot(np.linspace(-5,5,500),
np.linspace(-5,5,500)*0.5+3,'k-')
plt.xlabel("inputs (x)")
plt.ylabel("labels (y)")
plt.show()
Visual inspection tells us that there is a strong linear relationship between the inputs and the outputs. Let us therefore choose our hypothesis set accordingly:
$\mathcal{H} := \{h: h(x) = w_0 + w_1 x, (w_0,w_1) \in \mathbb{R}^2 \}$.
Denoting $w := (w_0,w_1)$, we can re-express the hypothesis as $h(x) := w^\top x$, where $\top$ indicates a matrix transpose and the input is understood to be augmented with a constant, $x \mapsto (x, 1)$ (made explicit below).
Assume we are given a training set $S=\{(x_i, y_i) : i \in [m]\}$ and the squared-error loss:
$L_S(w) := \sum_{i \in [m]} (w^\top x_i - y_i)^2$.
Let us perform empirical risk minimization:
$w_S := \arg \min_w L_S(w)$.
The squared-error loss is convex, so it attains its minimum at the point where its gradient is zero. Hence, we can find the solution $w_S$ that satisfies $\nabla_w L_S(w) \vert_{w := w_S} = 0$, that is
\begin{align*} L_S(w) &= \sum_{i \in [m]} \Big ( w^\top x_i x_i^\top w - 2 w^\top x_i y_i + y_i^2\Big )\\ \Rightarrow \nabla_w L_S(w) \vert_{w:=w_S} &= \sum_{i \in [m]} \Big ( 2x_i x_i^\top w_S - 2 x_i y_i \Big ) \triangleq 0\\ \Rightarrow & \sum_{i \in [m]} x_i x_i^\top w_S = \sum_{i \in [m]} x_i y_i \\ \Rightarrow & w_S = \Bigg ( \sum_{i \in [m]} x_i x_i^\top \Bigg )^{-1} \sum_{i \in [m]} x_i y_i \\ \end{align*}
Fitting a linear model to data in this way is often referred to as least-squares regression, and $w_S$ as the least-squares solution.
Let us switch to the vector notation for convenience. Denote each data point by a column vector $z_i := (x_i, 1)$. Expressing the linear model as $w^\top z_i$ and using the definition of the dot product we have
$w^\top z_i = x_i w_1 + w_0$
where the term $w_0$ is referred to as the bias. The bias gives the model the opportunity to shift the predictor to an appropriate position before fitting the line. As we will see later, this flexibility sometimes brings substantial improvement in model fit.
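To make the role of the bias concrete, here is a minimal illustrative sketch (not part of the lecture code) that regenerates data from the same process as the plot above and compares least-squares fits with and without the constant feature. The seed and the helper names (suffixed with _demo) are arbitrary choices; since the true intercept is 3, the bias-free fit is expected to be noticeably worse.
import numpy as np
# Regenerate data from the same process as above (true line: y = 0.5*x + 3)
rng = np.random.default_rng(0)
inputs_demo = np.linspace(-5, 5, 50)
labels_demo = 0.5 * inputs_demo + 3 + rng.standard_normal(inputs_demo.shape[0])
def least_squares(Z, y):
    # w_S = (Z^T Z)^{-1} Z^T y
    return np.linalg.inv(Z.T @ Z) @ Z.T @ y
# Without the constant feature the fitted line is forced through the origin
Z_no_bias = inputs_demo[:, None]
w_no_bias = least_squares(Z_no_bias, labels_demo)
# With the constant feature z_i = (x_i, 1) the line can shift vertically
Z_bias = np.column_stack([inputs_demo, np.ones_like(inputs_demo)])
w_bias = least_squares(Z_bias, labels_demo)
print("RMSE without bias:", np.sqrt(np.mean((Z_no_bias @ w_no_bias - labels_demo)**2)))
print("RMSE with bias:   ", np.sqrt(np.mean((Z_bias @ w_bias - labels_demo)**2)))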
Let us collect all input samples into a matrix $Z \in \mathbb{R}^{m \times 2}$ that contains the data points on its rows:
\begin{align*} Z=\begin{bmatrix} z_1^\top\\ \vdots\\ z_m^\top \end{bmatrix} \end{align*}
Let us also collect the corresponding labels in a vector $y := (y_1, \ldots, y_m)$. We can re-express the least-squares solution in vector notation as follows
$w_S := (Z^\top Z)^{-1} Z^\top y$.
Let us implement it and solve the task.
num_samples = inputs.shape[0]
num_train_samples = num_samples
idx = np.random.permutation(num_samples)
inputs_train = inputs[idx[:num_train_samples]]
labels_train = labels[idx[:num_train_samples]]
class LinearRegression:
    def __init__(self):
        pass

    def extract_features(self, inputs):
        m = inputs.shape[0]
        inputs = np.expand_dims(inputs, axis=1)
        input_features = np.concatenate((inputs, np.ones([m,1])), axis=1)
        return input_features

    def learn(self, inputs, labels):
        input_features = self.extract_features(inputs)
        # This is where the least-squares solution is implemented!
        XtX_inv = np.linalg.inv(input_features.T.dot(input_features))
        self.weights = XtX_inv.dot(input_features.T).dot(labels)

    def predict(self, inputs):
        input_features = self.extract_features(inputs)
        predictions = input_features.dot(self.weights)
        return predictions
model = LinearRegression()
model.learn(inputs_train, labels_train)
predictions = model.predict(np.linspace(-5,5,500))
plt.plot(inputs,labels,'bo')
plt.plot(np.linspace(-5,5,500),
np.linspace(-5,5,500)*0.5+3,'k-',
label='true labeling function')
plt.plot(np.linspace(-5,5,500),
predictions ,'r-', label='learned hypothesis')
plt.legend(loc="upper left")
plt.xlabel("inputs (x)")
plt.ylabel("labels (y)")
plt.show()
Metric spaces
We would like the vector spaces used in machine learning (such as feature spaces, parameter spaces, etc.) to have some plausible properties. We can guarantee these properties by equipping a continuous domain $X$ with a distance function $dist: X \times X \rightarrow \mathbb{R}^+$ that is
- strict: $\forall a, a' \in X, a \neq a' \Rightarrow dist(a,a') > 0$,
- reflexive: $\forall a \in X, dist(a,a) = 0$,
- symmetric: $\forall a, a' \in X, dist(a,a') = dist(a',a)$.
We get these properties by assuming these vector spaces to be metric spaces, that is, spaces whose distance function additionally satisfies
- $\forall a, b, c \in X, dist(a,c) \leq dist(a,b) + dist(b,c)$, which is called the triangle inequality.
For $p \geq 1$, the $L_p$-norm of a vector $u \in \mathbb{R}^k$ is defined as follows (the formula can also be evaluated for $0 < p < 1$, but it then violates the triangle inequality and is no longer a norm):
$|| u ||_p = \Big ( \sum_{j=1}^k |u_j|^p \Big )^{1/p}$.
If $p=2$, we get the well-known Euclidean norm.
If $p=1$, we get the Manhattan norm.
As $p \rightarrow \infty$, we recover the maximum norm ($L_\infty$), i.e. $||u||_\infty := \max \{|u_1|, \ldots, |u_k|\}$.
We can use these norms to define corresponding distance functions, and hence metric spaces:
$dist(a,b) = ||a-b||_p$.
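As a quick numerical sanity check (an illustrative sketch, not part of the lecture code), the snippet below evaluates $dist(a,b) = ||a-b||_p$ for a few values of $p$ and confirms that the triangle inequality holds for $p \geq 1$ but can fail for $p < 1$; the test points are chosen arbitrarily.
import numpy as np
def lp_dist(a, b, p):
    # Distance induced by the L_p formula: ||a - b||_p
    if np.isinf(p):
        return np.max(np.abs(a - b))
    return np.sum(np.abs(a - b)**p)**(1./p)
a = np.array([0., 0.])
b = np.array([1., 0.])
c = np.array([1., 1.])
for p in [0.5, 1, 2, np.inf]:
    lhs = lp_dist(a, c, p)                     # dist(a, c)
    rhs = lp_dist(a, b, p) + lp_dist(b, c, p)  # dist(a, b) + dist(b, c)
    print("p={}: dist(a,c)={:.2f} <= {:.2f} ? {}".format(p, lhs, rhs, lhs <= rhs))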
Let us plot the behavior of these distances as a function of $p$. The red lines below are the iso-contours $||a||_p=1$ of a two-dimensional vector $a = (a_1, a_2)$ for different choices of $p$. As $p$ decreases, the unit ball shrinks, and for $p < 1$ it is no longer convex.
p_values = [0.5, 1, 2, 3, 7, np.inf]
a1, a2 = np.meshgrid(np.linspace(-1.2, 1.2, num=101),
                     np.linspace(-1.2, 1.2, num=101))
fig, axes = plt.subplots(ncols=(len(p_values) + 1) // 2,
                         nrows=2, figsize=(14, 7))
for p, ax in zip(p_values, axes.flat):
    if p == 0:
        # Special case: count non-zero coordinates (only used if 0 is in p_values)
        zz = (a1 != 0).astype(int) + (a2 != 0).astype(int)
        ax.imshow(zz, cmap='bwr', extent=(a1.min(), a1.max(),
                                          a2.min(), a2.max()), aspect="auto")
    else:
        if np.isinf(p):
            zz = np.maximum(np.abs(a1), np.abs(a2))
        else:
            zz = (np.abs(a1)**p + np.abs(a2)**p)**(1./p)
        ax.contour(a1, a2, zz, [1], colors='red', linewidths=2)
    ax.set_title("p= {0}".format(p))
plt.show()
Regularized least squares
We have seen in Lecture 1 that $L_S(h_S)=0$ can be achieved by memorizing the training set, resulting in overfitting. Let us then assume a hypothesis space $H$ of finite size $|H|$ and devise a loss function that pursues two goals at once: fitting the data as well as possible while constraining model complexity:
$h_S := \arg \min_{h \in H} L_S(h)+\lambda |H|$.
Here $\lambda$ is called a regularization coefficient and the term $|H|$ a regularizer. As we will see in more detail later, this paradigm is called Structural Risk Minimization, one of the biggest achievements of machine learning research in the pre-deep-learning era.
Let us next apply the idea of constraining the hypothesis space to the least squares problem:
$h_S := \arg \min_w \sum_{i \in [m]} (w^\top x_i - y_i)^2$
$\text{s.t.}~~||w||_p^p \leq \eta$
This constrained optimization problem can be expressed equivalently as:
$h_S := \arg \min_w \max_{\lambda \geq 0} \sum_{i \in [m]} (w^\top x_i - y_i)^2 + \lambda (||w||_p^p - \eta)$
We do not have an analytical solution to this problem for arbitrary $p$. By fixing $\lambda$ to a suitably large constant, dropping the inner $\max$ problem, and discarding the constant $-\lambda \eta$ term, we obtain a reasonable approximation referred to as regularized least squares:
$h_S := \arg \min_w \sum_{i \in [m]} (w^\top x_i - y_i)^2 + \lambda ||w||_p^p$
The special case of $p=2$ deserves detailed investigation:
$h_S := \arg \min_w \sum_{i \in [m]} (w^\top x_i - y_i)^2 + \lambda ||w||_2^2$.
We can rewrite the loss of this optimization in vector form as:
$L_S(w) := (Z w-y)^\top (Z w-y) + \lambda w^\top w$.
Let us find the optimal weights that minimize the loss by setting its gradient to zero once again:
\begin{align*} L_S(w) &= w^\top Z^\top Z w - 2 w^\top Z^\top y + y^\top y + \lambda w^\top w\\ &= w^\top (Z^\top Z + \lambda I )w - 2 w^\top Z^\top y + y^\top y\\ \Rightarrow~ & \nabla_w L_S(w) = 2(Z^\top Z + \lambda I )w - 2 Z^\top y \triangleq 0\\ \Rightarrow~ & w_S = (Z^\top Z + \lambda I )^{-1} Z^\top y. \end{align*}
The result, known as ridge regression, differs from the least-squares solution only by the additional $\lambda I$ term. Methods of this kind, which shrink towards zero the parameters that do not contribute to the prediction of the target quantity, are known in statistics as parameter shrinkage methods. The regularizer of ridge regression is also referred to as weight decay, a name still in active use in modern deep learning libraries.
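To see the shrinkage effect explicitly, consider the idealized special case of orthonormal features, $Z^\top Z = I$ (an assumption made purely for illustration). Then
\begin{align*} w_S^{\text{ridge}} = (Z^\top Z + \lambda I)^{-1} Z^\top y = \frac{1}{1+\lambda} Z^\top y = \frac{1}{1+\lambda}\, w_S^{\text{LS}}, \end{align*}
so every least-squares weight is scaled down by the same factor $1/(1+\lambda)$: the weights decay towards zero as $\lambda$ grows, yet none of them is set exactly to zero.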
Let us next see ridge regression in action.
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd
class RidgeRegression:
    def __init__(self, n_dims):
        pass

    def extract_features(self, inputs):
        m = inputs.shape[0]
        input_features = np.concatenate((inputs, np.ones([m,1])), axis=1)
        return input_features

    def learn(self, inputs, labels, lambda_coef=0.1):
        input_features = self.extract_features(inputs)
        XtX_inv = np.linalg.inv(input_features.T.dot(input_features)
                                + lambda_coef*np.eye(input_features.shape[1]))
        self.weights = XtX_inv.dot(input_features.T).dot(labels)

    def predict(self, inputs):
        input_features = self.extract_features(inputs)
        predictions = input_features.dot(self.weights)
        return predictions
This time we work on the Diabetes data set that consists of 442 data points. Each data point represents a patient with the following 10 features:
- age
- sex
- body-mass index
- average blood pressure
- total serum cholesterol
- low-density lipoprotein level
- high-density lipoprotein level
- total cholesterol level
- possibly log of serum triglycerides level
- blood sugar level
The goal is to predict a quantitative measure of disease progression; the higher the value, the more severe the disease.
d = load_diabetes()
df = pd.DataFrame(d.data, columns=d.feature_names)
df['disease'] = d.target
#df.head(3)
n = len(df)
X = df.drop('disease',axis=1).values
y = df['disease'].values
# This is a nice scikit-learn function
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.20) # hold out 20%
print("Means:")
print(X_train.mean(axis=0))
print("Variances:")
print(X_train.var(axis=0))
Means:
[-2.56121130e-04  1.53405796e-03 -1.23796484e-04 -9.89120277e-04 -1.11151410e-04 -2.98140914e-04  2.15611279e-04  9.11707862e-05 -2.31720551e-04  8.81882825e-04]
Variances:
[0.0022517  0.00226935 0.0023025  0.00216993 0.00227635 0.00231144 0.00225041 0.00228229 0.00233905 0.00216888]
Notably, different features have different scales. This may cause artifacts in model fitting, such as prioritization of features according to their scales instead of their relevance to the predicted quantity of interest. We can mitigate these artifacts by normalizing the data. One commonplace approach is z-score normalization, which assumes that each feature is normally distributed with some mean $\mu$ and variance $\sigma^2$. That is, each dimension $x_d$ of a $D$-dimensional observation $x$ comes from a sampling process as below:
$\epsilon \sim N(0,1),$ $x_d = \mu_d + \sigma_d \epsilon$.
This model assumes that all observed differences across individual samples (called factors of variation) come from the first step, i.e. a draw from a standard normal distribution. The second step merely scales and shifts this sample. Z-score normalization reverses the second step to bring all features back to the scale of the first:
$x'_d := \dfrac{x_d - \mu_d}{\sigma_d}, \forall d \in [D]$.
For $\mu_d$ and $\sigma_d$, we use the sample mean and standard deviation of the corresponding feature. This preprocessing step is also referred to as standardization. We will revisit its probability-theoretic justification later.
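As a small self-contained sanity check (illustrative only, not part of the lecture code), we can simulate the assumed sampling process for a single feature and verify that z-score normalization approximately recovers the standard-normal factor of variation; the values of $\mu_d$ and $\sigma_d$ below are arbitrary.
import numpy as np
# Simulate x_d = mu_d + sigma_d * eps for one feature, then undo the second step
rng = np.random.default_rng(0)
mu_d, sigma_d = 150.0, 20.0
eps = rng.standard_normal(10000)
x_d = mu_d + sigma_d * eps
x_d_normalized = (x_d - x_d.mean()) / x_d.std()
print("mean after normalization:", x_d_normalized.mean())  # approximately 0
print("std  after normalization:", x_d_normalized.std())   # approximately 1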
# z-score normalization
m = np.mean(X_train,axis=0)
std = np.std(X_train,axis=0)
X_train = (X_train-m)/std
X_test = (X_test-m)/std
print("Means:")
print(X_train.mean(axis=0))
print("Variances:")
print(X_train.var(axis=0))
Means:
[-1.47820063e-17  1.25804309e-17 -9.43532316e-18 -3.74267818e-17  2.17012433e-17 -2.13867325e-17 -6.79343267e-17  1.73609946e-16  4.46605296e-17 -5.27591820e-17]
Variances:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Now we are ready to train and test ridge regression on the Diabetes data set.
model_ridge = RidgeRegression(n_dims=X_train.shape[1])
model_ridge.learn(X_train, y_train, lambda_coef=1)
predictions = model_ridge.predict(X_train)
train_error = np.sqrt(((predictions - y_train)**2).mean())
print("Train RMSE: {:.2f}".format(train_error))
predictions = model_ridge.predict(X_test)
test_error = np.sqrt(((predictions - y_test)**2).mean())
print("Test RMSE: {:.2f}".format(test_error))
Train RMSE: 53.70
Test RMSE: 53.76
We have a decent result. But can we do even better? Can we sparsify our solution even further? Inspired by the properties of the metric spaces illustrated above, we can next try out $p=1$. For $D$-dimensional input vectors $x_i$, the resulting loss function reads:
$L_S(w) := \sum_{i \in [m]} (w^\top x_i - y_i)^2 + \lambda \sum_{j=0}^D |w_j|$.
This approach is known as Least Absolute Shrinkage and Selection Operator (LASSO) regression (R. Tibshirani, 1996).
Unlike ridge regression and least squares, the Lasso loss does not have an analytical solution. That is, no analytical formula exists to calculate a $w$ that would satisfy
$\nabla_w \Big ( \sum_{i \in [m]} (w^\top x_i - y_i)^2 + \lambda \sum_{j=0}^D |w_j| \Big ) := 0$.
If we cannot jump straight to the value that minimizes the loss, we can instead find the direction of decrease and take a step towards it. Remember that the gradient of a function points in the direction in which the function grows fastest. Negating it should therefore point us towards a neighboring position with lower loss than our current one. It is reasonable to hope that repeatedly taking small steps towards neighboring points with lower loss will eventually bring us to a point where the loss is minimal. Consider a sequence $w_0, w_1, \ldots$ created by the following succession rule and some arbitrary initialization $w_0$:
$w_{t+1} := w_t - \alpha \nabla_w L_S(w) \vert_{w:=w_t}$
This approach is called gradient descent and is used in nearly all modern machine learning methods. The coefficient $\alpha>0$ is called the learning rate. It determines how much the model parameters change in a single iteration. Too large a learning rate may cause the iterates to overshoot and oscillate around the optimal solution; too small a learning rate may result in infeasibly long training times.
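Before delegating the gradient computation to a library, here is a minimal sketch (not part of the lecture code) of gradient descent on the unregularized least-squares loss, applied to data regenerated from the first example. The gradient is derived by hand, and the learning rate, seed, and iteration count are arbitrary illustrative choices.
import numpy as np
# Data from the same process as the first example (true line: y = 0.5*x + 3)
rng = np.random.default_rng(0)
x_demo = np.linspace(-5, 5, 50)
y_demo = 0.5 * x_demo + 3 + rng.standard_normal(x_demo.shape[0])
Z_demo = np.column_stack([x_demo, np.ones_like(x_demo)])  # features z_i = (x_i, 1)
w = np.zeros(2)    # arbitrary initialization w_0
alpha = 0.001      # learning rate
for t in range(1000):
    grad = 2 * Z_demo.T @ (Z_demo @ w - y_demo)  # gradient of sum_i (w^T z_i - y_i)^2
    w = w - alpha * grad                         # w_{t+1} = w_t - alpha * gradient
print("learned (w_1, w_0):", w)                  # approaches the true (0.5, 3)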
Gradient descent requires evaluating the gradient of the loss with respect to the parameters at every iteration: $\nabla_w L_S(w) \vert_{w:=w_t}$. Implementing it on a computer therefore requires an analytical calculation of this gradient, which may be tedious for complex loss functions. Deep learning libraries such as PyTorch and TensorFlow automate this process via automatic differentiation. See an example PyTorch implementation of Lasso regression below.
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
class LassoRegression(nn.Module):
    def __init__(self, n_dims, lambda_coef=1):
        super(LassoRegression, self).__init__()
        self.lambda_coef = lambda_coef
        self.emp_risk = nn.MSELoss()
        self.weight = nn.Parameter(th.randn((n_dims,1)))
        self.bias = nn.Parameter(th.randn((1)))

    def predict(self, input):
        return input @ self.weight + self.bias

    def learn(self, inputs, labels, num_steps=1):
        # The in-built Stochastic Gradient Descent optimizer
        # The argument "lr" sets the learning rate
        optimizer = torch.optim.SGD(self.parameters(), lr=0.01)
        for ii in range(num_steps):
            # Predict with the current weight values
            # This step is called a "forward pass"
            predictions = self.predict(inputs)
            loss = ((predictions - labels)**2).mean() \
                   + self.weight.abs().sum()*self.lambda_coef
            # Clear the gradient values remaining from
            # the previous iteration
            optimizer.zero_grad()
            # Compute the new gradient values
            # This step is called the "backward pass"
            loss.backward()
            # Take the gradient descent step
            optimizer.step()
# Convert data into the Torch format
X_train = torch.tensor(X_train).float()
X_test = torch.tensor(X_test).float()
y_train = torch.tensor(y_train).float().reshape(-1,1)
y_test = torch.tensor(y_test).float().reshape(-1,1)
# z-score normalization
m = th.mean(X_train,axis=0)
std = th.std(X_train,axis=0)
X_train = (X_train-m)/std
X_test = (X_test-m)/std
# Train our model
model_lasso = LassoRegression(n_dims=X_train.shape[1], lambda_coef=1)
# Number of gradient descent iterations
num_iterations = 250
# Collect the train and test errors here.
train_errors = np.zeros(num_iterations)
test_errors = np.zeros(num_iterations)
for ii in range(num_iterations):
    model_lasso.learn(X_train, y_train)
    predictions = model_lasso.predict(X_train)
    train_error = ((predictions - y_train)**2).mean().sqrt()
    train_errors[ii] = train_error.detach().numpy()
    # Test our model
    predictions = model_lasso.predict(X_test)
    test_error = ((predictions - y_test)**2).mean().sqrt()
    test_errors[ii] = test_error.detach().numpy()
# Plot the learning curve
plt.plot(np.arange(num_iterations),train_errors,'b-', label="Train RMSE")
plt.plot(np.arange(num_iterations),test_errors,'r-', label="Test RMSE")
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.legend(loc="upper right")
plt.show()
The figure above is called the learning curve. It depicts the evolution of model performance throughout the learning process.
REMARK: Learning curves give plenty of information about the model behavior. Hence, plotting and visually inspecting them facilitates debugging.
Let us next see how much shrinkage we gained from the Lasso and ridge regression regularizers. As visible below, Lasso sparsifies the parameters more than ridge does.
fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(14, 7))
axes[0].bar(np.arange(X_train.shape[1]),
model_lasso.weight.view(-1).detach().numpy())
axes[0].set_ylim(-40,40)
axes[0].set_title("Lasso weights")
axes[1].bar(np.arange(X_train.shape[1]),
model_ridge.weights[:10].squeeze())
axes[1].set_ylim(-40,40)
axes[1].set_title("Ridge weights")
plt.show()