Classification by Neural Network¶

Prelims¶

In [1]:
import numpy as np
from dataset import *
from copy import deepcopy
import matplotlib.pyplot as plt
from tqdm import tqdm
In [2]:
%load_ext autoreload
%autoreload 2
In [3]:
def logistic(z):
    return 1./( 1. + np.exp(-z) )

def logistic_grad(z):
    y = logistic(z)
    return y*(1. - y)


def categorical_ce(y, t):
    return -np.sum(t * np.log(y))

def categorical_ce_grad(y, t):
    return -t / y


def softmax(z):
    expz = np.exp(z)
    sumz = np.sum(expz)
    return expz / sumz

def softmax_grad(z, s):
    # s (the gradient from the layer above) has only one non-zero element,
    # because the target is one-hot and categorical_ce_grad returns -t/y
    idx = np.nonzero(s)[0][0]  # index of the non-zero element of s
    y = softmax(z)
    kronecker = np.zeros_like(s)
    kronecker[idx] = 1.
    return s[idx]*y[idx]*(kronecker-y)
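
The softmax above exponentiates z directly, which can overflow if the logits get large. A common numerically-safer variant (a sketch only; the cells below keep the plain version, which is fine for the small values in this notebook) subtracts the maximum logit first, which leaves the result unchanged because softmax is invariant to adding a constant to z:

def softmax_stable(z):
    # Subtracting max(z) does not change the output, but keeps exp() from overflowing
    expz = np.exp(z - np.max(z))
    return expz / np.sum(expz)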

Dataset¶

In [4]:
ds = Annuli(n=1000, noise=1.)
ds.plot()
# or...
#ds = UClasses(n=1000)
#ds.plot()
In [5]:
X = ds.inputs()
X[:5]
Out[5]:
array([[ 0.50914045,  0.13859321],
       [ 0.0576307 , -0.00659747],
       [ 0.74830528,  0.26346251],
       [-0.24609827, -0.87633696],
       [-0.53753603, -0.09685067]])
In [6]:
T = ds.targets()
T[:5]
Out[6]:
array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])
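
Each row of T is a one-hot encoding of the sample's class (three classes here), which is the form categorical_ce expects. A quick sanity check (not in the original cells):

# Every target row should contain a single 1
print(np.all(T.sum(axis=1) == 1.))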

Create a network¶

In [8]:
# How many neurons do we want in the hidden layer?
hidden_dim = 10

# Set up the connection weights and biases

# input -> hidden
W1 = np.random.normal(size=(hidden_dim, ds.input_dim))
b1 = np.zeros((hidden_dim,))

# hidden -> output
W2 = np.random.normal(size=(ds.n_classes, hidden_dim))
b2 = np.zeros((ds.n_classes,))
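
With two input dimensions (as the preview of X above shows), ten hidden units and three classes, the parameter shapes follow directly from the lines above; printing them is an easy sanity check:

print(W1.shape, b1.shape)   # expect (10, 2) and (10,)
print(W2.shape, b2.shape)   # expect (3, 10) and (3,)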

Forward pass of the network¶

In [9]:
x = X[0]  # input
t = T[0]  # target

alpha = W1@x + b1    # input -> alpha
h = logistic(alpha)  # alpha -> h
beta = W2@h + b2     # h -> beta
y = softmax(beta)    # beta -> y

# Compute the loss between y (output) and t (target)
loss = categorical_ce(y, t)
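
The same four lines are reused for evaluation and training below, so they could also be wrapped in a small helper. This is only a sketch (the name forward is a choice made here); the later cells keep the explicit version:

def forward(x, W1, b1, W2, b2):
    h = logistic(W1@x + b1)      # hidden activations
    return softmax(W2@h + b2)    # class probabilities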

How good is the initial network?¶

In [10]:
# Run the inputs through the (untrained) network
Y = []
for x, t in zip(X, T):

    alpha = W1@x + b1
    h = logistic(alpha)
    beta = W2@h + b2
    Y.append(softmax(beta))

# Plot the dataset using the network's outputs as labels
ds.plot(labels=Y);

Backpropagate the gradient of the loss¶

In [11]:
# Feedforward pass first
x = X[0]  # input
t = T[0]  # target

alpha = W1@x + b1    # input -> alpha
h = logistic(alpha)  # alpha -> h
beta = W2@h + b2     # h -> beta
y = softmax(beta)    # beta -> y

# Compute the loss between y (output) and t (target)
loss = categorical_ce(y, t)


# Backward pass
# Gradient of loss w.r.t. y
dloss_dy = categorical_ce_grad(y, t)
# Gradient of loss w.r.t. beta
dloss_dbeta = softmax_grad(beta, dloss_dy)

# Gradient of loss w.r.t. W2
dloss_dW2 = np.outer(dloss_dbeta, h)
# Gradient of loss w.r.t. b2
dloss_db2 = dloss_dbeta

# Gradient of loss w.r.t. h
dloss_dh = W2.T @ dloss_dbeta
# Gradient of loss w.r.t. alpha
dloss_dalpha = dloss_dh * logistic_grad(alpha)

# Gradient of loss w.r.t. W1
dloss_dW1 = np.outer(dloss_dalpha, x)
# Gradient of loss w.r.t. b1
dloss_db1 = dloss_dalpha
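
A quick way to check the backpropagated gradients is a finite-difference comparison: perturb one weight, recompute the loss, and compare the numerical slope with the corresponding entry of dloss_dW1. A minimal sketch (the helper name fd_check and the step size eps are choices made here):

def fd_check(i, j, eps=1e-5):
    # Central-difference estimate of d(loss)/d(W1[i, j])
    W1p = W1.copy(); W1p[i, j] += eps
    W1m = W1.copy(); W1m[i, j] -= eps
    loss_p = categorical_ce(softmax(W2 @ logistic(W1p@x + b1) + b2), t)
    loss_m = categorical_ce(softmax(W2 @ logistic(W1m@x + b1) + b2), t)
    return (loss_p - loss_m) / (2*eps)

# The two numbers should agree to several decimal places
print(fd_check(0, 0), dloss_dW1[0, 0])
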
In [12]:
print(beta)
print(dloss_dbeta)
[ 0.54110102 -0.78924242 -1.30526665]
[ 0.70313807  0.18590017 -0.88903824]
In [13]:
print(f'{W1} \n')
print(dloss_dW1)
[[-0.55517567 -0.60343372]
 [ 0.93826216 -0.46268204]
 [ 0.44757177 -0.34370798]
 [-0.16619815 -2.00738823]
 [-0.53361728  3.57107671]
 [ 0.02343947  1.14840791]
 [-1.27741013  0.86057669]
 [-0.24788056  0.20720883]
 [-1.82013999 -0.22142245]
 [ 1.06336751  0.5737013 ]] 

[[-0.0190782  -0.00519328]
 [-0.14372557 -0.03912356]
 [ 0.30395695  0.08274017]
 [ 0.1075365   0.02927253]
 [ 0.0052815   0.00143768]
 [-0.02893441 -0.00787624]
 [ 0.02614044  0.00711569]
 [-0.07583874 -0.02064408]
 [ 0.09778822  0.02661895]
 [ 0.09511591  0.02589152]]

Train the network¶

In [14]:
n_epochs = 500
kappa = 1.   # learning rate

losses = []

for epoch in tqdm(range(n_epochs)):
    dloss_dW1 = np.zeros_like(W1)
    dloss_db1 = np.zeros_like(b1)
    dloss_dW2 = np.zeros_like(W2)
    dloss_db2 = np.zeros_like(b2)

    loss = 0.
    
    for x,t in zip(X,T):

        # Forward pass
        alpha = W1@x + b1
        h = logistic(alpha)
        beta = W2@h + b2
        y = softmax(beta)

        # Compute the loss
        loss += categorical_ce(y, t)

        # Backpropagate the gradient of the loss
        # Gradient: loss -> y -> beta
        dloss_dy = categorical_ce_grad(y, t)
        dloss_dbeta = softmax_grad(beta, dloss_dy)

        # Gradient of loss w.r.t. W2 and b2
        dloss_dW2 += np.outer(dloss_dbeta, h)
        dloss_db2 += dloss_dbeta

        # Gradient: beta -> h -> alpha
        dloss_dh = W2.T @ dloss_dbeta
        dloss_dalpha = dloss_dh * logistic_grad(alpha)

        # Gradient of loss w.r.t. W1 and b1
        dloss_dW1 += np.outer(dloss_dalpha, x)
        dloss_db1 += dloss_dalpha

    # Gradient-descent update using the gradient averaged over the dataset
    W1 -= kappa * dloss_dW1 / len(X)
    b1 -= kappa * dloss_db1 / len(X)
    W2 -= kappa * dloss_dW2 / len(X)
    b2 -= kappa * dloss_db2 / len(X)
    losses.append(loss / len(X))
    

plt.plot(losses); plt.xlabel('Epoch'); plt.ylabel('Loss');
100%|██████████| 500/500 [00:21<00:00, 23.79it/s]
In [15]:
# Run the inputs through the trained network
Y = []
for x,t in zip(X,T):
    alpha = W1@x + b1
    h = logistic(alpha)
    beta = W2@h + b2
    Y.append(softmax(beta))

print(f'Target: {T[0]}')
print(f'Output: {Y[0]}')

# Plot the dataset using the network's output as labels
ds.plot(labels=Y);
Target: [0. 0. 1.]
Output: [0.22907674 0.5291186  0.24180465]
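
The per-sample loop above can also be written as two matrix products over the whole dataset, which is considerably faster. A sketch (note the transposes and the row-wise softmax normalisation):

H = logistic(X @ W1.T + b1)                  # hidden activations, shape (n, hidden_dim)
B = H @ W2.T + b2                            # logits, shape (n, n_classes)
E = np.exp(B)
Y_batch = E / E.sum(axis=1, keepdims=True)   # row-wise softmax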

Model's accuracy¶

In [16]:
# Count correct
correct = 0
for t,y in zip(T,Y):
    k = np.argmax(t)
    j = np.argmax(y)
    if k==j:
        correct += 1
# Print accuracy as percent
print(f'Accuracy = {correct/len(ds)*100}%')
Accuracy = 94.39999999999999%
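
The same count can be computed in one line by comparing argmax over the class axis (equivalent to the loop above):

accuracy = np.mean(np.argmax(T, axis=1) == np.argmax(np.array(Y), axis=1))
print(f'Accuracy = {accuracy*100:.1f}%')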