PCA Demo¶

In [1]:
# Standard imports
import numpy as np
from numpy import cos, sin
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook

Create a data cloud¶

In [2]:
def RotMat(t0, t1, t2):
    '''
    RotMat(tx, ty, tz)
    tx, ty and tz are the rotation angles about the x-, y- and z-axes, respectively
    (in radians). The output is a 3x3 rotation matrix.
    '''
    M = np.zeros([3,3])
    M = np.array([[1., 0, 0],[0, cos(t0), -sin(t0)],[0, sin(t0), cos(t0)]])
    M = np.dot(np.array([[cos(t1), 0, sin(t1)],[0, 1., 0],[-sin(t1), 0, cos(t1)]]), M)
    M = np.dot(np.array([[cos(t2), -sin(t2), 0],[sin(t2), cos(t2), 0],[0, 0, 1.]]), M)
    return M
In [3]:
sigma0 = 1.5  # standard deviation along x-axis
sigma1 = 0.2  # standard deviation along y-axis
sigma2 = 0.5  # standard deviation along z-axis

n_points = 500
M = np.diag([sigma0, sigma1, sigma2])
A = np.random.normal(0.0, 1.0, size=[3,n_points])
A = M @ A  # scale the random points by standard deviations
A = RotMat(0.5, 0.2, -0.3) @ A  # rotation, if desired
In [4]:
np.shape(A)
Out[4]:
(3, 500)

View Data Cloud in 3-D¶

In [5]:
fig = plt.figure(1)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(A[0,:], A[1,:], A[2,:], marker='.');
plt.xlabel('x'); plt.ylabel('y'); ax.set_zlabel('z');
plt.axis('equal');
plt.title('The cloud is like an elongated pancake');

SVD of data matrix¶

In [6]:
U, S, VT = np.linalg.svd(A, full_matrices=False)
In [7]:
print(U.shape)
print(VT.shape)
(3, 3)
(3, 500)
In [8]:
U.T @ U
Out[8]:
array([[ 1.00000000e+00, -3.18128297e-17, -4.41406642e-17],
       [-3.18128297e-17,  1.00000000e+00, -6.47149823e-18],
       [-4.41406642e-17, -6.47149823e-18,  1.00000000e+00]])
In [9]:
VT @ VT.T
Out[9]:
array([[ 1.00000000e+00,  4.35849273e-17, -6.24500451e-17],
       [ 4.35849273e-17,  1.00000000e+00, -3.36753195e-16],
       [-6.24500451e-17, -3.36753195e-16,  1.00000000e+00]])
In [10]:
S
Out[10]:
array([33.00998878, 11.53522009,  4.45818728])
In [11]:
S/np.sqrt(n_points)
Out[11]:
array([1.47625158, 0.51587073, 0.1993762 ])
In [12]:
np.diag(S)
Out[12]:
array([[33.00998878,  0.        ,  0.        ],
       [ 0.        , 11.53522009,  0.        ],
       [ 0.        ,  0.        ,  4.45818728]])

Semi-axes¶

In [13]:
US = 2./np.sqrt(n_points) * U @ np.diag(S)
In [16]:
fig = plt.figure(2)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(A[0,:], A[1,:], A[2,:], marker='.', color=(0.2,0.4,0.8,0.2));
plt.axis('equal')
plt.xlabel('x'); plt.ylabel('y'); ax.set_zlabel('z');

c = ['r', 'g', 'b']
minbound = -3
maxbound = 3
for idx in range(3):
    plt.plot([0, US[0,idx]], [0, US[1,idx]], [0, US[2,idx]], c[idx], linewidth=3)
ax.auto_scale_xyz(*[[minbound, maxbound]]*3)
ax.axis('scaled');

Low-Rank Approximation¶

In [17]:
k = 2

from copy import deepcopy
Sk = deepcopy(S)
Sk[2] = 0.
print(Sk)

Ak = U @ np.diag(Sk) @ VT
[33.00998878 11.53522009  0.        ]

3D (flattened) Data Cloud¶

In [18]:
fig = plt.figure(3)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Ak[0,:], Ak[1,:], Ak[2,:], marker='.');
plt.xlabel('x'); plt.ylabel('y'); ax.set_zlabel('z');
plt.axis('equal');
plt.title('Looking at the subspace end-on');

Express data in terms of Principal Components¶

In [19]:
# Project points into 2D using ...
C = np.diag(Sk) @ VT
In [20]:
# Or just use the 1st 2 rows of...
C = U.T @ A
In [21]:
plt.figure()
plt.plot(C[0,:], C[1,:], '.'); plt.axis('equal');

Compression¶

In [22]:
f = plt.imread('statue.jpg')
f = np.array(f, dtype=float)
np.shape(f)
plt.figure(5)
plt.clf()
plt.imshow(f, cmap='gray');
In [23]:
U, S, V = np.linalg.svd(f, full_matrices=0)
plt.figure(6)
plt.plot(S)
plt.xlabel('Index')
plt.ylabel('Singular Value');

Now let's take a reduced-rank approximation by setting a bunch of singular values to zero.

In [24]:
r = 25   # how many singular values to keep
Sr = S.copy()
Sr[r:] = 0
fr = U@np.diag(Sr)@V  # reconstitute low-rank matrix
In [25]:
plt.figure(7)
plt.clf()
plt.imshow(fr, cmap='gray');
In [26]:
total_original_data = np.shape(f)[0] * np.shape(f)[1]
print(f'Total original data = {total_original_data}')
total_data = (np.shape(U)[0] + np.shape(V)[1] + 1)*r
print(f'Total data = {total_data}')
print(f'Compression ratio = {float(total_data)/total_original_data}')
Total original data = 67500
Total data = 13150
Compression ratio = 0.1948148148148148
In [ ]: