Non-linear decision boundary

Returning to classification, let us generate data that is not linearly separable. In this section, we show that linear classification extends to such data: the idea is to apply a sequence of nonlinear transformations (i.e. linear maps followed by activations) to the input \(\boldsymbol{\mathsf{x}}\) so that the final features \(f(\boldsymbol{\mathsf{x}})\) become linearly separable.

import torch 
torch.manual_seed(2)

N = 1500  # sample size per class
noise = lambda e: torch.randn(N, 2) * e
t = 2 * torch.pi * torch.rand(N, 1)   # angles for class 1 (outer ring)
s = 2 * torch.pi * torch.rand(N, 1)   # angles for class 0 (inner cluster)

x0 = torch.cat([0.1 * torch.cos(s), 0.1 * torch.sin(s)], dim=1) + noise(0.05)  # radius 0.1
x1 = torch.cat([1.0 * torch.cos(t), 1.0 * torch.sin(t)], dim=1) + noise(0.1)   # radius 1.0
y0 = torch.zeros(N).long()
y1 = torch.ones(N).long()
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt

plt.scatter(x0[:, 0], x0[:, 1], s=2.0, label=0, color="C0")
plt.scatter(x1[:, 0], x1[:, 1], s=2.0, label=1, color="C1")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.legend()
plt.axis('equal');
Fig. The two classes in input space: class 0 forms a small cluster near the origin and class 1 forms a ring of radius 1 around it, so the data is not linearly separable.

We model this with a fully-connected neural network that has one hidden layer whose units use the tanh activation: \(\text{tanh}(z) = \frac{e^{z} - e^{-z}}{e^{z} + e^{-z}}.\) This maps \(\mathbb{R}\) onto \((-1, 1)\) symmetrically, with \(\text{tanh}(0) = 0\) and \(\text{tanh}(z) \to \pm 1\) as \(z \to \pm\infty.\) Note that tanh is just a scaled and translated version of the sigmoid: \(\text{tanh}(z) = 2\sigma(2z) - 1.\)
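
As a quick check of this identity (a minimal sketch; the test points `z` below are arbitrary):

# verify tanh(z) = 2 * sigmoid(2z) - 1 numerically
z = torch.linspace(-5, 5, steps=101)
print(torch.allclose(torch.tanh(z), 2 * torch.sigmoid(2 * z) - 1, atol=1e-6))  # expected: True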

import torch.nn as nn
from torchsummary import summary

model = lambda: nn.Sequential(
    nn.Linear(2, 3), nn.Tanh(),   # 2 inputs -> 3 hidden units with tanh
    nn.Linear(3, 2)               # 3 hidden units -> 2 class logits
)

summary(model(), input_size=(2,))
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Linear-1                    [-1, 3]               9
              Tanh-2                    [-1, 3]               0
            Linear-3                    [-1, 2]               8
================================================================
Total params: 17
Trainable params: 17
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------
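
The parameter counts follow from the layer shapes. The network computes \(f(\boldsymbol{\mathsf{x}}) = \boldsymbol{\mathsf{W}}_2\, \text{tanh}(\boldsymbol{\mathsf{W}}_1 \boldsymbol{\mathsf{x}} + \boldsymbol{\mathsf{b}}_1) + \boldsymbol{\mathsf{b}}_2\) with \(\boldsymbol{\mathsf{W}}_1 \in \mathbb{R}^{3 \times 2}\), \(\boldsymbol{\mathsf{b}}_1 \in \mathbb{R}^{3}\), \(\boldsymbol{\mathsf{W}}_2 \in \mathbb{R}^{2 \times 3}\), and \(\boldsymbol{\mathsf{b}}_2 \in \mathbb{R}^{2}\), so the hidden layer has \(2 \cdot 3 + 3 = 9\) parameters and the output layer has \(3 \cdot 2 + 2 = 8\). A minimal check (the instance `m` below is only for illustration):

m = model()
print([p.numel() for p in m.parameters()])      # [6, 3, 6, 2]
print(sum(p.numel() for p in m.parameters()))   # 17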

We train with full-batch gradient descent on the cross-entropy loss, which is equivalent to the negative log-likelihood (NLL) of the softmax probabilities:

import torch.nn.functional as F

net = model()
optim = torch.optim.SGD(net.parameters(), lr=0.01)

x = torch.cat([x0, x1])
y = torch.cat([y0, y1])
history = {"accs": [], "loss": []}
for step in range(25000):
    s = net(x)
    loss = F.cross_entropy(s, y)
    loss.backward()
    optim.step()
    optim.zero_grad()
    history["loss"].append(loss.item())
    history["accs"].append(100 * (y == torch.argmax(s, dim=1)).float().mean())
fig, ax1 = plt.subplots(figsize=(8, 4))

ax1.plot(history["loss"], color="blue", linewidth=2)
ax1.set_xlabel("step")
ax1.ticklabel_format(axis="x", style="sci", scilimits=(3, 3))
ax1.grid(axis="both", linestyle="dotted", alpha=0.8)

ax2 = ax1.twinx()
ax2.plot(history["accs"], color="red", linewidth=2)

ax1.set_ylabel("Batch train loss")
ax2.set_ylabel("Batch train acc (%)")
ax1.yaxis.label.set_color("blue")
ax2.yaxis.label.set_color("red");
Fig. Batch train loss (blue, left axis) and batch train accuracy in % (red, right axis) per gradient step.
import numpy as np

# transformations
with torch.no_grad():
    linear_0 = net[0](x0)
    linear_1 = net[0](x1)
    linear_act_0 = net[1](net[0](x0))
    linear_act_1 = net[1](net[0](x1))

    # separating hyperplane in feature space (see above discussion):
    # fuse the two output rows, w <- w1 - w0 and b <- b1 - b0, as in logistic regression
    h = 1  # index of the positive class
    w, b = net[2].parameters()
    w, b = (w[h] - w[h-1]), (b[h] - b[h-1])

# plot
fig = plt.figure(figsize=(12, 4))
ax0 = fig.add_subplot(131)
ax1 = fig.add_subplot(132, projection='3d')
ax2 = fig.add_subplot(133, projection='3d')

ax0.grid(alpha=0.8, linestyle="dotted")
ax0.set_axisbelow(True)
ax0.scatter(x0[:, 0], x0[:, 1], s=2.0, label=0, color="C0")
ax0.scatter(x1[:, 0], x1[:, 1], s=2.0, label=1, color="C1")
ax0.set_xlabel("$x_1$")
ax0.set_ylabel("$x_2$")
ax0.set_xlim(-1.5, 1.5)
ax0.set_ylim(-1.5, 1.5)
ax0.set_title("(a) input")
ax0.legend()
ax0.axis('equal')

ax1.scatter(linear_0[:, 0], linear_0[:, 1], linear_0[:, 2], s=3, label=0, color="C0")
ax1.scatter(linear_1[:, 0], linear_1[:, 1], linear_1[:, 2], s=3, label=1, color="C1")
ax1.set_xlabel('$x_1$')
ax1.set_ylabel('$x_2$')
ax1.set_zlabel('$x_3$')
ax1.set_title('(b) linear')

ax2.scatter(linear_act_0[:, 0], linear_act_0[:, 1], linear_act_0[:, 2], s=3, label=0, color="C0")
ax2.scatter(linear_act_1[:, 0], linear_act_1[:, 1], linear_act_1[:, 2], s=3, label=1, color="C1")
ax2.set_xlabel('$x_1$')
ax2.set_ylabel('$x_2$')
ax2.set_zlabel('$x_3$')
ax2.set_title('(c) linear + tanh')

# Generate grid of points
x_min = min(linear_act_1[:, 0].min(), linear_act_0[:, 0].min())
x_max = max(linear_act_1[:, 0].max(), linear_act_0[:, 0].max())
y_min = min(linear_act_1[:, 1].min(), linear_act_0[:, 1].min())
y_max = max(linear_act_1[:, 1].max(), linear_act_0[:, 1].max())
a, b, c, d = w[0], w[1], w[2], b  # plane a*X + b*Y + c*Z + d = 0; d takes the bias before the name b is rebound
x = np.linspace(x_min, x_max, 50)
y = np.linspace(y_min, y_max, 50)
X, Y = np.meshgrid(x, y)
Z = (-a * X - b * Y - d) / c

# Plot the hyperplane for the positive class
ax2.plot_surface(X, Y, Z, alpha=0.5, color=f"C{h}")
fig.tight_layout();
Fig. (a) Input data; (b) features after the hidden linear layer; (c) features after tanh, shown with the learned separating hyperplane in feature space.

Remark. In general, a weight vector is learned for each class, so that there are as many separating hyperplanes as there are classes. For binary classification with softmax, the two weight vectors and biases can be fused, \(\boldsymbol{\mathsf{w}} = \boldsymbol{\mathsf{w}}_1 - \boldsymbol{\mathsf{w}}_0\) and \(b = b_1 - b_0\), resulting in a single separating plane (this is what the code above computes).
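
A minimal check of this remark on the trained network (the names `inputs`, `feats`, and `score` below are only for illustration): the sign of the fused score \((\boldsymbol{\mathsf{w}}_1 - \boldsymbol{\mathsf{w}}_0)^\top f(\boldsymbol{\mathsf{x}}) + (b_1 - b_0)\) should agree with the argmax over the two logits.

with torch.no_grad():
    inputs = torch.cat([x0, x1])                        # all training points
    feats = net[1](net[0](inputs))                      # hidden features tanh(W1 x + b1)
    W2, b2 = net[2].weight, net[2].bias
    score = feats @ (W2[1] - W2[0]) + (b2[1] - b2[0])   # fused score s1 - s0
    agree = (score > 0).long() == torch.argmax(net(inputs), dim=1)
print(agree.float().mean().item())  # expected: 1.0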

Checking classification accuracy:

a = (torch.argmax(net(x0), dim=1) == y0).float().mean().item()
b = (torch.argmax(net(x1), dim=1) == y1).float().mean().item()
a, b
(1.0, 1.0)

Finally, we plot the decision boundary in the input space. Note that we have to convert the logits to probabilities with softmax:

from matplotlib.colors import LinearSegmentedColormap

# define custom colormap
colors = ["C0", "C1"]
n_bins = 100
cm = LinearSegmentedColormap.from_list(name="", colors=colors, N=n_bins)

# create a grid of points
N = 100
x = np.linspace(-1.5, 1.5, N)
y = np.linspace(-1.5, 1.5, N)
X, Y = np.meshgrid(x, y)

# calculate p(y=1 | x) for each point in the grid
Z = np.zeros_like(X)
with torch.no_grad():
    for i in range(N):
        for j in range(N):
            out = F.softmax(net(torch.tensor([[float(X[i, j]), float(Y[i, j])]])), dim=1)
            Z[i, j] = out[0, 1].item()

# create a color plot
plt.pcolormesh(X, Y, Z, shading="auto", cmap=cm, rasterized=True)
plt.colorbar()

plt.scatter(x0[:, 0], x0[:, 1], s=10.0, label=0, color="C0", edgecolor="black")
plt.scatter(x1[:, 0], x1[:, 1], s=10.0, label=1, color="C1", edgecolor="black")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.legend()
plt.axis("equal");

plt.xlim(-1.5, 1.5)
plt.ylim(-1.5, 1.5)
plt.show();

Fig. Probability assigned to class 1 by the trained network over the input space. Note that we were able to extend linear classification to learning a nonlinear decision boundary. Using the ReLU activation here instead of tanh results in a boundary with sharp corners; this can be thought of as a manifestation of inductive bias.
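
To see this, one could repeat the experiment with ReLU; a minimal sketch under the same data and training setup as above (only the activation changes, and `relu_net` is just an illustrative name):

relu_net = nn.Sequential(
    nn.Linear(2, 3), nn.ReLU(),   # swap tanh for ReLU
    nn.Linear(3, 2)
)
optim = torch.optim.SGD(relu_net.parameters(), lr=0.01)
inputs, labels = torch.cat([x0, x1]), torch.cat([y0, y1])
for step in range(25000):
    loss = F.cross_entropy(relu_net(inputs), labels)
    loss.backward()
    optim.step()
    optim.zero_grad()

# Re-running the probability plot above with relu_net in place of net should
# show a decision boundary with sharp corners.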