Counting n-grams

We develop a simple language model based on n-grams.

import math
import torch
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from matplotlib_inline import backend_inline

FRAC_LIMIT = 0.3
PAD_TOKEN = "."

DATASET_DIR = Path("./data").absolute()
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

warnings.simplefilter(action="ignore")
backend_inline.set_matplotlib_formats("svg")

Getting the dataset:

import os
if not os.path.isfile("./data/surnames_freq_ge_100.csv"):
    !wget -O ./data/surnames_freq_ge_100.csv https://raw.githubusercontent.com/particle1331/spanish-names-surnames/master/surnames_freq_ge_100.csv
    !wget -O ./data/surnames_freq_ge_20_le_99.csv https://raw.githubusercontent.com/particle1331/spanish-names-surnames/master/surnames_freq_ge_20_le_99.csv
else:
    print("Data files already exist.")
Data files already exist.

Loading and preprocessing into a list of strings:

def load_surnames(frac: float = FRAC_LIMIT, min_len=2) -> list[str]:
    """Load shuffled surnames from files into a list."""

    col = ["surname", "frequency_first", "frequency_second", "frequency_both"]
    filepaths = ["surnames_freq_ge_100.csv", "surnames_freq_ge_20_le_99.csv"]
    dfs = [pd.read_csv(DATASET_DIR / f, names=col, header=0) for f in filepaths]
    df = pd.concat(dfs, axis=0)[["surname"]].sample(frac=frac)
    df = df.reset_index(drop=True)
    df["surname"] = df["surname"].map(lambda s: s.lower())
    df["surname"] = df["surname"].map(lambda s: s.replace("de la", "dela"))
    df["surname"] = df["surname"].map(lambda s: s.replace(" ", "_"))
    df = df[["surname"]].dropna().astype(str)

    names = [
        n for n in df.surname.tolist() 
        if ("'" not in n) and ('ç' not in n) and (len(n) >= min_len)
    ]
    
    return names
names = load_surnames()
for j in range(5):
    print(names[j])
durana
dela_azuela
atlassi
irulegi
stockwell

We will tokenize the names at the character level:

from collections import Counter

name_lengths = Counter([len(n) for n in names])
corpus = "".join(names)
print("num chars:  ", len(set(corpus)))
print("total names:", len(names))
print("total chars:", len(corpus))
num chars:   28
total names: 22136
total chars: 164611
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
vocab = Vocab(corpus, preprocess=False)
ax[0].bar(x=[tok for tok, _ in vocab.token_freqs], height=[freq for _, freq in vocab.token_freqs])
ax[0].set_xlabel("character", fontsize=10)
ax[0].set_ylabel("count", fontsize=10)

ax[1].set_xlabel("name length")
ax[1].set_ylabel("count")
ax[1].bar(name_lengths.keys(), name_lengths.values(), color="C1")

fig.tight_layout();
Figure: character frequency counts (left) and name length distribution (right).

Spanish surnames in this dataset have a median length of 7 characters (the underscore that replaces a space counts as one character):

print("range: ", [min(name_lengths.keys()), max(name_lengths.keys())])
print("median: ", int(np.median(sorted([len(n) for n in names]))))
range:  [2, 25]
median:  7

Defining the dataset of context-target pairs. These are constructed by sliding a window of fixed size over each name to get \((\boldsymbol{\mathsf{x}}_{i}, \ldots, \boldsymbol{\mathsf{x}}_{i + c - 1}) \mapsto \boldsymbol{\mathsf{x}}_{i + c}\) pairs, where \(i\) is the position in the name and \(c\) is the block size. Moreover, each name is padded with the token "." marking its start and end, e.g. "...durana." for context size 3. Note that all inputs have the same size.
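
For illustration, here is the windowing applied to a single name. This is a minimal sketch that only uses the padding token defined above; the `CharDataset` class below does the same thing for the full list of names:

# sliding a window of size 3 over a padded name
name, block_size = "durana", 3
context = PAD_TOKEN * block_size          # start context "..."
for target in name + PAD_TOKEN:           # final "." marks the end of the name
    print(context, "-->", target)
    context = context[1:] + target        # slide the window by one character
... --> d
..d --> u
.du --> r
dur --> a
ura --> n
ran --> a
ana --> .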

import torch
from typing import List, Optional
from torch.utils.data import Dataset

class CharDataset(Dataset):
    def __init__(self, 
        names: List[str], 
        block_size: int,
        vocab: Optional[Vocab] = None
    ):
        self.block_size = block_size
        self.vocab = vocab or Vocab(text="".join(names), preprocess=False, reserved_tokens=[PAD_TOKEN])
        self.tokenizer = Tokenizer(self.vocab)
        self.xs, self.ys = self.samples(names)

    def vocab_size(self):
        return len(self.vocab)
    
    def __len__(self):
        return len(self.xs)
    
    def __getitem__(self, i: int):
        x = self.tokenizer.encode(self.xs[i])
        y = self.tokenizer.encode(self.ys[i])[0]
        return x, y

    def samples(self, names: List[str]):
        xs, ys = [], []
        for name in names:
            context = PAD_TOKEN * self.block_size
            for c in name + PAD_TOKEN:
                xs.append(context)
                ys.append(c)
                context = context[1:] + c
        return xs, ys

Using this we can create sequence datasets of any fixed context size to predict the next character:

dataset = CharDataset(names, block_size=3)
x, y = zip(*[dataset[i] for i in range(7)])
pd.DataFrame({"xs": dataset.xs[:7], "ys": dataset.ys[:7], "x": list(map(lambda e: e.tolist(), x)), "y": list(map(lambda e: e.item(), y))})
    xs ys            x   y
0  ...  d    [1, 1, 1]  13
1  ..d  u   [1, 1, 13]  10
2  .du  r  [1, 13, 10]   4
3  dur  a  [13, 10, 4]   2
4  ura  n   [10, 4, 2]   8
5  ran  a    [4, 2, 8]   2
6  ana  .    [2, 8, 2]   1
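
Since each name of length \(\ell\) contributes \(\ell + 1\) pairs (one per character, plus the terminating pad), the dataset size should equal the total character count plus the number of names. A quick sanity check based on the counts computed above:

print(len(dataset), len(corpus) + len(names))   # the two numbers should agree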

Frequency table

Bigrams are simply input-output pairs with context size 1. The following constructs the table of character bigrams appearing in the training names. Entry [i, j] of the array below counts how often the ith character is followed by the jth character. Padding tokens are also counted.

split_point = int(0.80 * len(names))
names_train = names[:split_point]
names_valid = names[split_point:]
vocab = Vocab(text="".join(names), preprocess=False, reserved_tokens=[PAD_TOKEN])
bigram_train = CharDataset(names_train, block_size=1, vocab=vocab)
bigram_valid = CharDataset(names_valid, block_size=1, vocab=vocab)

# count matrix
n = bigram_train.vocab_size()
N2 = torch.zeros((n, n), dtype=torch.int32)
for x, y in bigram_train:
    N2[x[0].item(), y.item()] += 1

Remark. Note that we use a single vocabulary for both training and validation, built from the combined list of names.
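
To read off an individual count, we can encode the two characters with the dataset's tokenizer and index the matrix directly. This is a small sketch that assumes the `Tokenizer.encode` API from the previous sections (returning a tensor of indices):

# count of the bigram "qu" in the training names
i = bigram_train.tokenizer.encode("q")[0].item()
j = bigram_train.tokenizer.encode("u")[0].item()
print(N2[i, j].item())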

# viz (ignore <unk>)
plt.figure(figsize=(18, 18))
plt.imshow(N2[1:, 1:], cmap='Blues')
for i in range(N2.shape[0] - 1):
    for j in range(N2.shape[1] - 1):
        chstr = bigram_train.tokenizer.decode(i + 1) + bigram_train.tokenizer.decode(j + 1)
        plt.text(j, i, chstr, ha="center", va="bottom", color="gray")
        plt.text(j, i, N2[1:, 1:][i, j].item(), ha="center", va="top", color="gray")

plt.axis("off");
Figure: heatmap of the bigram count matrix, annotated with each character pair and its count (<unk> excluded).

Remark. The count table for general n-grams is an array with \(| \mathcal{V} |^n\) entries. Since the number of possible n-grams grows exponentially with n while observed n-gram frequencies follow Zipf's law, this array becomes increasingly sparse as n increases.
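
We can check this on our data by measuring the fraction of zero entries in count tensors of increasing order. This is a rough sketch that builds the tensors the same way as `N2` above (runtime and memory grow quickly with the order):

# fraction of unobserved n-grams grows quickly with n
for block_size in [1, 2, 3]:
    ds = CharDataset(names_train, block_size=block_size, vocab=vocab)
    N = torch.zeros([len(vocab)] * (block_size + 1), dtype=torch.int32)
    for x, y in ds:
        N[tuple(x)][y] += 1
    print(f"n = {block_size + 1}: {(N == 0).float().mean().item():.4f} of entries are zero")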


Modeling counts

The frequency table above generalizes to \(n\)-grams. A context \(\boldsymbol{\mathsf{x}}\), represented as an \((n - 1)\)-tuple of characters, indexes the table to give the count distribution over the target token. The model outputs the distribution \(f(\boldsymbol{\mathsf{x}}) = p(\cdot \mid \boldsymbol{\mathsf{x}})\) by smoothing and normalizing the counts.
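
Concretely, with counts \(N[\boldsymbol{\mathsf{x}}, y]\) of target \(y\) following context \(\boldsymbol{\mathsf{x}}\), the model uses the Laplace-smoothed estimate \(p(y \mid \boldsymbol{\mathsf{x}}) = \frac{N[\boldsymbol{\mathsf{x}}, y] + \alpha}{\sum_{y'} \left( N[\boldsymbol{\mathsf{x}}, y'] + \alpha \right)}\), so every character gets nonzero probability even for contexts with few or no observed continuations. This corresponds to the `fit` method below with smoothing parameter `alpha`.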

class CountingModel:
    def __init__(self, block_size: int, vocab_size: int, alpha=0.01):
        """Model of observed n-grams to estimate next char probability."""
        self.P = None                    # cond. prob
        self.N = None                    # counts
        self.alpha = alpha               # laplace smoothing
        self.block_size = block_size
        self.vocab_size = vocab_size

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        # tuple(x.T) = ([x11, x21, x31], [x12, x22, x32]) 
        # i.e. len = block_size, num entries = B
        # then, P[tuple(x.T)][b] == P[xb1, xb2], so output has shape (B, vocab_size) 
        return torch.tensor(self.P[tuple(x.T)])    

    def fit(self, dataset: CharDataset):
        v = self.vocab_size
        n = self.block_size + 1     # +1 for output dim
        self.N = torch.zeros([v] * n, dtype=torch.int32)  
        for x, y in dataset:
            self.N[tuple(x)][y] += 1

        a = self.alpha
        self.P = (self.N + a) / (self.N + a).sum(dim=-1, keepdim=True)

    def evaluate(self, dataset: CharDataset):
        loss = 0.0
        for x, y in dataset:
            loss += -torch.log(self(x[None, :])[0, y]).item()
        return loss / len(dataset)

Fitting a bigram model which has a context of one character:

vocab = Vocab(text="".join(names), preprocess=False, reserved_tokens=[PAD_TOKEN])
bigram_train = CharDataset(names_train, block_size=1, vocab=vocab)
bigram_valid = CharDataset(names_valid, block_size=1, vocab=vocab)

bigram_model = CountingModel(block_size=1, vocab_size=len(vocab))
bigram_model.fit(bigram_train)
bigram_model.evaluate(bigram_valid)
2.510331458524424

The learned conditional probabilities can be visualized for the bigram model:

plt.imshow(bigram_model.P);    # yellow pixel is for “qu” ☺
Figure: heatmap of the learned bigram conditional probabilities P.

Similarly, we can fit a trigram model as follows:

trigram_train = CharDataset(names_train, block_size=2, vocab=vocab)
trigram_valid = CharDataset(names_valid, block_size=2, vocab=vocab)
trigram_model = CountingModel(block_size=2, vocab_size=len(vocab))
trigram_model.fit(trigram_train)

# Sample prediction and evals
p = trigram_model(torch.tensor([[1, 1], [2, 2]]))
print(p.shape, p.sum(dim=1).view(-1))
print(trigram_model.evaluate(trigram_train))
print(trigram_model.evaluate(trigram_valid))
torch.Size([2, 30]) tensor([1.0000, 1.0000])
2.267962301462323
2.3533043515255345
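
The losses above are average negative log-likelihoods in nats; exponentiating gives the perplexity, i.e. the effective number of characters the model is choosing between at each step (a quick sketch using the values reported above):

print(math.exp(bigram_model.evaluate(bigram_valid)))    # exp(2.51) ≈ 12.3
print(math.exp(trigram_model.evaluate(trigram_valid)))  # exp(2.35) ≈ 10.5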

Both models are better than an untrained baseline that assigns uniform probability to each character, whose cross-entropy is the log of the number of characters:

import math
math.log(len(vocab) - 2)    # -2 for <unk> and padding
3.332204510175204

Note that already with a block size of 3 (i.e. a 4-gram model), the gap between training and validation loss widens, so generalization starts to drop:

ngram_train = CharDataset(names_train, block_size=3, vocab=vocab)
ngram_valid = CharDataset(names_valid, block_size=3, vocab=vocab)
ngram_model = CountingModel(block_size=3, vocab_size=len(vocab))
ngram_model.fit(ngram_train)

# Sample prediction and evals
print(ngram_model.evaluate(ngram_train))
print(ngram_model.evaluate(ngram_valid))
1.9255481190171766
2.4070117212841913
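
To make the overfitting trend explicit, we can sweep the context size and look at the train-validation gap directly. This is a sketch reusing the classes above; the individual losses for these context sizes were already computed above:

# train-valid gap as a function of context size (n-gram order = block_size + 1)
for block_size in [1, 2, 3]:
    train = CharDataset(names_train, block_size=block_size, vocab=vocab)
    valid = CharDataset(names_valid, block_size=block_size, vocab=vocab)
    model = CountingModel(block_size=block_size, vocab_size=len(vocab))
    model.fit(train)
    gap = model.evaluate(valid) - model.evaluate(train)
    print(f"context size {block_size}: train-valid gap = {gap:.3f}")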

Generating names

For concreteness, we will consider a trigram model. First, a character is sampled from a multinomial distribution[1] given the start context "..". Each sampled character, say e, is appended to the context. Since our trigram model only uses the last 2 characters, the relevant context becomes .e. This is repeated until another . is sampled, signaling the end of the name.
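
A single step of this process can be inspected directly before wrapping it into a function. This is a small sketch that queries the trigram model with the start context ".." and looks at the most likely first characters (assuming the tokenizer API from above):

x = trigram_train.tokenizer.encode("..").view(1, -1)
p = trigram_model(x)[0]                      # p(. | "..") over the vocabulary
top = torch.topk(p, k=5)
print([trigram_train.tokenizer.decode(i.item()) for i in top.indices])
print(top.values)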

def generate_name(
    model, 
    dataset: CharDataset, 
    min_len=2,
    max_len=100, 
    g=None, seed=2718
):
    """Generate names from a Markov process with cond prob from model."""
    if g is None:
        g = torch.Generator().manual_seed(seed)
    
    context = PAD_TOKEN * dataset.block_size
    out = []
    while len(out) < max_len:
        x = dataset.tokenizer.encode(context).view(1, -1)
        p = model(x)[0]
        j = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        c = dataset.tokenizer.decode(j)
        if c == PAD_TOKEN:
            if len(out) >= min_len:
                break
            else:
                continue
        
        out.append(c)
        context = context[1:] + c
    
    return "".join(out)

Sampling based on bigram counts:

g = torch.Generator().manual_seed(0)
for i in range(10):
    print(generate_name(bigram_model, bigram_train, min_len=3, max_len=60, g=g))
pelaza
trogrgoson_qussata
bularockelaya
nousabeganderamiledous
marezuz
frdedhadaileon
caraalugurenujao
ndertye
cia
dancial

Sampling based on trigram counts seems to give better results:

g = torch.Generator().manual_seed(0)
for i in range(10):
    print(generate_name(trigram_model, trigram_train, min_len=3, max_len=60, g=g))
pelaza
torogbomez_de_santa
bulcarci
layo
nabonesabentelardalledouguichez
piniñedhamaillon
caraalugerowskao
nov
tre
caridancial