Attention Mechanism#


4D example: cosine similarity#

import numpy as np

X = np.array([1.0, 2.0, 3.0, 4.0])
Y = np.array([1.2, 2.1, 2.9, 3.7])

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

cos = cosine_similarity(X, Y)
angle = np.degrees(np.arccos(cos))

print("X: {}".format(X))
print("Y: {}".format(Y))
print("dot(X, Y): {:.2f}".format(np.dot(X, Y)))
print("||X||: {:.2f}".format(np.linalg.norm(X)))
print("||Y||: {:.2f}".format(np.linalg.norm(Y)))
print("cosine: {:.4f}".format(cos))
print("angle degrees: {:.2f}".format(angle))
X: [1. 2. 3. 4.]
Y: [1.2 2.1 2.9 3.7]
dot(X, Y): 28.90
||X||: 5.48
||Y||: 5.29
cosine: 0.9980
angle degrees: 3.59
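
Written out, this is the standard cosine-similarity formula applied to the values printed above:

$$
\cos\theta = \frac{X \cdot Y}{\lVert X \rVert \, \lVert Y \rVert}
= \frac{28.90}{5.48 \times 5.29} \approx 0.998,
\qquad
\theta = \arccos(0.998) \approx 3.6^\circ
$$

(using the unrounded norms gives the 0.9980 printed above).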

After normalizing to unit vectors#

X_unit = X / np.linalg.norm(X)
Y_unit = Y / np.linalg.norm(Y)

print("X_unit: {}".format(np.round(X_unit, 3)))
print("Y_unit: {}".format(np.round(Y_unit, 3)))

print("dot(X_unit, Y_unit): {:.4f}".format(np.dot(X_unit, Y_unit)))
print("cosine(X_unit, Y_unit): {:.4f}".format(cosine_similarity(X_unit, Y_unit)))
X_unit: [0.183 0.365 0.548 0.73 ]
Y_unit: [0.227 0.397 0.549 0.7  ]
dot(X_unit, Y_unit): 0.9980
cosine(X_unit, Y_unit): 0.9980
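
Because unit vectors have norm 1, the denominator of the cosine formula drops out, which is why the plain dot product and the cosine coincide exactly after normalization:

$$
\hat{X} \cdot \hat{Y}
= \frac{X}{\lVert X \rVert} \cdot \frac{Y}{\lVert Y \rVert}
= \cos\theta \approx 0.998
$$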


  • Bahdanau Attention (2014) was specifically designed to help RNNs, but…

    • The sequential bottleneck, i.e. to compute attention for the 50th word, you must first calculate the hidden states for words 1 through 49, in order

    • No training parallelism: the sequential forward-state architecture means training cannot be parallelized across time steps (see the sketch after this list)

    • The long-term dependency problem, i.e. over a longer context the key bits of knowledge carried forward in the hidden state would vanish
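
A minimal sketch of that bottleneck (assuming a toy RNN cell with made-up weight matrices Wh and Wx, not anything used elsewhere in this notebook): each hidden state needs the previous one, so the loop cannot be parallelized across time steps, whereas self-attention scores the whole sequence in one matrix product.

import numpy as np

T, d = 50, 8                       # sequence length, hidden size
x = np.random.randn(T, d)          # toy input sequence
Wh = np.random.randn(d, d) * 0.1   # hypothetical recurrent weights
Wx = np.random.randn(d, d) * 0.1   # hypothetical input weights

# RNN: hidden state 50 depends on 49, which depends on 48, ...
h = np.zeros(d)
for t in range(T):                 # strictly sequential loop
    h = np.tanh(h @ Wh + x[t] @ Wx)

# Self-attention: all pairwise scores come from a single (T, T) matrix
# product, so every position can be processed in parallel.
scores = x @ x.T / np.sqrt(d)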


Meh…#


import numpy as np
import matplotlib.pyplot as plt

np.set_printoptions(precision=3, suppress=True)
np.random.seed(42)
tokens = ["I", "prefer", "apple", "over", "orange", "because", "it", "is", "less", "acidic"]

X = np.array([
    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # I
    [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # prefer
    [0.0, 0.1, 1.0, 0.2, 0.0, 0.0, 0.1, 0.0],  # apple
    [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],  # over
    [0.0, 0.1, 1.0, 0.2, 0.0, 0.0, 0.3, 0.0],  # orange
    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.1, 0.0],  # because
    [0.0, 0.0, 0.4, 0.0, 0.0, 1.0, 0.1, 0.0],  # it
    [0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0],  # is
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.4, 1.0],  # less
    [0.0, 0.0, 0.2, 0.0, 0.4, 0.0, 1.0, 0.0],  # acidic
])

print("Token embedding shape:", X.shape)
Token embedding shape: (10, 8)
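The next cell defines the building blocks. scaled_dot_product_attention implements the standard formula

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V
$$

where $d_k$ is the query/key width; dividing by $\sqrt{d_k}$ keeps the dot products from growing with the dimension and saturating the softmax.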
def softmax(x, axis=-1):
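    # Subtracting the row-wise max doesn't change the result (softmax is
    # shift-invariant) but keeps exp() from overflowing.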
    x = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)


def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)
    weights = softmax(scores, axis=-1)
    output = weights @ V
    return scores, weights, output


def plot_attention_matrix(weights, tokens, title):
    plt.figure(figsize=(9, 7))
    plt.imshow(weights)
    plt.xticks(range(len(tokens)), tokens, rotation=45, ha="right")
    plt.yticks(range(len(tokens)), tokens)
    plt.colorbar(label="attention weight")
    plt.title(title)
    plt.xlabel("Key/value token being attended to")
    plt.ylabel("Query token doing the attending")

    for i in range(len(tokens)):
        for j in range(len(tokens)):
            plt.text(j, i, f"{weights[i, j]:.2f}", ha="center", va="center", fontsize=8)

    plt.tight_layout()
    plt.show()


def plot_activation(tokens, values, title, ylabel):
    plt.figure(figsize=(9, 4))
    plt.bar(tokens, values)
    plt.xticks(rotation=45, ha="right")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.show()
d_model = X.shape[1]

# Identity projections for teaching.
# In real transformers, Wq, Wk, Wv are learned.
Wq = np.eye(d_model)
Wk = np.eye(d_model)
Wv = np.eye(d_model)

Q = X @ Wq
K = X @ Wk
V = X @ Wv

scores, single_head_weights, single_head_output = scaled_dot_product_attention(Q, K, V)

plot_attention_matrix(
    single_head_weights,
    tokens,
    "Single-head self-attention: one contextual view"
)

print("Input X shape:", X.shape)
print("Q shape:", Q.shape)
print("K shape:", K.shape)
print("V shape:", V.shape)
print("Single-head output shape:", single_head_output.shape)
Input X shape: (10, 8)
Q shape: (10, 8)
K shape: (10, 8)
V shape: (10, 8)
Single-head output shape: (10, 8)
single_head_activation = np.linalg.norm(single_head_output, axis=1)

plot_activation(
    tokens,
    single_head_activation,
    "Activation strength after single-head self-attention",
    "L2 norm of contextualized output"
)

Multi-head attention#


Note: hard-coding each head to focus on a specific feature is only for explanation, so we can visualize distinct attention patterns. In real models these projections are learned automatically, and the attention heads discover useful patterns on their own.
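
For contrast, a minimal sketch (not used below) of what learned projections typically look like: each head gets its own randomly initialized Wq, Wk, Wv of shape (d_model, d_head), and training would adjust the values; here they are simply left random.

rng = np.random.default_rng(0)
n_heads = 4
d_head = d_model // n_heads   # an even split: 8 // 4 = 2 dims per head

learned_heads = [
    {
        "Wq": rng.normal(scale=0.5, size=(d_model, d_head)),
        "Wk": rng.normal(scale=0.5, size=(d_model, d_head)),
        "Wv": rng.normal(scale=0.5, size=(d_model, d_head)),
    }
    for _ in range(n_heads)
]

print("Learned-style head projection shape:", learned_heads[0]["Wq"].shape)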

def make_projection(selected_dims, input_dim=8):
    W = np.zeros((input_dim, len(selected_dims)))

    for out_idx, in_idx in enumerate(selected_dims):
        W[in_idx, out_idx] = 1.0

    return W


heads = {
    "Head 1: fruit/category": {
        "dims": [2],
    },
    "Head 2: preference/comparison": {
        "dims": [1, 3],
    },
    "Head 3: explanation/acidity": {
        "dims": [4, 6, 7],
    },
    "Head 4: pronoun/coreference": {
        "dims": [2, 5],
    },
}
head_outputs = []
head_weights = {}

for head_name, config in heads.items():
    dims = config["dims"]

    Wq = make_projection(dims, input_dim=X.shape[1])
    Wk = make_projection(dims, input_dim=X.shape[1])
    Wv = make_projection(dims, input_dim=X.shape[1])

    Q = X @ Wq
    K = X @ Wk
    V = X @ Wv

    scores, weights, output = scaled_dot_product_attention(Q, K, V)

    head_weights[head_name] = weights
    head_outputs.append(output)

    print(f"{head_name}")
    print(f"  Q/K/V shape: {Q.shape}")
    print(f"  Output shape: {output.shape}")
    print()
Head 1: fruit/category
  Q/K/V shape: (10, 1)
  Output shape: (10, 1)

Head 2: preference/comparison
  Q/K/V shape: (10, 2)
  Output shape: (10, 2)

Head 3: explanation/acidity
  Q/K/V shape: (10, 3)
  Output shape: (10, 3)

Head 4: pronoun/coreference
  Q/K/V shape: (10, 2)
  Output shape: (10, 2)
for head_name, weights in head_weights.items():
    plot_attention_matrix(weights, tokens, head_name)
multi_head_output = np.concatenate(head_outputs, axis=-1)

print("Number of heads:", len(head_outputs))
print("Single-head output shape:", single_head_output.shape)
print("Multi-head concatenated output shape:", multi_head_output.shape)
Number of heads: 4
Single-head output shape: (10, 8)
Multi-head concatenated output shape: (10, 8)
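The hand-picked head widths (1 + 2 + 3 + 2) sum back to the model width of 8, which is why the concatenated result matches the single-head shape. A real transformer block would additionally mix the concatenated heads with a learned output projection; a minimal sketch, with a made-up Wo:

# Hypothetical learned output projection applied after concatenating the heads
Wo = np.random.randn(d_model, d_model) * 0.1
projected_output = multi_head_output @ Wo

print("Projected multi-head output shape:", projected_output.shape)  # (10, 8)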
multi_head_activation = np.linalg.norm(multi_head_output, axis=1)

plot_activation(
    tokens,
    multi_head_activation,
    "Activation strength after multi-head attention",
    "L2 norm after concatenating heads"
)

Comparing single-head self-attention to multi-head attention#

x = np.arange(len(tokens))
width = 0.35

plt.figure(figsize=(11, 5))

plt.bar(
    x - width / 2,
    single_head_activation,
    width,
    label="Single-head self-attention"
)

plt.bar(
    x + width / 2,
    multi_head_activation,
    width,
    label="Multi-head attention"
)

plt.xticks(x, tokens, rotation=45, ha="right")
plt.ylabel("Activation strength")
plt.title("Single-head vs multi-head activation strength")
plt.legend()
plt.tight_layout()
plt.show()