From meaning to vectors and back
This deck is a work in progress…
and always will be
@misc{a-tour-of-genai-jgalego,
title = {Mapping Embeddings: from meaning to vectors and back},
author = {Galego, João},
howpublished = {\url{jgalego.github.io/MappingEmbeddings}},
year = {2024}
}
The slides were created using reveal.js
and the presentation is hosted on GitHub Pages
Just open an issue/PR for this project
github.com/JGalego/MappingEmbeddings
"""
Sends love to Amazon Titan for Embeddings
and gets a bunch of numbers in return
"""
import json
import boto3
# Initialize Bedrock Runtime client
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime.html
bedrock = boto3.client("bedrock-runtime")
# Call the Amazon Titan Embeddings model on "love"
# https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html
response = bedrock.invoke_model(
    modelId="amazon.titan-embed-text-v1",
    body=json.dumps({"inputText": "love"})
)
# Process the model response and print the final result
body = json.loads(response.get('body').read())
print(body['embedding'])
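As a quick sanity check (a minimal follow-up to the call above, assuming it succeeded), the Titan Text Embeddings v1 model returns a 1536-dimensional vector:

# amazon.titan-embed-text-v1 produces 1536-dimensional embeddings
print(len(body['embedding']))  # 1536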
Before we go any further, let's work out some definitions.
A numerical representation of a piece of information
What if you had the embeddings of ALL of Wikipedia?
Neighboring vectors, similar tracks
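Here's a minimal sketch of that idea (hypothetical 3-dimensional track embeddings with purely illustrative values), ranking tracks by cosine similarity to a query track:

import numpy as np

# Hypothetical 3-D embeddings for a few tracks (illustrative values only)
tracks = {
    "Bohemian Rhapsody": np.array([0.9, 0.1, 0.3]),
    "Somebody to Love":  np.array([0.8, 0.2, 0.4]),
    "Giant Steps":       np.array([0.1, 0.9, 0.2]),
}

def cosine_similarity(a, b):
    """Cosine of the angle between two vectors: 1 = same direction, 0 = orthogonal."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Rank every track against the query: neighbors in embedding space score highest
query = tracks["Bohemian Rhapsody"]
for name, vec in tracks.items():
    print(f"{name}: {cosine_similarity(query, vec):.3f}")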
You may have heard of the
$\texttt{1 token} \sim \texttt{4 chars}$ rule of thumb
so let's spend a few tokens on tokenization
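As a quick illustration (a sketch assuming the tiktoken library, which implements OpenAI's BPE tokenizers; any tokenizer would do), you can count tokens yourself and compare against the ~4 characters rule of thumb:

# pip install tiktoken
import tiktoken

# Load a BPE tokenizer (cl100k_base is used by several OpenAI models)
enc = tiktoken.get_encoding("cl100k_base")

text = "Mapping embeddings: from meaning to vectors and back"
tokens = enc.encode(text)

print(tokens)                   # token IDs
print(len(text) / len(tokens))  # characters per token, roughly ~4 for English text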
# pylint: disable=import-error,invalid-name
"""
Train image embeddings model from scratch using contrastive learning.
Adapted from Hadsell, Chopra & LeCun (2006)
https://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
and Underfitted's 'Training a model to generate image embeddings'
https://underfitted.svpino.com/p/training-a-model-to-generate-image
"""
import numpy as np
from keras import datasets, Input, Model
from keras.layers import Dense, Lambda
from keras.metrics import binary_accuracy, BinaryAccuracy
from keras.models import Sequential
from keras.ops import cast, maximum, norm, square
########
# Data #
########
# Load dataset
(X_train, y_train), (X_test, y_test) = datasets.mnist.load_data()
# Reshape and normalize it
X_train = X_train.reshape(-1, 784)
X_test = X_test.reshape(-1, 784)
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
def generate_pairs(X, y):
    """
    Creates a collection of positive and negative image pairs.
    """
    X_pairs = []
    y_pairs = []
    for i in range(len(X)):
        digit = y[i]
        # Create positive match (same digit, label 0)
        positive_digit_index = np.random.choice(np.where(y == digit)[0])
        X_pairs.append([X[i], X[positive_digit_index]])
        y_pairs.append([0])
        # Create negative match (different digit, label 1)
        negative_digit_index = np.random.choice(np.where(y != digit)[0])
        X_pairs.append([X[i], X[negative_digit_index]])
        y_pairs.append([1])
    # Shuffle everything
    indices = np.arange(len(X_pairs))
    np.random.shuffle(indices)
    return np.array(X_pairs)[indices], np.array(y_pairs)[indices]
# Prepare input pairs
X_train_pairs, y_train_pairs = generate_pairs(X_train, y_train)
X_test_pairs, y_test_pairs = generate_pairs(X_test, y_test)
#########
# Model #
#########
# Define inputs
input1 = Input(shape=(784,))
input2 = Input(shape=(784,))
# Build siamese network
network = Sequential(
    [
        Input(shape=(784,)),
        Dense(512, activation="relu"),
        Dense(256, activation="relu"),
        Dense(128, activation=None),
    ]
)
# Define twin branches
twin1 = network(input1)
twin2 = network(input2)
# Define distance
def euclidean_distance(twins):
    """Computes the Euclidean distance between the twin embeddings."""
    a, b = twins
    return norm(a - b, axis=1, keepdims=True)

distance = Lambda(euclidean_distance)([twin1, twin2])
# Set up the model
model = Model(inputs=[input1, input2], outputs=distance)
########
# Loss #
########
def contrastive_loss(y, d):
    """
    Computes the contrastive loss from Hadsell, Chopra & LeCun (2006).
    """
    margin = 1.0
    y = cast(y, d.dtype)
    # Similar pairs (y=0) are pulled together, dissimilar pairs (y=1)
    # are pushed at least `margin` apart
    loss = (1 - y) / 2 * square(d) + y / 2 * square(maximum(0.0, margin - d))
    return loss
# Compile model using contrastive loss
model.compile(
    loss=contrastive_loss,
    optimizer="adam",
    metrics=[binary_accuracy]
)
#########
# Train #
#########
# Fit the model
history = model.fit(
    x=[X_train_pairs[:, 0], X_train_pairs[:, 1]],
    y=y_train_pairs[:],
    validation_data=([X_test_pairs[:, 0], X_test_pairs[:, 1]], y_test_pairs[:]),
    batch_size=32,
    epochs=5,
)
########
# Test #
########
# Generate predictions
predictions = model.predict([X_test_pairs[:, 0], X_test_pairs[:, 1]]) >= 0.5
# Compute model accuracy
accuracy = BinaryAccuracy()
accuracy.update_state(y_test_pairs, predictions.astype(int))
print(f"Accuracy: {accuracy.result().numpy():.2f}")
############
# Generate #
############
# Extract the shared embedding network (the Sequential branch)
embedding_model = model.layers[2]
# Generate embeddings
digits = np.where(y_test == 7)[0]
embeddings = embedding_model.predict(X_test[np.random.choice(digits)].reshape(1, -1))
print(embeddings[0], embeddings.shape)
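To check that the learned 128-dimensional space actually groups digits together, here is a minimal follow-up sketch (assuming scikit-learn and matplotlib are installed, and that the script above has run) that projects a sample of test embeddings to 2-D with t-SNE:

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Embed a random sample of test images with the trained branch
sample = np.random.choice(len(X_test), 1000, replace=False)
sample_embeddings = embedding_model.predict(X_test[sample])

# Project the 128-D embeddings down to 2-D for plotting
points = TSNE(n_components=2).fit_transform(sample_embeddings)

# Points for the same digit should form visible clusters
plt.scatter(points[:, 0], points[:, 1], c=y_test[sample], cmap="tab10", s=5)
plt.colorbar()
plt.show()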
pip install -q sentence-transformers
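A minimal sketch of generating and comparing sentence embeddings with sentence-transformers (the model all-MiniLM-L6-v2 and the example sentences are illustrative choices):

from sentence_transformers import SentenceTransformer, util

# Load a small, general-purpose embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

sentences = [
    "Embeddings map meaning to vectors",
    "Vectors can be mapped back to meaning",
    "The weather in Lisbon is lovely today",
]
embeddings = model.encode(sentences)
print(embeddings.shape)  # (3, 384) for this model

# Cosine similarity between every pair of sentences
print(util.cos_sim(embeddings, embeddings))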
MTEB: Massive Text Embedding Benchmark
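Any embedding model can be scored against MTEB tasks; here is a sketch following the usage in the MTEB README (task choice and output folder are illustrative, and the full benchmark takes much longer than a single task):

# pip install mteb
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Evaluate on a single English STS task
evaluation = MTEB(tasks=["STSBenchmark"])
results = evaluation.run(model, output_folder="results/all-MiniLM-L6-v2")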
"The most important piece of the preprocessing pipeline, from a systems standpoint, is the vector database."Andreessen Horowitz
"In the future, we believe that every database will be a vector database."Google
A simple tool for RAG visualizations
Visualization tool for exploring embeddings
"What we know is a drop,Isaac Newton
what we don't know is an ocean."