Multimodal Embeddings

Text-only RAG works great for documents, but many real-world knowledge bases contain images, diagrams, and screenshots. Multimodal embeddings map both text and images into the same vector space, enabling cross-modal retrieval — searching for images with text queries or searching for text with image queries.

CLIP — Contrastive Language-Image Pre-training

CLIP (from OpenAI) is trained contrastively on image-caption pairs to understand images and text jointly. It maps both modalities into a shared embedding space where similar concepts sit close together, regardless of which modality they came from.

```bash
uv add torch transformers Pillow
```
```python
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np

# Load CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_text_embedding(text: str) -> np.ndarray:
    """Get CLIP embedding for text."""
    inputs = processor(text=text, return_tensors="pt", padding=True)
    with torch.no_grad():
        embedding = model.get_text_features(**inputs)
    # Normalize to unit vector
    embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)
    return embedding.numpy().flatten()

def get_image_embedding(image_path: str) -> np.ndarray:
    """Get CLIP embedding for an image."""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    # Normalize to unit vector
    embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)
    return embedding.numpy().flatten()

# Compute embeddings
text_emb = get_text_embedding("a photo of a cat sitting on a couch")
image_emb = get_image_embedding("cat_photo.jpg")

# Calculate similarity
similarity = np.dot(text_emb, image_emb)
print(f"Text-Image similarity: {similarity:.3f}")
```

Building a Multimodal Search Engine

```python
from pathlib import Path

import numpy as np
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

class MultimodalSearchEngine:
    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.items = []  # Each: {"type": "text"/"image", "content": ..., "embedding": ...}

    def add_text(self, text: str):
        embedding = self._text_embedding(text)
        self.items.append({"type": "text", "content": text, "embedding": embedding})

    def add_image(self, image_path: str):
        embedding = self._image_embedding(image_path)
        self.items.append({"type": "image", "content": image_path, "embedding": embedding})

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Search by text query across both images and text."""
        query_emb = self._text_embedding(query)
        scores = []
        for item in self.items:
            sim = np.dot(query_emb, item["embedding"])
            scores.append((item, sim))
        scores.sort(key=lambda x: x[1], reverse=True)
        return [{"type": s[0]["type"], "content": s[0]["content"], "score": float(s[1])}
                for s in scores[:top_k]]

    def _text_embedding(self, text: str) -> np.ndarray:
        inputs = self.processor(text=text, return_tensors="pt", padding=True)
        with torch.no_grad():
            emb = self.model.get_text_features(**inputs)
        return (emb / emb.norm(p=2, dim=-1, keepdim=True)).numpy().flatten()

    def _image_embedding(self, path: str) -> np.ndarray:
        image = Image.open(path).convert("RGB")
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            emb = self.model.get_image_features(**inputs)
        return (emb / emb.norm(p=2, dim=-1, keepdim=True)).numpy().flatten()

# Usage
engine = MultimodalSearchEngine()

# Add text documents
engine.add_text("Python is a versatile programming language")
engine.add_text("Machine learning models learn from data")

# Add images
for img_path in Path("./images").glob("*.jpg"):
    engine.add_image(str(img_path))

# Search with text
results = engine.search("programming code")
for r in results:
    print(f"[{r['score']:.3f}] {r['type']}: {r['content']}")

Zero-Shot Image Classification

CLIP enables zero-shot classification without training on specific categories:

```python
def zero_shot_classify(image_path: str, categories: list[str]) -> dict:
    """Classify an image into categories without any training."""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(
        text=[f"a photo of a {cat}" for cat in categories],
        images=image,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits_per_image  # Image-to-text similarity
    probs = logits.softmax(dim=1).flatten()

    return {cat: float(prob) for cat, prob in sorted(
        zip(categories, probs), key=lambda x: x[1], reverse=True
    )}

# Classify an image
categories = ["cat", "dog", "bird", "fish", "car", "building"]
result = zero_shot_classify("unknown_photo.jpg", categories)
for cat, prob in result.items():
    print(f"  {cat}: {prob:.2%}")
```

Multimodal RAG

For RAG over documents that mix text and images, extract and index both modalities:

```python
import fitz  # PyMuPDF (uv add pymupdf)

def extract_multimodal_content(pdf_path: str, output_dir: str = "./extracted"):
    """Extract text and images from a PDF for multimodal RAG."""
    Path(output_dir).mkdir(exist_ok=True)
    doc = fitz.open(pdf_path)

    chunks = []
    for page_num, page in enumerate(doc):
        # Extract text
        text = page.get_text().strip()
        if text:
            chunks.append({
                "type": "text",
                "content": text,
                "source": f"page_{page_num + 1}",
            })

        # Extract images
        for img_index, img in enumerate(page.get_images()):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            img_path = f"{output_dir}/page{page_num + 1}_img{img_index}.{image_ext}"
            with open(img_path, "wb") as f:
                f.write(image_bytes)

            chunks.append({
                "type": "image",
                "content": img_path,
                "source": f"page_{page_num + 1}_image_{img_index}",
            })

    doc.close()
    return chunks

# Build multimodal index from PDF
chunks = extract_multimodal_content("research_paper.pdf")
engine = MultimodalSearchEngine()

for chunk in chunks:
    if chunk["type"] == "text":
        engine.add_text(chunk["content"])
    elif chunk["type"] == "image":
        engine.add_image(chunk["content"])

# Search across both text and images
results = engine.search("neural network architecture diagram")
```

CLIP Limitations

CLIP's image understanding works well for natural images such as photos and simple diagrams, but it struggles with:

  • Text within images (use OCR instead)
  • Very fine-grained distinctions (e.g., two similar chart types)
  • Long documents (CLIP processes a single image, not multi-page PDFs)

For documents with text-heavy images, combine CLIP with OCR for the best results.
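
One way to combine them, sketched below with pytesseract (assumes the Tesseract binary plus `uv add pytesseract`; `add_image_with_ocr` is an illustrative helper, not part of the engine above):

```python
import pytesseract  # assumes the Tesseract OCR binary is installed

def add_image_with_ocr(engine: MultimodalSearchEngine, image_path: str):
    """Illustrative helper: index an image both visually and via its OCR text."""
    engine.add_image(image_path)  # CLIP embedding captures visual content
    ocr_text = pytesseract.image_to_string(Image.open(image_path)).strip()
    if ocr_text:
        # Index the extracted text so text-heavy images match text queries too
        engine.add_text(f"[text from {image_path}] {ocr_text}")
```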

Multimodal Embedding APIs

For production, consider a hosted multimodal embedding API (or a managed deployment of open weights such as CLIP):

| Provider | Model | Dimensions | Supports |
| --- | --- | --- | --- |
| OpenAI | clip-vit (open weights, self-hosted) | 512 | Text + Image |
| Google | multimodalembedding | 1408 | Text + Image + Video |
| Jina | jina-clip-v2 | 1024 | Text + Image |
```python
# Example: Jina CLIP v2 via API
import os

import requests

def jina_multimodal_embed(text: str | None = None, image_url: str | None = None) -> list:
    url = "https://api.jina.ai/v1/embeddings"
    headers = {
        "Authorization": f"Bearer {os.environ['JINA_API_KEY']}",
        "Content-Type": "application/json",
    }
    data = {
        "model": "jina-clip-v2",
        "input": [],
    }
    if text:
        data["input"].append({"text": text})
    if image_url:
        data["input"].append({"image": image_url})

    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()
    return [item["embedding"] for item in response.json()["data"]]
```