
Image Processing

Before feeding images to AI models, you often need to preprocess them — resizing, normalizing, annotating, and extracting regions of interest. This page covers the essential image processing techniques using OpenCV and Pillow.

Setup

bash
uv add opencv-python-headless pillow numpy
python
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

Basic Image Operations

Reading, Writing, and Converting

python
# Read an image with OpenCV (BGR format).
# NOTE: cv2.imread does NOT raise on failure — it returns None; check before use.
img_cv = cv2.imread("photo.jpg")

# Read with PIL (RGB format)
img_pil = Image.open("photo.jpg")

# Convert between formats
img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB) # BGR -> RGB
img_bgr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR) # PIL -> OpenCV

# Save images (format is inferred from the file extension)
cv2.imwrite("output.jpg", img_cv)
img_pil.save("output.png")

# Get image properties.
# NOTE: 3-way unpacking assumes a color image; a grayscale image has a 2-element shape.
height, width, channels = img_cv.shape
print(f"Size: {width}x{height}, Channels: {channels}")

Resizing and Cropping

python
# Resize to specific dimensions.
# NOTE: cv2.resize takes (width, height) — the reverse of numpy's shape order (height, width).
resized = cv2.resize(img_cv, (640, 480))

# Resize maintaining aspect ratio
def resize_with_aspect(image, target_width=None, target_height=None):
h, w = image.shape[:2]
if target_width:
ratio = target_width / w
new_size = (target_width, int(h * ratio))
elif target_height:
ratio = target_height / h
new_size = (int(w * ratio), target_height)
else:
return image
return cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)

# Crop a region (y1:y2, x1:x2).
# NOTE: numpy slicing is row-major (vertical range first) and returns a VIEW that
# shares memory with img_cv — call .copy() if you need an independent crop.
cropped = img_cv[100:400, 200:600]

# Center crop to square
def center_crop(image, size):
    """Crop the largest centered square from *image*, then scale it to size x size."""
    height, width = image.shape[:2]
    side = min(height, width)
    top = (height - side) // 2
    left = (width - side) // 2
    square = image[top:top + side, left:left + side]
    return cv2.resize(square, (size, size))

Drawing Annotations

Bounding Boxes with OpenCV

python
def draw_bounding_box(
    image,
    bbox: tuple[int, int, int, int],  # x1, y1, x2, y2
    label: str = "",
    color: tuple = (0, 255, 0),  # Green in BGR
    thickness: int = 2,
    font_scale: float = 0.6,
):
    """Draw a bounding box with an optional label on an image (in place).

    Args:
        image: BGR image (numpy array) to draw on; modified in place.
        bbox: Box corners as (x1, y1, x2, y2) in absolute pixels.
        label: Optional text drawn on a filled background bar.
        color: Box and label-background color in BGR.
        thickness: Rectangle line thickness in pixels.
        font_scale: OpenCV font scale for the label text.

    Returns:
        The same image array, for call chaining.
    """
    x1, y1, x2, y2 = bbox

    # Draw rectangle
    cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)

    if label:
        # Get text size so the background bar fits the label
        (text_w, text_h), baseline = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1
        )
        bar_h = text_h + baseline + 5

        # When the box touches the top of the image there is no room above
        # it, so the label would be clipped off-screen; draw it inside the
        # box instead.
        bar_top = y1 - bar_h if y1 - bar_h >= 0 else y1

        # Draw label background
        cv2.rectangle(
            image,
            (x1, bar_top),
            (x1 + text_w, bar_top + bar_h),
            color,
            -1,  # Filled rectangle
        )

        # Draw label text (baseline sits just above the bar's bottom edge)
        cv2.putText(
            image,
            label,
            (x1, bar_top + text_h + 2),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (0, 0, 0),  # Black text
            1,
            cv2.LINE_AA,
        )

    return image

# Usage
# NOTE: draw_bounding_box mutates `img` in place; the saved file contains the box.
img = cv2.imread("photo.jpg")
draw_bounding_box(img, (100, 50, 300, 250), "Cat (0.95)", (0, 255, 0))
cv2.imwrite("annotated.jpg", img)

Annotations with Pillow (Cleaner Text)

python
from PIL import Image, ImageDraw, ImageFont

def annotate_image(
    image_path: str,
    annotations: list[dict],
    output_path: str = "annotated.png",
):
    """Annotate an image with labeled bounding boxes using Pillow.

    Args:
        image_path: Path to the input image.
        annotations: List of dicts with keys:
            bbox (required): (x1, y1, x2, y2) in absolute pixels.
            label (optional): text drawn above the box.
            color (optional): any Pillow color spec; defaults to "green".
        output_path: Path to save the annotated image.
    """
    img = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(img)

    # Load a font (fall back to Pillow's built-in bitmap font if not available)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        font_small = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
    except OSError:
        font = ImageFont.load_default()
        font_small = font

    for ann in annotations:
        bbox = ann["bbox"]  # (x1, y1, x2, y2)
        label = ann.get("label", "")
        color = ann.get("color", "green")

        # Draw bounding box
        draw.rectangle(bbox, outline=color, width=3)

        # Draw label on a filled background
        if label:
            # Clamp the label position so boxes near the top edge don't push
            # the text off the image (bbox[1] - 20 can be negative).
            text_origin = (bbox[0], max(bbox[1] - 20, 0))
            text_bbox = draw.textbbox(text_origin, label, font=font_small)
            draw.rectangle(text_bbox, fill=color)
            draw.text(text_origin, label, fill="white", font=font_small)

    img.save(output_path)
    print(f"Annotated image saved to {output_path}")

# Usage
# Each dict needs "bbox"; "label" and "color" are optional (color defaults to green).
annotate_image("photo.jpg", [
    {"bbox": (50, 100, 300, 350), "label": "Person (0.92)", "color": "blue"},
    {"bbox": (400, 200, 550, 400), "label": "Car (0.87)", "color": "red"},
])

Image Preprocessing for AI Models

Standard Preprocessing Pipeline

python
def preprocess_for_vision_model(
    image_path: str,
    target_size: tuple[int, int] = (224, 224),
    normalize: bool = True,
) -> np.ndarray:
    """Preprocess an image file for a vision model.

    Steps:
    1. Read image
    2. Convert to RGB
    3. Resize to target size
    4. Scale pixel values to [0, 1] (optionally ImageNet-normalize)
    5. Add batch dimension

    Args:
        image_path: Path to the image file.
        target_size: Output (width, height), passed straight to cv2.resize.
        normalize: Apply ImageNet channel mean/std normalization.

    Returns:
        float32 array of shape (1, H, W, C) in RGB channel order.

    Raises:
        ValueError: If the image cannot be read.
    """
    # Read (cv2.imread returns None instead of raising on failure)
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Convert BGR to RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Resize
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_LINEAR)

    # Normalize to [0, 1]
    img = img.astype(np.float32) / 255.0

    # ImageNet normalization (common for pretrained models).
    # Keep the statistics in float32: float64 defaults would silently promote
    # the whole array to float64 (double the memory, wrong dtype for most
    # frameworks).
    if normalize:
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        img = (img - mean) / std

    # Add batch dimension: (H, W, C) -> (1, H, W, C)
    img = np.expand_dims(img, axis=0)

    return img

Augmentation for Training

python
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Training augmentation pipeline.
# NOTE: bbox_params makes every transform bbox-aware — call the pipeline as
# train_transform(image=..., bboxes=..., labels=...) so boxes are flipped and
# rotated together with the pixels.
train_transform = A.Compose([
    A.Resize(640, 640),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.GaussNoise(p=0.1),
    A.Rotate(limit=15, p=0.3),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
], bbox_params=A.BboxParams(format="pascal_voc", label_fields=["labels"]))

# Validation pipeline (no augmentation — deterministic resize + normalize only)
val_transform = A.Compose([
    A.Resize(640, 640),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
], bbox_params=A.BboxParams(format="pascal_voc", label_fields=["labels"]))
Bounding Box Formats

Different models use different bbox formats — always check which one your model expects:

  • Pascal VOC: (x_min, y_min, x_max, y_max) — absolute pixels
  • COCO: (x_min, y_min, width, height) — absolute pixels
  • YOLO: (x_center, y_center, width, height) — normalized [0, 1]
  • xyxy: Same as Pascal VOC

Batch Processing

python
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

def batch_process_images(
    input_dir: str,
    output_dir: str,
    process_fn,
    max_workers: int = 4,
):
    """Process all images in a directory in parallel.

    Args:
        input_dir: Directory containing the input images.
        output_dir: Directory for outputs (created if missing); each output
            keeps its input filename.
        process_fn: Callable taking an image path (str) and returning either
            a numpy array (saved with OpenCV) or a PIL Image (saved with
            Pillow). Any other return type is reported and counted as a
            failure.
        max_workers: Number of worker threads.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tiff"}
    images = [f for f in input_path.iterdir() if f.suffix.lower() in image_extensions]

    def process_one(img_file):
        # Returns True only when an output file was actually written.
        try:
            result = process_fn(str(img_file))
            output_file = output_path / img_file.name
            if isinstance(result, np.ndarray):
                cv2.imwrite(str(output_file), result)
            elif isinstance(result, Image.Image):
                result.save(str(output_file))
            else:
                # Previously an unrecognized result type was counted as a
                # success even though nothing was saved — report it instead.
                print(f"Skipping {img_file}: unsupported result type {type(result).__name__}")
                return False
            return True
        except Exception as e:
            print(f"Error processing {img_file}: {e}")
            return False

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process_one, images))

    print(f"Processed {sum(results)}/{len(images)} images successfully")

# Usage: resize all images to 640x640.
# NOTE: cv2.imread returns None for unreadable files, making cv2.resize raise
# inside the worker; batch_process_images catches that and counts the file as failed.
batch_process_images(
    "raw_images/",
    "processed_images/",
    lambda p: cv2.resize(cv2.imread(p), (640, 640)),
)
Memory Management

When processing large batches of images, be mindful of memory:

  • Don't load all images into memory at once
  • Use generators instead of lists where possible
  • Call gc.collect() between large batches if needed
  • Consider using memory-mapped files for very large datasets