
Image Processing

Before feeding images to AI models, you often need to preprocess them — resizing, normalizing, annotating, and extracting regions of interest. This page covers the essential image processing techniques using OpenCV and Pillow.

Setup

bash
uv add opencv-python-headless pillow numpy
python
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

Basic Image Operations

Reading, Writing, and Converting

python
# Read an image with OpenCV (BGR format).
# NOTE: cv2.imread does NOT raise on failure — it returns None; check before use.
img_cv = cv2.imread("photo.jpg")

# Read with PIL (RGB format)
img_pil = Image.open("photo.jpg")

# Convert between formats
img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB) # BGR -> RGB
img_bgr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR) # PIL -> OpenCV

# Save images (format is inferred from the file extension)
cv2.imwrite("output.jpg", img_cv)
img_pil.save("output.png")

# Get image properties.
# NOTE: 3-way unpacking assumes a color image; a grayscale image has a 2-element shape.
height, width, channels = img_cv.shape
print(f"Size: {width}x{height}, Channels: {channels}")

Resizing and Cropping

python
# Resize to specific dimensions.
# NOTE: cv2.resize takes (width, height) — the reverse of numpy's shape order (height, width).
resized = cv2.resize(img_cv, (640, 480))

# Resize maintaining aspect ratio
def resize_with_aspect(image, target_width=None, target_height=None):
h, w = image.shape[:2]
if target_width:
ratio = target_width / w
new_size = (target_width, int(h * ratio))
elif target_height:
ratio = target_height / h
new_size = (int(w * ratio), target_height)
else:
return image
return cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)

# Crop a region (y1:y2, x1:x2).
# NOTE: numpy slicing is row-major (vertical range first) and returns a VIEW that
# shares memory with img_cv — call .copy() if you need an independent crop.
cropped = img_cv[100:400, 200:600]

# Center crop to square
def center_crop(image, size):
    """Crop the largest centered square from *image*, then scale it to size x size."""
    height, width = image.shape[:2]
    side = min(height, width)
    top = (height - side) // 2
    left = (width - side) // 2
    square = image[top:top + side, left:left + side]
    return cv2.resize(square, (size, size))

Drawing Annotations

Bounding Boxes with OpenCV

python
def draw_bounding_box(
    image,
    bbox: tuple[int, int, int, int],  # x1, y1, x2, y2
    label: str = "",
    color: tuple = (0, 255, 0),  # Green in BGR
    thickness: int = 2,
    font_scale: float = 0.6,
):
    """Draw a bounding box with an optional label on an image (in place).

    Args:
        image: BGR image (numpy array) to draw on; modified in place.
        bbox: Box corners as (x1, y1, x2, y2) in absolute pixels.
        label: Optional text drawn on a filled background bar.
        color: Box and label-background color in BGR.
        thickness: Rectangle line thickness in pixels.
        font_scale: OpenCV font scale for the label text.

    Returns:
        The same image array, for call chaining.
    """
    x1, y1, x2, y2 = bbox

    # Draw rectangle
    cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)

    if label:
        # Get text size so the background bar fits the label
        (text_w, text_h), baseline = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1
        )
        bar_h = text_h + baseline + 5

        # When the box touches the top of the image there is no room above
        # it, so the label would be clipped off-screen; draw it inside the
        # box instead.
        bar_top = y1 - bar_h if y1 - bar_h >= 0 else y1

        # Draw label background
        cv2.rectangle(
            image,
            (x1, bar_top),
            (x1 + text_w, bar_top + bar_h),
            color,
            -1,  # Filled rectangle
        )

        # Draw label text (baseline sits just above the bar's bottom edge)
        cv2.putText(
            image,
            label,
            (x1, bar_top + text_h + 2),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (0, 0, 0),  # Black text
            1,
            cv2.LINE_AA,
        )

    return image

# Usage
# NOTE: draw_bounding_box mutates `img` in place; the saved file contains the box.
img = cv2.imread("photo.jpg")
draw_bounding_box(img, (100, 50, 300, 250), "Cat (0.95)", (0, 255, 0))
cv2.imwrite("annotated.jpg", img)

Annotations with Pillow (Cleaner Text)

python
from PIL import Image, ImageDraw, ImageFont

def annotate_image(
    image_path: str,
    annotations: list[dict],
    output_path: str = "annotated.png",
):
    """Annotate an image with labeled bounding boxes using Pillow.

    Args:
        image_path: Path to the input image.
        annotations: List of dicts with keys:
            bbox (required): (x1, y1, x2, y2) in absolute pixels.
            label (optional): text drawn above the box.
            color (optional): any Pillow color spec; defaults to "green".
        output_path: Path to save the annotated image.
    """
    img = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(img)

    # Load a font (fall back to Pillow's built-in bitmap font if not available)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        font_small = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
    except OSError:
        font = ImageFont.load_default()
        font_small = font

    for ann in annotations:
        bbox = ann["bbox"]  # (x1, y1, x2, y2)
        label = ann.get("label", "")
        color = ann.get("color", "green")

        # Draw bounding box
        draw.rectangle(bbox, outline=color, width=3)

        # Draw label on a filled background
        if label:
            # Clamp the label position so boxes near the top edge don't push
            # the text off the image (bbox[1] - 20 can be negative).
            text_origin = (bbox[0], max(bbox[1] - 20, 0))
            text_bbox = draw.textbbox(text_origin, label, font=font_small)
            draw.rectangle(text_bbox, fill=color)
            draw.text(text_origin, label, fill="white", font=font_small)

    img.save(output_path)
    print(f"Annotated image saved to {output_path}")

# Usage
# Each dict needs "bbox"; "label" and "color" are optional (color defaults to green).
annotate_image("photo.jpg", [
    {"bbox": (50, 100, 300, 350), "label": "Person (0.92)", "color": "blue"},
    {"bbox": (400, 200, 550, 400), "label": "Car (0.87)", "color": "red"},
])

Image Preprocessing for AI Models

Standard Preprocessing Pipeline

python
def preprocess_for_vision_model(
    image_path: str,
    target_size: tuple[int, int] = (224, 224),
    normalize: bool = True,
) -> np.ndarray:
    """Preprocess an image file for a vision model.

    Steps:
    1. Read image
    2. Convert to RGB
    3. Resize to target size
    4. Scale pixel values to [0, 1] (optionally ImageNet-normalize)
    5. Add batch dimension

    Args:
        image_path: Path to the image file.
        target_size: Output (width, height), passed straight to cv2.resize.
        normalize: Apply ImageNet channel mean/std normalization.

    Returns:
        float32 array of shape (1, H, W, C) in RGB channel order.

    Raises:
        ValueError: If the image cannot be read.
    """
    # Read (cv2.imread returns None instead of raising on failure)
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Convert BGR to RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Resize
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_LINEAR)

    # Normalize to [0, 1]
    img = img.astype(np.float32) / 255.0

    # ImageNet normalization (common for pretrained models).
    # Keep the statistics in float32: float64 defaults would silently promote
    # the whole array to float64 (double the memory, wrong dtype for most
    # frameworks).
    if normalize:
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        img = (img - mean) / std

    # Add batch dimension: (H, W, C) -> (1, H, W, C)
    img = np.expand_dims(img, axis=0)

    return img

Augmentation for Training

python
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Training augmentation pipeline.
# NOTE: bbox_params makes every transform bbox-aware — call the pipeline as
# train_transform(image=..., bboxes=..., labels=...) so boxes are flipped and
# rotated together with the pixels.
train_transform = A.Compose([
    A.Resize(640, 640),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.GaussNoise(p=0.1),
    A.Rotate(limit=15, p=0.3),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
], bbox_params=A.BboxParams(format="pascal_voc", label_fields=["labels"]))

# Validation pipeline (no augmentation — deterministic resize + normalize only)
val_transform = A.Compose([
    A.Resize(640, 640),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
], bbox_params=A.BboxParams(format="pascal_voc", label_fields=["labels"]))
Bounding Box Formats

Different models use different bbox formats — always check which one your model expects:

  • Pascal VOC: (x_min, y_min, x_max, y_max) — absolute pixels
  • COCO: (x_min, y_min, width, height) — absolute pixels
  • YOLO: (x_center, y_center, width, height) — normalized [0, 1]
  • xyxy: Same as Pascal VOC

Batch Processing

python
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

def batch_process_images(
    input_dir: str,
    output_dir: str,
    process_fn,
    max_workers: int = 4,
):
    """Process all images in a directory in parallel.

    Args:
        input_dir: Directory containing the input images.
        output_dir: Directory for outputs (created if missing); each output
            keeps its input filename.
        process_fn: Callable taking an image path (str) and returning either
            a numpy array (saved with OpenCV) or a PIL Image (saved with
            Pillow). Any other return type is reported and counted as a
            failure.
        max_workers: Number of worker threads.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tiff"}
    images = [f for f in input_path.iterdir() if f.suffix.lower() in image_extensions]

    def process_one(img_file):
        # Returns True only when an output file was actually written.
        try:
            result = process_fn(str(img_file))
            output_file = output_path / img_file.name
            if isinstance(result, np.ndarray):
                cv2.imwrite(str(output_file), result)
            elif isinstance(result, Image.Image):
                result.save(str(output_file))
            else:
                # Previously an unrecognized result type was counted as a
                # success even though nothing was saved — report it instead.
                print(f"Skipping {img_file}: unsupported result type {type(result).__name__}")
                return False
            return True
        except Exception as e:
            print(f"Error processing {img_file}: {e}")
            return False

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process_one, images))

    print(f"Processed {sum(results)}/{len(images)} images successfully")

# Usage: resize all images to 640x640.
# NOTE: cv2.imread returns None for unreadable files, making cv2.resize raise
# inside the worker; batch_process_images catches that and counts the file as failed.
batch_process_images(
    "raw_images/",
    "processed_images/",
    lambda p: cv2.resize(cv2.imread(p), (640, 640)),
)
Memory Management

When processing large batches of images, be mindful of memory:

  • Don't load all images into memory at once
  • Use generators instead of lists where possible
  • Call gc.collect() between large batches if needed
  • Consider using memory-mapped files for very large datasets