Skip to main content

Grounding DINO

Grounding DINO is an open-vocabulary object detection model that can detect any object described in natural language. Unlike traditional detectors that are limited to a fixed set of categories (like COCO's 80 classes), Grounding DINO lets you specify what to detect using text prompts.

How It Works

Grounding DINO combines:

  1. DINO — a DETR-style transformer object detector (DETR with Improved deNoising anchOr boxes), not to be confused with the self-supervised DINO backbone
  2. Grounding — text-image feature fusion that aligns language descriptions with visual regions

This means you can say "find the red fire extinguisher" and it will locate it, even though no training dataset explicitly has a "red fire extinguisher" class.

code
Text Prompt: "dog . cat . person"


┌─────────────────┐
│ Text Encoder │ (BERT)
│ (language │
│ features) │
└────────┬────────┘


┌─────────────────┐ ┌─────────────────┐
│ Feature Fusion │◄────│ Image Encoder │
│ (cross-attention)│ │ (Swin-T/B) │
└────────┬────────┘ └─────────────────┘


┌─────────────────┐
│ Detection Head │
│ (boxes + │
│ scores) │
└─────────────────┘

Installation

bash
# Install Grounding DINO
pip install groundingdino-py

# Or install from source
git clone https://github.com/IDEA-Research/GroundingDINO.git
cd GroundingDINO
pip install -e .

Basic Object Detection

python
from groundingdino.util.inference import load_model, predict
from groundingdino.util.box_ops import box_cxcywh_to_xyxy
import torch
import cv2
import numpy as np

# Load the model once at module import time.
# First argument is the model config (this one selects the Swin-T backbone);
# second is the checkpoint path — the weights file must be downloaded separately.
model = load_model(
"GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
"weights/groundingdino_swint_ogc.pth",
)

def detect_objects(
    image_path: str,
    text_prompt: str,
    box_threshold: float = 0.35,
    text_threshold: float = 0.25,
) -> list[dict]:
    """Detect objects in an image using natural language descriptions.

    Args:
        image_path: Path to the image file.
        text_prompt: Dot-separated object descriptions (e.g., "dog . cat . car").
        box_threshold: Minimum confidence for box predictions.
        text_threshold: Minimum confidence for text-image matching.

    Returns:
        List of detections, each a dict with "bbox" (x1, y1, x2, y2 absolute
        pixel coordinates), "score" (rounded float), and "label" (matched phrase).

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # cv2.imread returns None (no exception) for a missing or corrupt file;
    # fail loudly here instead of crashing inside cvtColor.
    image_source = cv2.imread(image_path)
    if image_source is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image_source, cv2.COLOR_BGR2RGB)

    # NOTE(review): groundingdino's predict() typically expects the normalized
    # tensor produced by groundingdino.util.inference.load_image, not a raw
    # numpy array — confirm preprocessing against your installed version.
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
    )

    # The model outputs normalized (cx, cy, w, h); scale to absolute xyxy pixels.
    h, w, _ = image_source.shape
    boxes_xyxy = box_cxcywh_to_xyxy(boxes) * torch.Tensor([w, h, w, h])

    results = []
    for box, score, phrase in zip(boxes_xyxy, logits, phrases):
        x1, y1, x2, y2 = box.int().tolist()
        results.append({
            "bbox": (x1, y1, x2, y2),
            "score": round(score.item(), 3),
            "label": phrase,
        })

    return results

# Usage: detect specific objects
detections = detect_objects("street.jpg", "person . car . traffic light . bicycle")
for item in detections:
    label, score, bbox = item["label"], item["score"], item["bbox"]
    print(f"{label}: {score:.2f} at {bbox}")

Visualizing Detections

python
def visualize_detections(
    image_path: str,
    detections: list[dict],
    output_path: str = "detections.jpg",
):
    """Draw detection boxes and labels on the image and save the result.

    Args:
        image_path: Path to the original image.
        detections: Output of detect_objects — dicts with "bbox", "score", "label".
        output_path: Where to write the annotated image.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising on a bad path.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # BGR color per known label; anything else falls back to red.
    colors = {
        "person": (0, 255, 0),  # Green
        "car": (255, 0, 0),  # Blue
        "dog": (0, 165, 255),  # Orange
    }
    default_color = (0, 0, 255)  # Red

    for det in detections:
        x1, y1, x2, y2 = det["bbox"]
        label = det["label"]
        score = det["score"]
        color = colors.get(label, default_color)

        # Draw bounding box
        cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)

        # Filled background behind the label text. Clamp the top so the text
        # box stays inside the image when a detection touches the top edge
        # (otherwise the label would be drawn off-canvas).
        text = f"{label} {score:.2f}"
        (tw, th), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
        top = max(y1 - th - 6, 0)
        cv2.rectangle(image, (x1, top), (x1 + tw, top + th + 6), color, -1)
        cv2.putText(
            image, text, (x1, top + th + 2),
            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1,
        )

    cv2.imwrite(output_path, image)
    print(f"Saved to {output_path}")

# Usage: run detection, then render the boxes to output.jpg
source = "street.jpg"
detections = detect_objects(source, "person . car")
visualize_detections(source, detections, "output.jpg")

Combining with SAM for Segmentation

Grounding DINO finds bounding boxes; SAM (Segment Anything Model) converts those boxes into precise segmentation masks:

python
from segment_anything import sam_model_registry, SamPredictor

def detect_and_segment(image_path: str, text_prompt: str, device: str = "cuda"):
    """Detect objects with Grounding DINO, then segment each one with SAM.

    Args:
        image_path: Path to the image file.
        text_prompt: Dot-separated object descriptions.
        device: Torch device for SAM (default "cuda"); pass "cpu" on
            machines without a GPU — the original hard-coded "cuda".

    Returns:
        List of dicts with "label", "score", "bbox", and the best binary
        "mask" SAM produced for that box.
    """
    # Step 1: find boxes with Grounding DINO.
    detections = detect_objects(image_path, text_prompt)

    # Step 2: load SAM. NOTE(review): loading the vit_h checkpoint is the
    # dominant cost here — hoist it out of this function for repeated calls.
    sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h.pth")
    sam.to(device)
    predictor = SamPredictor(sam)

    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    predictor.set_image(image_rgb)

    # Step 3: prompt SAM with each detection box; keep its best-scoring mask.
    results = []
    for det in detections:
        box = np.array(det["bbox"])

        masks, scores, _ = predictor.predict(
            box=box,
            multimask_output=True,
        )

        results.append({
            "label": det["label"],
            "score": det["score"],
            "bbox": det["bbox"],
            "mask": masks[scores.argmax()],
        })

    return results

Practical Applications

Document Signature Detection

python
def detect_signatures(document_path: str) -> list[dict]:
    """Locate signatures in a document image via open-vocabulary detection."""
    # Multiple synonyms raise recall; thresholds are lowered because
    # signatures tend to be small, low-contrast targets.
    prompt = "signature . handwritten signature . sign"
    return detect_objects(
        document_path,
        text_prompt=prompt,
        box_threshold=0.25,
        text_threshold=0.20,
    )

Product Defect Detection

python
def detect_defects(product_image: str) -> list[dict]:
    """Find visible defects (scratches, dents, cracks, etc.) in a product photo."""
    # One prompt listing the defect vocabulary; moderate thresholds balance
    # recall against false positives on textured surfaces.
    defect_prompt = "scratch . dent . crack . discoloration . stain"
    return detect_objects(
        product_image,
        text_prompt=defect_prompt,
        box_threshold=0.30,
        text_threshold=0.25,
    )
Prompt Engineering for Detection
  • Separate categories with . (dot with spaces) — this is how Grounding DINO splits categories
  • Be specific: "golden retriever" works better than just "dog"
  • Use multiple synonyms: "signature . handwritten text . autograph" increases recall
  • Lower box_threshold (e.g., 0.25) to find more objects, raise it (e.g., 0.5) for higher precision
Performance Tips
  • Grounding DINO runs on GPU (SwinT is fast; SwinB is more accurate but slower)
  • Process images at reasonable resolutions (640-800px longest side)
  • Batch multiple text prompts into a single call rather than running the model multiple times
  • For production, consider exporting to ONNX or TensorRT for faster inference