When and how to build your own computer vision models
By the end of this session, you will:
- Know when to call a cloud vision API and when to build a custom model
- Apply transfer learning (feature extraction vs. fine-tuning) with PyTorch
- Train a custom image classifier, augment its data, and evaluate it with the right metrics
A 3-hour journey through custom model development
From off-the-shelf to fully custom
Decision factors
Factor-by-factor comparison
| Factor | Cloud API | Custom Model |
|---|---|---|
| Time to deploy | Hours to days | Weeks to months |
| Data requirements | None | 100s to 1000s labeled images |
| Cost structure | Per-call (scales with usage) | Fixed (training + infra) |
| Expertise | API integration | ML engineering |
| Customization | Limited | Full control |
| Privacy | Data sent to cloud | Data stays on-premise |
When custom becomes cheaper
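The table above implies a break-even point: per-call fees grow with usage, while a custom model's costs are mostly fixed. A back-of-envelope sketch of that crossover; all prices below are illustrative assumptions, not vendor quotes, and the helper name is ours:

```python
# Break-even between per-call API pricing and a custom model.
# Every number here is a made-up assumption for illustration.
api_cost_per_call = 0.0015    # assumed $/image for a cloud vision API
custom_fixed_cost = 8_000.0   # assumed one-off training + engineering cost ($)
custom_monthly_infra = 300.0  # assumed hosting/inference cost ($/month)

def breakeven_calls_per_month(months: int = 12) -> float:
    """Monthly call volume above which the custom model is cheaper over `months`."""
    total_fixed = custom_fixed_cost + custom_monthly_infra * months
    return total_fixed / (api_cost_per_call * months)

print(f"Break-even: {breakeven_calls_per_month():,.0f} calls/month over a year")
```

With these made-up numbers the crossover sits around 640k calls per month; the point is the shape of the trade-off, not the exact figure.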
Standing on the shoulders of giants
Neural networks learn hierarchically
Early layers learn generic features such as edges and textures; deeper layers learn increasingly task-specific ones. That generic base is why a backbone pre-trained on ImageNet transfers to new tasks.
Two main strategies
Feature extraction freezes the pre-trained backbone and trains only a new classification head; fine-tuning also updates some or all of the backbone weights.
When to use each
| Aspect | Feature Extraction | Fine-Tuning |
|---|---|---|
| Training time | Fast (minutes) | Slower (hours) |
| Data needed | Small (50-200/class) | Larger (200-1000/class) |
| Domain similarity | Similar to ImageNet | Different from ImageNet |
| Risk of overfitting | Lower | Higher |
| Potential accuracy | Good | Better |
Choosing your backbone
| Model | Size | Speed | Accuracy | Best For |
|---|---|---|---|---|
| MobileNetV3 | 5M params | Very Fast | Good | Mobile/Edge |
| EfficientNet-B0 | 5M params | Fast | Very Good | Balanced |
| ResNet-50 | 25M params | Medium | Excellent | General |
| ViT-B/16 | 86M params | Slow | Best | Max accuracy |
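All four backbones in the table ship with torchvision. A sketch of a loader helper, assuming torchvision >= 0.13's weights enums; the `load_backbone` function and its selection are our own illustration, and note that each architecture exposes its classification head under a different attribute (`fc` for ResNet, `classifier` for MobileNet/EfficientNet, `heads` for ViT):

```python
from torchvision import models

def load_backbone(name: str):
    """Load a pre-trained backbone from the comparison table (illustrative helper)."""
    if name == "mobilenet_v3":
        return models.mobilenet_v3_large(weights=models.MobileNet_V3_Large_Weights.IMAGENET1K_V2)
    if name == "efficientnet_b0":
        return models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
    if name == "resnet50":
        return models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    if name == "vit_b_16":
        return models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)
    raise ValueError(f"Unknown backbone: {name}")
```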
What we'll build
Standard ImageFolder structure
```
data/
  train/
    class_a/
      image001.jpg
      image002.jpg
      ...
    class_b/
      image001.jpg
      ...
  val/
    class_a/
      ...
    class_b/
      ...
```
```python
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

def create_dataloaders(data_dir: str, batch_size: int = 32):
    """Create train and validation dataloaders."""
    # Standard ImageNet normalization (required for pre-trained models)
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize
    ])

    # Load datasets from folder structure
    train_data = datasets.ImageFolder(
        f"{data_dir}/train",
        transform=train_transform
    )
    val_data = datasets.ImageFolder(
        f"{data_dir}/val",
        transform=val_transform
    )

    # Create dataloaders
    train_loader = DataLoader(
        train_data, batch_size=batch_size,
        shuffle=True, num_workers=4
    )
    val_loader = DataLoader(
        val_data, batch_size=batch_size,
        shuffle=False, num_workers=4
    )
    return train_loader, val_loader, train_data.classes

# Usage
train_loader, val_loader, classes = create_dataloaders("data/defects")
print(f"Classes: {classes}")  # e.g. ['dent', 'good', 'scratch'] (ImageFolder sorts alphabetically)
```
Loading and modifying a pre-trained model
```python
import torch.nn as nn
from torchvision import models

def create_model(num_classes: int, freeze_base: bool = True):
    """Create transfer learning model."""
    # Load pre-trained ResNet-50
    model = models.resnet50(
        weights=models.ResNet50_Weights.IMAGENET1K_V2
    )

    # Freeze base layers first, so the head we add next stays trainable
    if freeze_base:
        for param in model.parameters():
            param.requires_grad = False

    # Replace classification head; fresh layers default to requires_grad=True
    num_features = model.fc.in_features  # 2048 for ResNet-50
    model.fc = nn.Linear(num_features, num_classes)

    return model
```
What we're modifying
Only the final fully-connected layer (model.fc) is replaced; the convolutional backbone keeps its ImageNet weights.
```python
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    """Train the model."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    # Optimize only the parameters that are not frozen
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=lr
    )

    for epoch in range(epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
        val_acc = correct / total

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Train Loss: {running_loss/len(train_loader):.4f}")
        print(f"  Val Loss: {val_loss/len(val_loader):.4f}")
        print(f"  Val Acc: {val_acc:.2%}")
    return model
```
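The optimizer above only sees unfrozen parameters, which matches feature extraction (freeze_base=True). For the fine-tuning strategy from the comparison table, a common pattern is to unfreeze the last backbone stage and give it a smaller learning rate than the fresh head. A minimal sketch, assuming ResNet-50; the function name and learning rates are illustrative, not part of the pipeline above:

```python
import torch
import torch.nn as nn
from torchvision import models

def create_finetune_model(num_classes: int):
    """Fine-tuning sketch: train the last ResNet stage plus a new head."""
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    for param in model.parameters():
        param.requires_grad = False            # freeze everything first
    for param in model.layer4.parameters():
        param.requires_grad = True             # then unfreeze the last stage
    model.fc = nn.Linear(model.fc.in_features, num_classes)  # fresh, trainable head

    # Discriminative learning rates: pre-trained layers move slowly,
    # the randomly initialized head moves faster
    optimizer = torch.optim.Adam([
        {"params": model.layer4.parameters(), "lr": 1e-4},
        {"params": model.fc.parameters(), "lr": 1e-3},
    ])
    return model, optimizer
```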
```python
def train_defect_classifier():
    """Complete training pipeline for defect detection."""
    # Create data loaders
    train_loader, val_loader, classes = create_dataloaders(
        "data/defects", batch_size=32
    )
    print(f"Classes: {classes}")
    print(f"Training samples: {len(train_loader.dataset)}")

    # Initialize model (feature extraction mode)
    model = create_model(
        num_classes=len(classes),
        freeze_base=True
    )

    # Train
    model = train_model(model, train_loader, val_loader, epochs=15)

    # Save model
    torch.save(model.state_dict(), "defect_classifier.pth")
    print("Model saved!")
    return model, classes

# Run training
model, classes = train_defect_classifier()
```
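Once the weights are saved, inference reuses the validation-style preprocessing. A hypothetical predict_image helper; the name and interface are our own sketch, not part of the pipeline above:

```python
import torch
from PIL import Image
from torchvision import transforms

def predict_image(model, image_path: str, classes, device="cpu"):
    """Classify a single image with the trained model (illustrative helper)."""
    # Same preprocessing as the validation transform above
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    model.eval()
    batch = preprocess(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        probs = torch.softmax(model(batch), dim=1)[0]
    idx = int(probs.argmax())
    return classes[idx], float(probs[idx])

# Usage: label, confidence = predict_image(model, "sample.jpg", classes)
```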
Artificially expand your dataset
Fast, flexible augmentation library
```python
import albumentations as A
from albumentations.pytorch import ToTensorV2

def create_augmentation_pipeline(task_type: str = "general"):
    """Create augmentation pipeline based on task."""
    if task_type == "general":
        return A.Compose([
            A.RandomResizedCrop(height=224, width=224, scale=(0.8, 1.0)),
            A.HorizontalFlip(p=0.5),
            A.Rotate(limit=15, p=0.5),
            A.ColorJitter(
                brightness=0.2, contrast=0.2,
                saturation=0.2, p=0.5
            ),
            A.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            ),
            ToTensorV2()
        ])
```
Match augmentations to your use case
```python
    # ... continuation of create_augmentation_pipeline()
    if task_type == "defect_detection":
        # Manufacturing: preserve size, simulate sensor noise
        return A.Compose([
            A.Resize(224, 224),
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.RandomBrightnessContrast(brightness_limit=0.3, p=0.5),
            A.GaussNoise(var_limit=(10, 50), p=0.3),
            A.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ])
    elif task_type == "document":
        # Documents: perspective, lighting
        return A.Compose([
            A.Perspective(scale=(0.05, 0.1), p=0.5),
            A.Affine(rotate=(-5, 5), shear=(-5, 5), p=0.5),
            A.RandomBrightnessContrast(p=0.5),
            A.GaussianBlur(blur_limit=(3, 5), p=0.3),
            A.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ])
```
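Albumentations operates on NumPy arrays and is invoked as transform(image=...)["image"], while torchvision's ImageFolder yields PIL images, so the two need a small adapter. A minimal sketch; the AlbumentationsImageFolder class is our own illustration:

```python
import numpy as np
from torch.utils.data import Dataset
from torchvision import datasets

class AlbumentationsImageFolder(Dataset):
    """Adapter: ImageFolder yields PIL images, Albumentations wants NumPy arrays."""
    def __init__(self, root: str, transform):
        self.inner = datasets.ImageFolder(root)
        self.transform = transform
        self.classes = self.inner.classes

    def __len__(self):
        return len(self.inner)

    def __getitem__(self, idx):
        image, label = self.inner[idx]                # PIL image
        image = np.array(image)                       # HWC uint8 array
        image = self.transform(image=image)["image"]  # augmented tensor
        return image, label
```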
Train custom models without writing code
| Platform | Strengths | Best For |
|---|---|---|
| Google Vertex AI | Auto architecture, one-click deploy | GCP users, production |
| AWS Rekognition Custom Labels | Few-shot (10+ images), pay-per-use | AWS ecosystem |
| Azure Custom Vision | User-friendly, ONNX export | Edge deployment |
| Roboflow | Great annotation, versioning | Object detection |
| Teachable Machine | Free, instant results | Learning, demos |
Measuring what matters
The foundation of classification metrics
|  | Actually + | Actually - |
|---|---|---|
| Predicted + | True Positive (TP) | False Positive (FP) |
| Predicted - | False Negative (FN) | True Negative (TN) |
| Metric | Formula | Use When |
|---|---|---|
| Accuracy | (TP + TN) / Total | Balanced classes |
| Precision | TP / (TP + FP) | False positives costly |
| Recall | TP / (TP + FN) | False negatives costly |
| F1 Score | 2 * (P * R) / (P + R) | Imbalanced classes |
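The formulas above in code form, with a made-up confusion matrix that shows how accuracy can mislead when classes are imbalanced:

```python
def classification_metrics(tp: int, fp: int, fn: int, tn: int) -> dict:
    """Compute the table's metrics from raw confusion-matrix counts."""
    total = tp + fp + fn + tn
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Illustrative counts: 90 TP, 10 FP, 30 FN, 870 TN
print(classification_metrics(90, 10, 30, 870))
# accuracy 0.96 looks great, yet recall is only 0.75 (f1 ~ 0.82)
```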
Different applications need different metrics
Additional metrics for detection tasks
```python
def calculate_iou(box1, box2):
    """Calculate IoU between two boxes [x1, y1, x2, y2]."""
    # Calculate intersection
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    intersection = max(0, x2 - x1) * max(0, y2 - y1)

    # Calculate union
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - intersection
    return intersection / union if union > 0 else 0

# Example: 30x30 intersection (900) over a union of 2300
iou = calculate_iou([10, 10, 50, 50], [20, 20, 60, 60])
print(f"IoU: {iou:.2f}")  # IoU: 0.39
```
Duration: 1.5 hours | Deadline: Before Session 5
Next session: Ethics, Governance & Final Presentations