Leveraging pretrained models and optimizing performance
Works well with small datasets (100s-1000s of images)
Converges much faster than training from scratch
Often achieves higher accuracy than random initialization
Rule of thumb: Always start with transfer learning unless you have millions of labeled images and significant compute resources.
# Keras: load an ImageNet-pretrained backbone for transfer learning.
from tensorflow.keras.applications import (
ResNet50, VGG16, EfficientNetB0, MobileNetV2
)
# Load model with ImageNet weights
base_model = ResNet50(
weights='imagenet', # (#1:Pretrained on ImageNet)
include_top=False, # (#2:Remove classification head)
input_shape=(224, 224, 3)
)
# Check model architecture
base_model.summary()
# Available models: VGG16, VGG19, ResNet50/101/152,
# InceptionV3, EfficientNetB0-B7, MobileNetV2/V3, etc.
# PyTorch: load an ImageNet-pretrained ResNet50 via torchvision.
import torchvision.models as models
# Load pretrained ResNet50
# NOTE: pretrained=True is the legacy flag; the weights= enum below is the current API.
model = models.resnet50(pretrained=True) # (#1:ImageNet weights)
# Modern syntax (PyTorch 2.0+)
from torchvision.models import ResNet50_Weights
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2) # (#2:Specific weights)
# Remove classification head
import torch.nn as nn
# Identity keeps the 2048-dim pooled features flowing through unchanged.
model.fc = nn.Identity() # (#3:Replace final layer)
# Or get features directly
model = models.resnet50(pretrained=True)
features = nn.Sequential(*list(model.children())[:-1]) # (#4:All but last layer)
# timm: search, load, and strip pretrained vision models.
import timm
# List available models
print(timm.list_models('*efficientnet*')) # (#1:Search models)
# Load model with pretrained weights
model = timm.create_model(
'efficientnet_b0',
pretrained=True, # (#2:Download pretrained weights)
num_classes=10 # (#3:Custom number of classes)
)
# Load as feature extractor
model = timm.create_model(
'vit_base_patch16_224',
pretrained=True,
num_classes=0 # (#4:0 removes classifier)
)
# timm has 700+ models: ViT, Swin, ConvNeXt, etc.
timm (PyTorch Image Models) provides the largest collection of pretrained vision models.
# Keras implementation
# Feature extraction: frozen pretrained backbone + new trainable softmax head.
# NOTE(review): assumes `keras`, `layers`, and `num_classes` are defined earlier.
base_model = ResNet50(
weights='imagenet',
include_top=False,
input_shape=(224,224,3)
)
# Freeze base model
base_model.trainable = False
# Add classifier
model = keras.Sequential([
base_model,
layers.GlobalAveragePooling2D(),
layers.Dense(256, activation='relu'),
layers.Dropout(0.5),
layers.Dense(num_classes, activation='softmax')
])
import torch
import torch.nn as nn
from torchvision import models
class FeatureExtractor(nn.Module):
    """Frozen ResNet50 backbone with a small trainable classifier head.

    Only the new head receives gradients; the pretrained weights stay fixed.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Load pretrained backbone
        self.backbone = models.resnet50(pretrained=True)
        # Freeze every pretrained weight so only the new head learns.
        for weight in self.backbone.parameters():  # (#1:Freeze backbone)
            weight.requires_grad = False
        # Swap the 1000-way ImageNet classifier for a task-specific head.
        in_features = self.backbone.fc.in_features
        head = [
            nn.Linear(in_features, 256),  # (#2:New classifier head)
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        ]
        self.backbone.fc = nn.Sequential(*head)

    def forward(self, x):
        """Run the frozen backbone plus the new head on a batch."""
        return self.backbone(x)
# Full fine-tuning: make every layer trainable, but shrink the LR so the
# pretrained features are adapted gently rather than destroyed.
# Keras: Unfreeze all layers
base_model.trainable = True
# Use lower learning rate
model.compile(
optimizer=keras.optimizers.Adam(
learning_rate=1e-5 # Lower LR
),
loss='categorical_crossentropy',
metrics=['accuracy']
)
# PyTorch: All params trainable
for param in model.parameters():
param.requires_grad = True
optimizer = torch.optim.Adam(
model.parameters(),
lr=1e-5
)
Warning: Use learning rate 10-100x smaller than training from scratch to avoid destroying pretrained features.
# Partial fine-tuning: keep generic early features frozen, adapt later layers.
# Keras: Freeze early layers, unfreeze later ones
base_model = ResNet50(weights='imagenet', include_top=False)
# Freeze first N layers
for layer in base_model.layers[:100]: # (#1:Freeze early layers)
layer.trainable = False
for layer in base_model.layers[100:]: # (#2:Unfreeze later layers)
layer.trainable = True
# PyTorch: Freeze by layer name
model = models.resnet50(pretrained=True)
for name, param in model.named_parameters():
if 'layer4' in name or 'fc' in name: # (#3:Unfreeze layer4 and fc)
param.requires_grad = True
else:
param.requires_grad = False # (#4:Freeze other layers)
# Check trainable parameters
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable:,}")
Gradually unfreeze layers during training, starting from the top (closest to output) and moving toward the bottom.
# Progressive unfreezing schedule
def unfreeze_layers(model, epoch):
    """Progressively unfreeze `model` as training advances.

    Schedule: epoch 0 trains only the classifier head (backbone frozen),
    layer4 thaws at epoch 5, layer3 at epoch 10, everything at epoch 15.
    Other epochs leave the flags untouched.
    """

    def _set_trainable(params, flag):
        # Flip requires_grad on every parameter in the iterable.
        for weight in params:
            weight.requires_grad = flag

    if epoch == 0:
        # Warm-up: only the (non-backbone) classifier head learns.
        _set_trainable(model.backbone.parameters(), False)
    elif epoch == 5:
        # Thaw the deepest, most task-specific block first.
        _set_trainable(model.backbone.layer4.parameters(), True)
    elif epoch == 10:
        _set_trainable(model.backbone.layer3.parameters(), True)
    elif epoch == 15:
        # Full fine-tuning from here on.
        _set_trainable(model.parameters(), True)
Which strategy would you recommend for each scenario?
500 chest X-ray images for pneumonia detection. Single GPU, limited time.
Feature extraction, partial, or full fine-tuning?
50,000 product images for e-commerce classification. Multiple GPUs available.
Feature extraction, partial, or full fine-tuning?
200 satellite images (very different from ImageNet). Need maximum accuracy.
Which approach? What LR strategy?
# Discriminative learning rates: smallest LR for generic early layers,
# largest for the freshly initialized head.
# Different learning rates for different layers
model = models.resnet50(pretrained=True)
# Group parameters by layer depth
# NOTE(review): stem parameters outside these groups (e.g. bn1) would not be
# optimized at all — confirm that is intended.
param_groups = [
{'params': model.conv1.parameters(), 'lr': 1e-6}, # (#1:Lowest LR for early layers)
{'params': model.layer1.parameters(), 'lr': 1e-6},
{'params': model.layer2.parameters(), 'lr': 1e-5},
{'params': model.layer3.parameters(), 'lr': 1e-5},
{'params': model.layer4.parameters(), 'lr': 1e-4}, # (#2:Higher LR for later layers)
{'params': model.fc.parameters(), 'lr': 1e-3} # (#3:Highest LR for new head)
]
optimizer = torch.optim.Adam(param_groups)
# Keras equivalent using layer-wise LR multiplier
# Requires custom training loop or optimizer modification
Intuition: Early layers capture generic features that shouldn't change much; later layers need more adaptation.
Correct predictions / Total predictions
TP / (TP + FP) - Exactness
TP / (TP + FN) - Completeness
2 * (P * R) / (P + R) - Harmonic mean
| Metric | Use When | Limitation |
|---|---|---|
| Accuracy | Balanced classes | Misleading with imbalanced data |
| Precision | False positives are costly (spam filter) | Ignores false negatives |
| Recall | False negatives are costly (disease detection) | Ignores false positives |
| F1 Score | Need balance between precision and recall | Assumes equal importance of P and R |
| | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | TP (True Positive) | FN (False Negative) |
| Actual Negative | FP (False Positive) | TN (True Negative) |
Precision = TP / (TP + FP) - "Of all positive predictions, how many were correct?"
Recall = TP / (TP + FN) - "Of all actual positives, how many did we find?"
# Classification evaluation: weighted F1 plus a confusion-matrix heatmap.
# NOTE(review): assumes `model`, `X_test`, and `y_true` are defined earlier.
from sklearn.metrics import (
confusion_matrix, classification_report, f1_score
)
import seaborn as sns
import matplotlib.pyplot as plt
# Predictions
y_pred = model.predict(X_test).argmax(axis=1)
# F1 Score
f1 = f1_score(y_true, y_pred, average='weighted') # (#1:Weighted for imbalanced data)
print(f"F1 Score: {f1:.4f}")
# Confusion Matrix
cm = confusion_matrix(y_true, y_pred) # (#2:Shows prediction distribution)
# Visualize
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') # (#3:Heatmap visualization)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
Given this confusion matrix for a COVID test model:
| | Pred: Negative | Pred: Positive |
|---|---|---|
| Actual: Negative | 850 | 50 |
| Actual: Positive | 20 | 80 |
What is the accuracy?
Formula: (TP+TN)/Total
What is the recall (sensitivity) for COVID+?
Formula: TP/(TP+FN)
Is this model better for screening or confirmation? Why?
Think: Cost of false negatives vs false positives
# Binary ROC/AUC plus a per-class precision/recall/F1 report.
from sklearn.metrics import roc_curve, auc, classification_report
import numpy as np
# Get probability predictions
y_proba = model.predict(X_test) # (#1:Softmax probabilities)
# For binary classification
fpr, tpr, thresholds = roc_curve(y_true, y_proba[:, 1]) # (#2:False/True positive rates)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--') # (#3:Random baseline)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# Classification report (precision, recall, f1 per class)
print(classification_report(y_true, y_pred, target_names=class_names)) # (#4:Full report)
# Multi-class AUC via One-vs-Rest: binarize labels, then score/plot per class.
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
# One-vs-Rest AUC
y_true_bin = label_binarize(y_true, classes=range(num_classes)) # (#1:One-hot encode)
# Calculate AUC
auc_ovr = roc_auc_score(
y_true_bin, y_proba,
multi_class='ovr', # (#2:One-vs-Rest strategy)
average='weighted'
)
print(f"Weighted OvR AUC: {auc_ovr:.4f}")
# Plot ROC for each class
for i in range(num_classes):
fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_proba[:, i])
plt.plot(fpr, tpr, label=f'Class {i} (AUC={auc(fpr, tpr):.2f})') # (#3:Per-class curves)
plt.legend()
plt.show()
Measures overlap between predicted and ground truth boxes.
def calculate_iou(box1, box2):
    """Compute Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        box1, box2: sequences in [x1, y1, x2, y2] format, with (x1, y1)
            the top-left corner and (x2, y2) the bottom-right corner.

    Returns:
        float IoU in [0, 1]. Returns 0.0 when the union area is zero
        (both boxes degenerate) instead of raising ZeroDivisionError.
    """
    # box format: [x1, y1, x2, y2]
    # Corners of the intersection rectangle.
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    # Clamp negative extents to zero when the boxes do not overlap.
    inter = max(0, x2-x1) * max(0, y2-y1)
    area1 = (box1[2]-box1[0]) * (box1[3]-box1[1])
    area2 = (box2[2]-box2[0]) * (box2[3]-box2[1])
    union = area1 + area2 - inter
    # Bug fix: the original divided unconditionally and crashed on
    # zero-area inputs; define IoU of degenerate boxes as 0.
    return inter / union if union > 0 else 0.0
# Using torchmetrics
# Mean Average Precision for detection; `tensor` below are placeholders for
# real torch.Tensor values (boxes in xyxy, confidence scores, class labels).
from torchmetrics.detection import MeanAveragePrecision
metric = MeanAveragePrecision()
# Format: list of dicts
preds = [
{'boxes': tensor, 'scores': tensor, 'labels': tensor}
]
targets = [
{'boxes': tensor, 'labels': tensor}
]
metric.update(preds, targets)
result = metric.compute()
print(f"mAP@0.5: {result['map_50']:.4f}")
print(f"mAP@0.5:0.95: {result['map']:.4f}")
import numpy as np
def dice_coefficient(pred, target, smooth=1e-6):
    """Dice score: 2 * |A ∩ B| / (|A| + |B|), smoothed to avoid 0/0."""
    overlap = np.sum(pred * target)  # (#1:Pixel overlap)
    total_mass = np.sum(pred) + np.sum(target)
    return (2 * overlap + smooth) / (total_mass + smooth)
def iou_score(pred, target, smooth=1e-6):
    """IoU (Jaccard): |A ∩ B| / |A ∪ B|, smoothed to avoid 0/0."""
    overlap = np.sum(pred * target)
    combined = np.sum(pred) + np.sum(target) - overlap  # (#2:Union calculation)
    return (overlap + smooth) / (combined + smooth)
def pixel_accuracy(pred, target):
    """Fraction of positions where prediction equals ground truth."""
    matches = pred == target
    return np.mean(matches)  # (#3:Correct pixels / Total)
# Per-class metrics
def mean_iou(pred, target, num_classes):
    """Macro-averaged IoU: binary IoU per class id, then the mean."""
    per_class = [
        iou_score(pred == c, target == c)  # (#4:IoU per class)
        for c in range(num_classes)
    ]
    return np.mean(per_class)
| Aspect | Dice Coefficient | IoU (Jaccard) |
|---|---|---|
| Formula | 2\|A ∩ B\| / (\|A\| + \|B\|) | \|A ∩ B\| / \|A ∪ B\| |
| Range | 0 to 1 | 0 to 1 |
| Relationship | Dice = 2*IoU / (1 + IoU) | IoU = Dice / (2 - Dice) |
| Values | Always >= IoU | Always <= Dice |
| Use Case | Medical imaging, loss function | Detection, standard benchmark |
Example: If IoU = 0.5, then Dice = 0.667. Perfect overlap gives both = 1.
# Plot learning curves
# Side-by-side train/val loss and accuracy; `history` is assumed to be a
# dict of per-epoch metric lists (e.g. Keras History.history).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Loss curves
axes[0].plot(history['loss'], label='Train')
axes[0].plot(history['val_loss'], label='Val')
axes[0].set_title('Loss')
axes[0].legend()
# Accuracy curves
axes[1].plot(history['accuracy'], label='Train')
axes[1].plot(history['val_accuracy'], label='Val')
axes[1].set_title('Accuracy')
axes[1].legend()
plt.show()
| Pattern | Diagnosis | Solution |
|---|---|---|
| Train high, Val high | Underfitting | More capacity, longer training, lower regularization |
| Train low, Val high | Overfitting | More data, regularization, early stopping |
| Train low, Val low | Good fit | Can try for better performance |
| Val loss increases | Overfitting (late) | Early stopping, reduce epochs |
| Oscillating loss | LR too high | Reduce learning rate |
# Keras
# Dropout regularization: lighter rate after conv features, heavier before output.
from tensorflow.keras import layers
model = keras.Sequential([
layers.Conv2D(64, 3, activation='relu'),
layers.Dropout(0.25), # (#1:25% dropout after conv)
layers.Flatten(),
layers.Dense(256, activation='relu'),
layers.Dropout(0.5), # (#2:50% dropout before output)
layers.Dense(num_classes, activation='softmax')
])
# PyTorch
import torch.nn as nn

class Model(nn.Module):
    """Small CNN with dropout on both the conv and fully-connected stages.

    NOTE(review): `num_classes` is read from an enclosing scope — confirm it
    is defined before this class is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3),
            nn.ReLU(),
            nn.Dropout2d(0.25)  # (#3:Spatial dropout for conv)
        )
        self.classifier = nn.Sequential(
            nn.Linear(64*26*26, 256),
            nn.ReLU(),
            nn.Dropout(0.5),  # (#4:Standard dropout for FC)
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        # Bug fix: the original defined no forward(), so calling the model
        # raised NotImplementedError. Flatten conv features, then classify.
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)
# L2 Regularization (Weight Decay)
# Two routes to the same penalty: a per-layer regularizer in Keras, or the
# optimizer-level weight_decay term in PyTorch.
# Keras
from tensorflow.keras import regularizers
layers.Dense(256,
kernel_regularizer=regularizers.l2(0.01) # (#1:L2 penalty)
)
# PyTorch - weight decay in optimizer
optimizer = torch.optim.Adam(
model.parameters(),
lr=1e-3,
weight_decay=1e-4 # (#2:L2 via optimizer)
)
# Batch Normalization - implicit regularization
# Keras
model.add(layers.BatchNormalization()) # (#3:After conv/dense)
# PyTorch
nn.Sequential(
nn.Conv2d(64, 128, 3),
nn.BatchNorm2d(128), # (#4:Normalize activations)
nn.ReLU()
)
Instead of hard labels (0 or 1), use soft labels that prevent overconfidence.
Smoothed label = (1 - epsilon) * original + epsilon / num_classes
# Label smoothing is built into both frameworks' cross-entropy losses.
# Keras - built-in support
loss = keras.losses.CategoricalCrossentropy(
label_smoothing=0.1 # epsilon
)
# PyTorch - CrossEntropyLoss
loss_fn = nn.CrossEntropyLoss(
label_smoothing=0.1
)
# Manual implementation
def smooth_labels(labels, epsilon=0.1):
    """Blend one-hot labels toward uniform: (1 - eps) * y + eps / K."""
    n_classes = labels.shape[-1]
    uniform_mass = epsilon / n_classes
    return labels * (1 - epsilon) + uniform_mass
# Training stability via callbacks: checkpoint the best weights, stop early,
# and decay the LR when validation loss plateaus.
from tensorflow.keras.callbacks import (
ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
)
# Save best model
checkpoint = ModelCheckpoint(
'best_model.keras',
monitor='val_loss', # (#1:Track validation loss)
save_best_only=True,
mode='min'
)
# Stop when no improvement
early_stop = EarlyStopping(
monitor='val_loss',
patience=10, # (#2:Wait 10 epochs)
restore_best_weights=True # (#3:Revert to best)
)
# Reduce LR on plateau
reduce_lr = ReduceLROnPlateau(
monitor='val_loss',
factor=0.5, # (#4:Halve learning rate)
patience=5,
min_lr=1e-7
)
callbacks = [checkpoint, early_stop, reduce_lr]
model.fit(X_train, y_train, callbacks=callbacks, epochs=100)
# Manual PyTorch equivalent of checkpoint + early stopping + LR plateau decay.
# NOTE(review): this is pseudo-code — the training step is elided and
# `evaluate`, `num_epochs`, and `scheduler` are assumed defined elsewhere.
best_val_loss = float('inf')
patience_counter = 0
patience = 10
for epoch in range(num_epochs):
# Training
model.train()
for batch in train_loader:
# ... training step ...
# Validation
model.eval()
val_loss = evaluate(model, val_loader) # (#1:Calculate val loss)
# Save best model
if val_loss < best_val_loss:
best_val_loss = val_loss
torch.save(model.state_dict(), 'best_model.pt') # (#2:Save checkpoint)
patience_counter = 0
else:
patience_counter += 1
# Early stopping
if patience_counter >= patience: # (#3:Stop if no improvement)
print(f"Early stopping at epoch {epoch}")
break
# LR scheduling
scheduler.step(val_loss) # (#4:ReduceLROnPlateau)
Optuna is an automatic hyperparameter optimization framework.
pip install optuna optuna-dashboard
import optuna
def objective(trial):
    """Optuna objective: sample hyperparameters, train, return accuracy."""
    # Draw this trial's hyperparameters from the search space.
    learning_rate = trial.suggest_float('lr', 1e-5, 1e-2, log=True)  # (#1:Log-uniform)
    drop_rate = trial.suggest_float('dropout', 0.1, 0.5)
    batch = trial.suggest_categorical('batch_size', [16, 32, 64])  # (#2:Categorical)
    opt_name = trial.suggest_categorical('optimizer', ['Adam', 'SGD'])

    # Build and score a candidate model with the sampled settings.
    candidate = build_model(dropout=drop_rate)
    score = train_and_evaluate(
        candidate, lr=learning_rate, batch_size=batch, optimizer=opt_name
    )
    return score  # (#3:Maximize this)
# Create and run study
# 100 trials of Bayesian-style search; results live on the study object.
study = optuna.create_study(direction='maximize') # (#4:Maximize accuracy)
study.optimize(objective, n_trials=100)
# Best parameters
print(f"Best params: {study.best_params}")
print(f"Best accuracy: {study.best_value:.4f}")
# Enable pruning for unpromising trials
def objective(trial):
    """Optuna objective with per-epoch reporting so weak trials get pruned."""
    candidate = build_model(trial)
    for epoch in range(num_epochs):
        train(candidate)
        val_acc = evaluate(candidate)
        # Surface intermediate progress to the pruner.
        trial.report(val_acc, epoch)  # (#1:Report progress)
        # Abort early when this trial is clearly underperforming.
        if trial.should_prune():  # (#2:Prune bad trials)
            raise optuna.TrialPruned()
    return val_acc
# Create study with pruner
study = optuna.create_study(
direction='maximize',
pruner=optuna.pruners.MedianPruner() # (#3:Prune below median)
)
# Visualize results
# Interactive plots of the search trajectory and which params mattered most.
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study) # (#4:Show progress)
plot_param_importances(study) # (#5:Parameter importance)
# Weights & Biases experiment tracking: record config once, metrics per epoch.
import wandb
# Initialize run
wandb.init(
project='cv-transfer-learning',
config={
'learning_rate': 1e-4,
'epochs': 50,
'batch_size': 32
}
)
# Log metrics
for epoch in range(num_epochs):
train_loss = train(model)
val_loss = evaluate(model)
wandb.log({
'train_loss': train_loss,
'val_loss': val_loss,
'epoch': epoch
})
wandb.finish()
# Define sweep configuration
# Declarative search space; W&B's server suggests params to each agent run.
sweep_config = {
'method': 'bayes', # (#1:Bayesian optimization)
'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
'parameters': {
'learning_rate': {
'distribution': 'log_uniform_values', # (#2:Log-uniform sampling)
'min': 1e-5, 'max': 1e-2
},
'dropout': {'values': [0.2, 0.3, 0.4, 0.5]}, # (#3:Discrete values)
'batch_size': {'values': [16, 32, 64]},
'optimizer': {'values': ['adam', 'sgd']}
}
}
# Create sweep
sweep_id = wandb.sweep(sweep_config, project='cv-sweeps')
# Define training function
# Each agent invocation gets a fresh run whose config holds the suggested params.
def train_sweep():
wandb.init()
config = wandb.config # (#4:Get suggested params)
model = build_model(config.dropout)
# ... train with config.learning_rate, etc.
# Run sweep
wandb.agent(sweep_id, train_sweep, count=50) # (#5:Run 50 trials)
# Use a standard dataset for practice
# NOTE(review): assumes `tf` (tensorflow) is imported earlier in the deck.
import tensorflow_datasets as tfds
# Load dataset
(train_ds, val_ds), info = tfds.load(
'oxford_flowers102', # (#1:102 flower classes)
split=['train', 'validation'],
with_info=True,
as_supervised=True
)
# Preprocessing function
# Resize to the backbone's expected input and apply its normalization.
def preprocess(image, label):
image = tf.image.resize(image, (224, 224))
image = tf.keras.applications.efficientnet.preprocess_input(image) # (#2:Model-specific preprocessing)
return image, label
# Prepare datasets
BATCH_SIZE = 32
train_ds = train_ds.map(preprocess).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE) # (#3:Optimize pipeline)
val_ds = val_ds.map(preprocess).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
Always start with pretrained models - they provide a strong foundation
Choose between feature extraction, partial, or full fine-tuning based on data size
Use appropriate metrics for your task: classification, detection, or segmentation
Dropout, L2, BatchNorm, and label smoothing prevent overfitting
Use checkpoints, early stopping, and LR scheduling for stable training
Optuna and W&B automate the search for optimal hyperparameters
Preparation: Review IoU and mAP concepts. Install Ultralytics YOLO package.
| Type | Resource |
|---|---|
| Library | timm - PyTorch Image Models |
| Documentation | Keras Transfer Learning Guide |
| Tool | Optuna - Hyperparameter Optimization |
| Platform | Weights & Biases |
| Paper | How Transferable are Features in DNNs? |
| Course | fast.ai - Practical Deep Learning |
Fine-tune EfficientNet on the Flowers dataset
Compare different fine-tuning strategies
Try Optuna or W&B sweeps for hyperparameter search