Deep dive into neural network architectures for vision tasks
Neural networks designed for grid-like data (images).
Output feature map at position (i, j):
y[i,j] = sum(sum(x[i+m, j+n] * k[m,n])) + b
Where:
- x: input feature map
- k: kernel/filter of size (M, N)
- b: bias term
- m, n: kernel indices
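To make the indexing concrete, here is a minimal NumPy sketch of this formula (a hypothetical helper: single channel, no stride or padding):

import numpy as np

def conv2d_naive(x, k, b=0.0):
    # x: (H, W) input, k: (M, N) kernel, b: scalar bias
    M, N = k.shape
    H, W = x.shape
    out = np.zeros((H - M + 1, W - N + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(x[i:i+M, j:j+N] * k) + b
    return out

x = np.arange(25, dtype=float).reshape(5, 5)
k = np.ones((3, 3)) / 9.0           # 3x3 mean filter
print(conv2d_naive(x, k).shape)     # (3, 3)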
import torch.nn as nn
# PyTorch Conv2d
conv_layer = nn.Conv2d(
in_channels=3, # (#1:RGB input)
out_channels=64, # (#2:Number of filters)
kernel_size=3, # (#3:3x3 kernel)
stride=1, # (#4:Step size)
padding=1 # (#5:Same padding)
)
# TensorFlow/Keras equivalent
from tensorflow.keras.layers import Conv2D
conv_layer = Conv2D(
filters=64,
kernel_size=(3, 3),
strides=(1, 1),
padding='same', # (#6:Output same size as input)
activation='relu' # (#7:Activation included)
)
Output Size = floor((W - K + 2P) / S) + 1
Where:
- W: Input width/height
- K: Kernel size
- P: Padding
- S: Stride
| Input | Kernel | Padding | Stride | Output |
|---|---|---|---|---|
| 32 | 3 | 1 | 1 | 32 |
| 32 | 3 | 0 | 1 | 30 |
| 32 | 3 | 1 | 2 | 16 |
| 224 | 7 | 3 | 2 | 112 |
Formula: Output = floor((W - K + 2P) / S) + 1
Input: 64x64, Kernel: 3x3, Padding: 0, Stride: 1
Output size?
Input: 64x64, Kernel: 3x3, Padding: 1, Stride: 2
Output size?
After Conv(224, k=7, p=3, s=2) then Pool(k=3, s=2)?
Final size? (like ResNet stem)
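A quick way to check your answers is to wrap the formula in a small helper (a sketch; the function name is illustrative):

import math

def conv_out_size(w, k, p, s):
    # Output = floor((W - K + 2P) / S) + 1
    return math.floor((w - k + 2 * p) / s) + 1

print(conv_out_size(64, 3, 0, 1))           # exercise 1
print(conv_out_size(64, 3, 1, 2))           # exercise 2
after_conv = conv_out_size(224, 7, 3, 2)    # exercise 3, conv stage
print(conv_out_size(after_conv, 3, 0, 2))   # pool as stated (no padding)
print(conv_out_size(after_conv, 3, 1, 2))   # the actual ResNet stem pools with padding=1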
Takes maximum value in each window. Preserves strongest activations.
nn.MaxPool2d(kernel_size=2, stride=2)
Computes average value in each window. Smoother features.
nn.AvgPool2d(kernel_size=2, stride=2)
Averages entire feature map to single value. Used before classification.
nn.AdaptiveAvgPool2d((1, 1))
Purpose: Reduces spatial dimensions, provides translation invariance, and reduces parameters.
import torch
import torch.nn as nn
# Input: batch of 4 images, 64 channels, 32x32 spatial
x = torch.randn(4, 64, 32, 32) # (#1:NCHW format)
# Max Pooling - reduces spatial by 2x
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
out = max_pool(x) # (#2:Shape: (4, 64, 16, 16))
# Global Average Pooling - reduces to 1x1
gap = nn.AdaptiveAvgPool2d((1, 1))
out = gap(x) # (#3:Shape: (4, 64, 1, 1))
# Flatten for fully connected layer
out = out.view(out.size(0), -1) # (#4:Shape: (4, 64))
Dense layers that connect all input neurons to all output neurons.
# PyTorch Linear layer
fc = nn.Linear(
in_features=512,
out_features=10 # num_classes
)
# Keras Dense layer
from tensorflow.keras.layers import Dense
fc = Dense(units=10, activation='softmax')
Modern trend: Replace FC layers with Global Average Pooling to reduce parameters and overfitting.
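A back-of-the-envelope comparison shows why (illustrative numbers based on VGG-16's final 7x7x512 feature map and a 1000-class head):

# FC head: flatten 7*7*512 features into an FC-4096 layer
fc_params = 7 * 7 * 512 * 4096 + 4096    # ~102.8M weights + biases
# GAP head: average to a 512-vector, then a single 512 -> 1000 classifier
gap_params = 512 * 1000 + 1000           # ~0.5M
print(fc_params, gap_params)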
| Function | Formula | Range | Use Case |
|---|---|---|---|
| ReLU | max(0, x) | [0, inf) | Hidden layers (default) |
| LeakyReLU | max(0.01x, x) | (-inf, inf) | Avoid dying neurons |
| Sigmoid | 1/(1+e^-x) | [0, 1] | Binary classification |
| Softmax | e^xi / sum(e^xj) | [0, 1], sum=1 | Multi-class classification |
| GELU | x * P(X ≤ x) | (-inf, inf) | Transformers |
Simple thresholding at zero. Creates sparsity (many zeros) which makes the network more efficient. Gradient is either 0 or 1, avoiding vanishing gradients for positive values.
Each output e^xi is divided by the sum of all e^xj. This normalization creates a valid probability distribution over all classes.
Vanishing gradients: For very large or small inputs, gradient approaches 0. During backprop, gradients get multiplied and shrink to near-zero.
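A small NumPy illustration of the effect (a sketch, not a full backpropagation): the sigmoid derivative is at most 0.25, so multiplying such factors across many layers drives the gradient toward zero.

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

x = 2.0
grad = sigmoid(x) * (1 - sigmoid(x))   # sigmoid derivative, always <= 0.25
print(grad)                            # ~0.105
print(grad ** 10)                      # after 10 such layers: ~1.6e-10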
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(-5, 5, 100)
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
# ReLU: f(x) = max(0, x)
axes[0,0].plot(x, np.maximum(0, x), 'b-', linewidth=2)
axes[0,0].axhline(y=0, color='k', linestyle='--', alpha=0.3)
axes[0,0].axvline(x=0, color='k', linestyle='--', alpha=0.3)
axes[0,0].set_title('ReLU: max(0, x)')
axes[0,0].set_ylim(-1, 5)
# Sigmoid: f(x) = 1/(1+e^-x)
axes[0,1].plot(x, 1/(1+np.exp(-x)), 'r-', linewidth=2)
axes[0,1].axhline(y=0.5, color='k', linestyle='--', alpha=0.3)
axes[0,1].set_title('Sigmoid: 1/(1+e^-x)')
axes[0,1].set_ylim(-0.1, 1.1)
# LeakyReLU: f(x) = max(0.1x, x)
axes[1,0].plot(x, np.where(x > 0, x, 0.1*x), 'g-', linewidth=2)
axes[1,0].axhline(y=0, color='k', linestyle='--', alpha=0.3)
axes[1,0].set_title('LeakyReLU: max(0.1x, x)')
# Softmax example (3 classes)
logits = np.array([2.0, 1.0, 0.1])
softmax = np.exp(logits) / np.sum(np.exp(logits))
axes[1,1].bar(['Class A', 'Class B', 'Class C'], softmax, color=['blue', 'orange', 'green'])
axes[1,1].set_title(f'Softmax: probabilities sum to {softmax.sum():.1f}')
plt.tight_layout()
plt.savefig('activation_plots.png')
# He initialization for ReLU layers
import torch.nn as nn
layer = nn.Linear(512, 256)  # example layer
nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')
import torch
import torch.nn as nn
import torch.nn.functional as F
x = torch.randn(4, 64) # (#1:Batch of 4, 64 features)
# ReLU - Most common for hidden layers
relu = nn.ReLU()
out = relu(x) # (#2:Zeros out negatives)
# LeakyReLU - Allows small negative gradient
leaky = nn.LeakyReLU(negative_slope=0.01)
out = leaky(x) # (#3:Prevents dying ReLU)
# Sigmoid - Binary classification output
out = torch.sigmoid(x) # (#4:Range [0, 1])
# Softmax - Multi-class classification
out = F.softmax(x, dim=1) # (#5:Probabilities sum to 1)
# GELU - Used in Transformers
gelu = nn.GELU()
out = gelu(x) # (#6:Smooth ReLU variant)
Input: 224x224x3
Conv3-64 x2 -> Pool -> 112x112x64
Conv3-128 x2 -> Pool -> 56x56x128
Conv3-256 x3 -> Pool -> 28x28x256
Conv3-512 x3 -> Pool -> 14x14x512
Conv3-512 x3 -> Pool -> 7x7x512
Flatten -> FC-4096 -> FC-4096
FC-1000 (Softmax)
import torch.nn as nn
import torchvision.models as models
# Load pretrained VGG-16
vgg16 = models.vgg16(weights='IMAGENET1K_V1') # (#1:Pretrained weights)
# Custom VGG-like block
class VGGBlock(nn.Module):
    def __init__(self, in_ch, out_ch, num_convs):
        super().__init__()
        layers = []
        for i in range(num_convs):
            layers.append(nn.Conv2d(
                in_ch if i == 0 else out_ch,  # (#2:First conv changes channels)
                out_ch, kernel_size=3, padding=1
            ))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.MaxPool2d(2, 2))  # (#3:Halve spatial dims)
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)
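Using this block, the convolutional stages listed above could be assembled as follows (a sketch; the FC classifier head is omitted):

import torch

features = nn.Sequential(
    VGGBlock(3, 64, 2),      # 224 -> 112
    VGGBlock(64, 128, 2),    # 112 -> 56
    VGGBlock(128, 256, 3),   # 56 -> 28
    VGGBlock(256, 512, 3),   # 28 -> 14
    VGGBlock(512, 512, 3),   # 14 -> 7
)
x = torch.randn(1, 3, 224, 224)
print(features(x).shape)     # torch.Size([1, 512, 7, 7])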
Output = F(x) + x
- x: input (identity)
- F(x): learned residual
- +: element-wise addition
Impact: Won ImageNet 2015, enabled 1000+ layer networks!
import torch.nn as nn
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=3, stride=stride, padding=1)  # (#1:May downsample)
        self.bn1 = nn.BatchNorm2d(out_channels)  # (#2:Batch normalization)
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        # Skip connection with projection if dimensions change
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride),  # (#3:1x1 conv to match dims)
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.shortcut(x)  # (#4:Skip connection)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += identity  # (#5:Add residual)
        return self.relu(out)
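A quick shape check of the block (sketch):

import torch

block = ResidualBlock(64, 128, stride=2)   # downsampling block with projection shortcut
x = torch.randn(4, 64, 56, 56)
print(block(x).shape)                      # torch.Size([4, 128, 28, 28])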
| Model | Layers | Parameters | Top-1 Acc | Block Type |
|---|---|---|---|---|
| ResNet-18 | 18 | 11.7M | 69.8% | Basic |
| ResNet-34 | 34 | 21.8M | 73.3% | Basic |
| ResNet-50 | 50 | 25.6M | 76.1% | Bottleneck |
| ResNet-101 | 101 | 44.5M | 77.4% | Bottleneck |
| ResNet-152 | 152 | 60.2M | 78.3% | Bottleneck |
Bottleneck block: Uses 1x1 conv to reduce dimensions, 3x3 conv, then 1x1 to expand - more efficient!
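A minimal sketch of the bottleneck pattern (main branch only; the skip connection works as in the basic block above, and BatchNorm/ReLU are omitted for brevity — torchvision's Bottleneck differs in details):

import torch.nn as nn

def bottleneck(in_ch, mid_ch, out_ch):
    # 1x1 reduce -> 3x3 -> 1x1 expand
    return nn.Sequential(
        nn.Conv2d(in_ch, mid_ch, kernel_size=1),
        nn.Conv2d(mid_ch, mid_ch, kernel_size=3, padding=1),
        nn.Conv2d(mid_ch, out_ch, kernel_size=1),
    )

blk = bottleneck(256, 64, 256)   # ResNet-50-style stage: 256 -> 64 -> 256 channels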
Compound scaling - systematically scale depth, width, and resolution together.
depth: d = alpha^phi
width: w = beta^phi
resolution: r = gamma^phi
Constraint: alpha * beta^2 * gamma^2 ~ 2
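With the coefficients from the EfficientNet paper (alpha=1.2, beta=1.1, gamma=1.15), the constraint can be checked numerically (sketch):

alpha, beta, gamma = 1.2, 1.1, 1.15
print(alpha * beta**2 * gamma**2)          # ~1.92, close to 2
phi = 3
print(alpha**phi, beta**phi, gamma**phi)   # depth, width, resolution multipliers at phi=3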
| Model | Params | Top-1 |
|---|---|---|
| B0 | 5.3M | 77.1% |
| B3 | 12M | 81.6% |
| B5 | 30M | 83.6% |
| B7 | 66M | 84.3% |
Key insight: EfficientNet-B0 matches or exceeds ResNet-50's accuracy (77.1% vs 76.1%) with roughly 5x fewer parameters (5.3M vs 25.6M)!
import torchvision.models as models
import torch.nn as nn
# Load pretrained EfficientNet
efficientnet = models.efficientnet_b0(weights='IMAGENET1K_V1') # (#1:5.3M params)
# Modify for custom number of classes
num_classes = 10
efficientnet.classifier = nn.Sequential(
nn.Dropout(p=0.2, inplace=True),
nn.Linear(1280, num_classes) # (#2:Replace final layer)
)
# EfficientNet-V2 (improved version)
efficientnet_v2 = models.efficientnet_v2_s(weights='IMAGENET1K_V1') # (#3:Faster training)
# Timm library - more options
import timm
model = timm.create_model('efficientnet_b3', pretrained=True, num_classes=10) # (#4:Easy model creation)
1. Split image into patches (16x16)
2. Flatten patches to vectors
3. Linear projection + position embed
4. Add [CLS] token
5. Transformer encoder blocks
6. MLP head on [CLS] for classification
Trade-off: ViT requires large datasets (JFT-300M) to outperform CNNs. With less data, CNNs still win!
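The patch arithmetic for ViT-Base at 224x224 with 16x16 patches, as a quick check (sketch):

img, patch = 224, 16
num_patches = (img // patch) ** 2     # 14 * 14 = 196 patches
patch_dim = patch * patch * 3         # 768 values per flattened RGB patch
seq_len = num_patches + 1             # +1 for the [CLS] token -> 197 tokens
print(num_patches, patch_dim, seq_len)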
import torch
import torch.nn as nn
from transformers import ViTForImageClassification, ViTImageProcessor
# Using Hugging Face Transformers
model_name = "google/vit-base-patch16-224" # (#1:16x16 patches, 224x224 input)
processor = ViTImageProcessor.from_pretrained(model_name) # (#2:Preprocessing)
model = ViTForImageClassification.from_pretrained(model_name)
# Using timm library
import timm
vit = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=10) # (#3:Easy setup)
# PyTorch native (torchvision)
from torchvision.models import vit_b_16, ViT_B_16_Weights
vit = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1) # (#4:Pretrained)
vit.heads = nn.Linear(768, 10) # (#5:Custom head)
ConvNeXt: a modernized CNN that adopts Transformer-inspired design choices (large depthwise kernels, LayerNorm, GELU, inverted bottlenecks) while staying fully convolutional.
Swin Transformer: a hierarchical Vision Transformer that computes self-attention within shifted local windows, producing multi-scale feature maps like a CNN.
Which architecture would you recommend for each scenario?
Mobile app with 5MB size limit, real-time inference needed.
ResNet-50, EfficientNet-B0, or ViT-B?
Medical imaging with only 500 labeled images, high accuracy required.
Train from scratch or fine-tune?
Production system with 10M+ training images, cost is not a concern.
CNN or Transformer-based?
import timm
import torch.nn as nn
import torchvision.models as models
# ConvNeXt - CNN competitive with ViT
convnext = models.convnext_tiny(weights='IMAGENET1K_V1') # (#1:28.6M params)
convnext_base = timm.create_model('convnext_base', pretrained=True) # (#2:Via timm)
# Swin Transformer
swin = models.swin_t(weights='IMAGENET1K_V1') # (#3:Swin-Tiny)
swin_base = timm.create_model('swin_base_patch4_window7_224', pretrained=True) # (#4:Base model)
# Modify for custom classes
num_classes = 10
convnext.classifier[2] = nn.Linear(768, num_classes) # (#5:ConvNeXt head)
swin.head = nn.Linear(768, num_classes) # (#6:Swin head)
Single-shot detection: One neural network predicts bounding boxes and class probabilities in a single pass.
| Version | Year | Key Feature |
|---|---|---|
| YOLOv3 | 2018 | Multi-scale predictions |
| YOLOv5 | 2020 | PyTorch native |
| YOLOv8 | 2023 | Anchor-free |
| YOLOv11 | 2024 | Latest improvements |
from ultralytics import YOLO
# Load pretrained model
model = YOLO('yolov8n.pt') # (#1:Nano model - fastest)
# Options: yolov8n, yolov8s, yolov8m, yolov8l, yolov8x
# Run inference
results = model('image.jpg') # (#2:Single image)
results = model(['img1.jpg', 'img2.jpg']) # (#3:Batch inference)
# Process results
for result in results:
    boxes = result.boxes  # (#4:Bounding boxes)
    for box in boxes:
        xyxy = box.xyxy[0]  # (#5:x1, y1, x2, y2)
        conf = box.conf[0]  # (#6:Confidence score)
        cls = box.cls[0]  # (#7:Class index)
# Train custom model
model.train(data='custom.yaml', epochs=100, imgsz=640) # (#8:Fine-tuning)
U-shape: The architecture resembles letter "U" - hence the name. Skip connections are crucial for preserving spatial information!
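A minimal sketch of one decoder step, showing how a skip connection concatenates encoder features with the upsampled decoder features (names and channel counts are illustrative, not a specific library's API):

import torch
import torch.nn as nn

class UpBlock(nn.Module):
    def __init__(self, in_ch, skip_ch, out_ch):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2)  # upsample 2x
        self.conv = nn.Sequential(
            nn.Conv2d(out_ch + skip_ch, out_ch, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1), nn.ReLU(inplace=True),
        )

    def forward(self, x, skip):
        x = self.up(x)
        x = torch.cat([x, skip], dim=1)   # skip connection: concat encoder features
        return self.conv(x)

up = UpBlock(256, 128, 128)
out = up(torch.randn(1, 256, 32, 32), torch.randn(1, 128, 64, 64))
print(out.shape)                          # torch.Size([1, 128, 64, 64])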
import segmentation_models_pytorch as smp
# Using segmentation_models_pytorch
model = smp.Unet(
encoder_name="resnet34", # (#1:Pretrained backbone)
encoder_weights="imagenet", # (#2:ImageNet weights)
in_channels=3, # (#3:RGB input)
classes=1, # (#4:Binary segmentation)
)
# U-Net++ (improved skip connections)
model = smp.UnetPlusPlus(
encoder_name="efficientnet-b3",
encoder_weights="imagenet",
classes=5, # (#5:Multi-class segmentation)
)
# Using Hugging Face
from transformers import SegformerForSemanticSegmentation
model = SegformerForSemanticSegmentation.from_pretrained(
"nvidia/segformer-b0-finetuned-ade-512-512" # (#6:Pretrained segmentation)
)
Foundation model for promptable segmentation trained on 11M images and 1B masks.
SAM 2 (2024): Video support added!
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor
# Load SAM model
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h.pth") # (#1:ViT-Huge)
predictor = SamPredictor(sam)
# Set image (encodes once)
image = cv2.imread("image.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # SAM expects RGB
predictor.set_image(image) # (#2:Run image encoder)
# Predict with point prompt
input_point = np.array([[500, 375]]) # (#3:Click location)
input_label = np.array([1]) # (#4:1=foreground, 0=background)
masks, scores, logits = predictor.predict(
point_coords=input_point,
point_labels=input_label,
multimask_output=True # (#5:Returns 3 masks)
)
# Predict with box prompt
input_box = np.array([100, 100, 400, 400]) # (#6:x1, y1, x2, y2)
masks, _, _ = predictor.predict(box=input_box)
| Loss | Formula | Use Case |
|---|---|---|
| Binary Cross-Entropy | -[y*log(p) + (1-y)*log(1-p)] | Binary classification |
| Categorical CE | -sum(y_i * log(p_i)) | Multi-class classification |
| Dice Loss | 1 - 2*\|X ∩ Y\| / (\|X\| + \|Y\|) | Segmentation |
| IoU Loss | 1 - \|X ∩ Y\| / \|X ∪ Y\| | Object detection |
| Focal Loss | -alpha*(1-p)^gamma*log(p) | Class imbalance |
import torch
import torch.nn as nn
# Binary Cross-Entropy (with logits)
criterion = nn.BCEWithLogitsLoss() # (#1:Includes sigmoid)
loss = criterion(predictions, targets)
# Multi-class Cross-Entropy
criterion = nn.CrossEntropyLoss() # (#2:Includes softmax)
loss = criterion(predictions, targets) # (#3:targets: class indices)
# Dice Loss for segmentation
def dice_loss(pred, target, smooth=1e-6):
    pred = torch.sigmoid(pred)  # (#4:Convert to probabilities)
    intersection = (pred * target).sum()
    union = pred.sum() + target.sum()
    dice = (2. * intersection + smooth) / (union + smooth)  # (#5:Dice coefficient)
    return 1 - dice  # (#6:Loss = 1 - coefficient)

# Combined loss (common for segmentation)
bce = nn.BCEWithLogitsLoss()(predictions, targets)
loss = 0.5 * bce + 0.5 * dice_loss(predictions, targets)  # (#7:Balance both objectives)
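The table above also lists Focal Loss; a minimal binary version of the standard formulation could look like this (a sketch, not a specific library's implementation):

import torch

def focal_loss(pred_logits, target, alpha=0.25, gamma=2.0):
    # target: float tensor of 0s and 1s, same shape as pred_logits
    p = torch.sigmoid(pred_logits)
    pt = p * target + (1 - p) * (1 - target)               # probability assigned to the true class
    alpha_t = alpha * target + (1 - alpha) * (1 - target)  # class-balancing weight
    return (-alpha_t * (1 - pt) ** gamma * torch.log(pt + 1e-8)).mean()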
Classic optimizer. Good generalization but slower convergence.
optim.SGD(params, lr=0.01, momentum=0.9)
Adaptive learning rates. Fast convergence, works well out-of-box.
optim.Adam(params, lr=0.001)
Adam with decoupled weight decay. Best for Transformers.
optim.AdamW(params, lr=0.001, weight_decay=0.01)
Rule of thumb: Start with AdamW for transformers, Adam for CNNs. Fine-tune with SGD+momentum for best final results.
import torch.optim as optim
# Basic optimizer setup
optimizer = optim.AdamW(
model.parameters(),
lr=1e-4, # (#1:Learning rate)
weight_decay=0.01 # (#2:L2 regularization)
)
# Different LR for different layers (transfer learning)
optimizer = optim.AdamW([
{'params': model.backbone.parameters(), 'lr': 1e-5}, # (#3:Pretrained - low LR)
{'params': model.head.parameters(), 'lr': 1e-3} # (#4:New layers - high LR)
])
# Training step
optimizer.zero_grad() # (#5:Clear gradients)
loss = criterion(model(inputs), targets)
loss.backward() # (#6:Compute gradients)
optimizer.step() # (#7:Update weights)
| Scheduler | Description | Best For |
|---|---|---|
| StepLR | Decay by gamma every N epochs | Simple decay |
| CosineAnnealing | Cosine curve from max to min LR | Transformers, long training |
| OneCycleLR | Warm-up then decay in one cycle | Fast convergence |
| ReduceLROnPlateau | Reduce when metric stops improving | Adaptive training |
| WarmupCosine | Linear warmup + cosine decay | ViT, large batch training |
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
# Cosine Annealing
scheduler = CosineAnnealingLR(
optimizer,
T_max=100, # (#1:Total epochs)
eta_min=1e-6 # (#2:Minimum LR)
)
# OneCycleLR (recommended for fast training)
scheduler = OneCycleLR(
optimizer,
max_lr=1e-3, # (#3:Peak learning rate)
epochs=100,
steps_per_epoch=len(train_loader), # (#4:Total steps)
pct_start=0.1 # (#5:10% warmup)
)
# Training loop
for epoch in range(epochs):
    for batch in train_loader:
        inputs, targets = batch
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        scheduler.step()  # (#6:Update LR each step for OneCycleLR)
    # scheduler.step()  # (#7:Or once per epoch for others)
import torch

def train_epoch(model, loader, criterion, optimizer, scheduler, device):
    model.train()  # (#1:Training mode)
    total_loss = 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)  # (#2:Move to GPU)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # (#3:Gradient clipping)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Validation loop
@torch.no_grad()  # (#4:Disable gradients)
def validate(model, loader, criterion, device):
    model.eval()  # (#5:Evaluation mode)
    # ... similar but no backward pass
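For completeness, one way the validation loop could be filled in (a sketch; the accuracy computation assumes a single-label classification task):

@torch.no_grad()
def validate(model, loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_loss += criterion(outputs, labels).item()
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)
    return total_loss / len(loader), correct / total  # average loss, accuracy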
Conv, Pool, FC layers with activations form the foundation of all vision architectures
ResNet's residual connections enable training very deep networks effectively
ViT and ConvNeXt compete for state-of-the-art; choose based on data size and task
Practical advice: Start with pretrained EfficientNet or ResNet. Use AdamW + cosine scheduler. Fine-tune, don't train from scratch!
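Putting that advice together, a compact fine-tuning setup might look like this (a sketch assuming 10 classes and standard torchvision/PyTorch APIs; hyperparameters are illustrative):

import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.optim.lr_scheduler import CosineAnnealingLR

model = models.efficientnet_b0(weights='IMAGENET1K_V1')
model.classifier[1] = nn.Linear(1280, 10)                  # replace the head for 10 classes

optimizer = optim.AdamW([
    {'params': model.features.parameters(), 'lr': 1e-5},   # pretrained backbone: low LR
    {'params': model.classifier.parameters(), 'lr': 1e-3}  # new head: higher LR
], weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-6)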
Preparation: Download a small custom dataset from Kaggle for fine-tuning practice.
| Type | Resource |
|---|---|
| Paper | ResNet - Deep Residual Learning |
| Paper | ViT - An Image is Worth 16x16 Words |
| Paper | ConvNeXt - A ConvNet for the 2020s |
| Library | Ultralytics YOLO |
| Library | segmentation_models_pytorch |
| Tutorial | PyTorch Transfer Learning |
Build and train CNN models on Google Colab
Implement ResNet and fine-tune EfficientNet
Compare optimizers and learning rate schedules