Image transformations, augmentation, and classical feature extraction
import cv2
import numpy as np
# Load image (OpenCV loads as BGR)
img_bgr = cv2.imread('image.jpg') # (#1:BGR by default)
# Convert to different color spaces
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB) # (#2:For matplotlib)
img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) # (#3:Single channel)
img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) # (#4:Hue-based ops)
img_lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB) # (#5:Perceptual)
# Access individual channels
h, s, v = cv2.split(img_hsv) # (#6:Separate channels)
l, a, b = cv2.split(img_lab)
# Merge channels back
img_hsv_merged = cv2.merge([h, s, v]) # (#7:Combine channels)
| Data Type | Range | Use Case |
|---|---|---|
| uint8 | 0 - 255 | Standard image storage, display |
| float32 | 0.0 - 1.0 | Neural network input, processing |
| float32 | -1.0 - 1.0 | Normalized for some models |
| float64 | Any | Scientific computing, precision |
Important: Always verify the expected input range for your model. Mismatched ranges cause silent failures!
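A quick sanity check like the sketch below makes range mismatches visible before they become silent failures (describe_image is an illustrative helper, not a library function):
import numpy as np
def describe_image(img):
    """Print dtype, value range, and shape to catch mismatched inputs early."""
    arr = np.asarray(img)
    print(f"dtype={arr.dtype}, min={arr.min():.3f}, max={arr.max():.3f}, shape={arr.shape}")
describe_image(img_bgr)  # e.g. dtype=uint8, min=0.000, max=255.000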
import numpy as np
# uint8 to float32 [0, 1]
img_normalized = img.astype(np.float32) / 255.0 # (#1:Simple scaling)
# uint8 to float32 [-1, 1]
img_centered = (img.astype(np.float32) / 127.5) - 1.0 # (#2:Centered range)
# Back to uint8
img_uint8 = (img_normalized * 255).astype(np.uint8) # (#3:For display)
# Float to uint8 from [-1, 1]
img_uint8 = ((img_centered + 1.0) * 127.5).astype(np.uint8) # (#4:Reverse)
# Clip to valid range (safety)
img_clipped = np.clip(img_processed, 0, 255).astype(np.uint8) # (#5:Avoid overflow)
A pixel has RGB values: [128, 64, 192] (uint8)
What are the normalized values in [0, 1] range?
Formula: value / 255.0
What are the values in [-1, 1] range?
Formula: (value / 127.5) - 1.0
After ImageNet normalization (R channel), given mean=0.485, std=0.229?
Formula: (normalized - mean) / std
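To check your answers, the arithmetic can be scripted directly (printed values are rounded):
import numpy as np
pixel = np.array([128, 64, 192], dtype=np.float32)
print(pixel / 255.0)                  # approx [0.502, 0.251, 0.753]
print(pixel / 127.5 - 1.0)            # approx [0.004, -0.498, 0.506]
print((128 / 255.0 - 0.485) / 0.229)  # approx 0.074 for the R channel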
import numpy as np
import torch
# Convert between formats
img_chw = np.transpose(img_hwc, (2, 0, 1)) # (#1:HWC to CHW)
img_hwc = np.transpose(img_chw, (1, 2, 0)) # (#2:CHW to HWC)
# PyTorch tensors
tensor_chw = torch.permute(tensor_hwc, (2, 0, 1)) # (#3:PyTorch permute)
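The same idea extends to batches; a sketch assuming a hypothetical NumPy batch `batch_nhwc`:
import numpy as np
import torch
batch_nhwc = np.zeros((8, 224, 224, 3), dtype=np.float32)                   # example batch of 8 images
batch_nchw = np.ascontiguousarray(np.transpose(batch_nhwc, (0, 3, 1, 2)))   # NHWC -> NCHW
tensor_nchw = torch.from_numpy(batch_nchw)                                   # wrap as a PyTorch tensor
tensor_nhwc = tensor_nchw.permute(0, 2, 3, 1).contiguous()                   # NCHW -> NHWC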
Stretch to target size. Fast but may distort aspect ratio.
Resize and pad to maintain proportions. Adds letterboxing.
Resize so the shorter edge matches the target, then crop the center. May lose edge content.
import cv2
# Simple resize (may distort)
img_resized = cv2.resize(img, (224, 224)) # (#1:Direct resize)
# Resize with aspect ratio preservation + padding
def resize_with_padding(img, target_size):
    h, w = img.shape[:2]
    scale = min(target_size / h, target_size / w) # (#2:Scale factor)
    new_h, new_w = int(h * scale), int(w * scale)
    resized = cv2.resize(img, (new_w, new_h))
    # Create padded canvas
    canvas = np.zeros((target_size, target_size, 3), dtype=np.uint8) # (#3:Black pad)
    y_offset = (target_size - new_h) // 2
    x_offset = (target_size - new_w) // 2
    canvas[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized
    return canvas
# Center crop after resize
def resize_and_center_crop(img, target_size):
    h, w = img.shape[:2]
    scale = max(target_size / h, target_size / w) # (#4:Cover target)
    new_h, new_w = int(np.ceil(h * scale)), int(np.ceil(w * scale)) # Round up so the crop always fits
    resized = cv2.resize(img, (new_w, new_h))
    # Center crop
    y_start = (new_h - target_size) // 2
    x_start = (new_w - target_size) // 2
    return resized[y_start:y_start+target_size, x_start:x_start+target_size] # (#5:Crop)
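A quick comparison of the three strategies on the same image (assuming `img` is a color image loaded as above):
padded = resize_with_padding(img, 224)       # letterboxed, aspect ratio preserved
cropped = resize_and_center_crop(img, 224)   # fills the frame, edges may be cut
stretched = cv2.resize(img, (224, 224))      # fastest, aspect ratio not preserved
print(padded.shape, cropped.shape, stretched.shape)  # each (224, 224, 3) for a color input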
Normalized inputs help gradient descent converge faster by keeping values in a similar range
Keeps inputs in a numerically stable range, avoiding overflow and saturated activations whose gradients vanish during training
Pretrained models expect inputs normalized with their training statistics (e.g., ImageNet)
Key insight: Without normalization, different features (pixels) would have vastly different scales, making optimization unstable.
# Zero mean, unit variance
mean = img.mean()
std = img.std()
img_norm = (img - mean) / std
Good for variable lighting conditions
# Standard ImageNet normalization
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
img_norm = (img - mean) / std
Required for pretrained models
import numpy as np
import torch
from torchvision import transforms
# ImageNet statistics (computed from 1M+ images)
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406]) # (#1:RGB means)
IMAGENET_STD = np.array([0.229, 0.224, 0.225]) # (#2:RGB stds)
# NumPy implementation
def imagenet_normalize(img):
    """img should be float32 in [0, 1] range, RGB format"""
    return (img - IMAGENET_MEAN) / IMAGENET_STD # (#3:Per-channel norm)
# PyTorch transforms (preferred)
transform = transforms.Compose([
    transforms.ToTensor(), # (#4:HWC uint8 to CHW float [0,1])
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) # (#5:Apply stats)
])
# TensorFlow/Keras
from tensorflow.keras.applications.resnet50 import preprocess_input
img_preprocessed = preprocess_input(img) # (#6:Model-specific)
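Normalized arrays are no longer directly displayable; a minimal sketch of undoing the normalization for visualization (this helper is illustrative, not part of torchvision):
import numpy as np
def denormalize_for_display(img_norm, mean=IMAGENET_MEAN, std=IMAGENET_STD):
    """Reverse (x - mean) / std and return a displayable uint8 HWC image."""
    img = img_norm * std + mean
    return np.clip(img * 255.0, 0, 255).astype(np.uint8)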
import cv2
# For grayscale images
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
equalized = cv2.equalizeHist(gray)
# For color images (equalize V channel)
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
hsv[:,:,2] = cv2.equalizeHist(hsv[:,:,2])
enhanced = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
import cv2
# Create CLAHE object
clahe = cv2.createCLAHE(
    clipLimit=2.0, # (#1:Contrast limit to prevent over-amplification)
    tileGridSize=(8, 8) # (#2:Size of local regions)
)
# Apply to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
clahe_gray = clahe.apply(gray) # (#3:Local adaptive equalization)
# Apply to color (LAB space recommended)
lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
lab[:,:,0] = clahe.apply(lab[:,:,0]) # (#4:Apply to L channel only)
enhanced = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR) # (#5:Convert back)
CLAHE vs Standard: CLAHE works locally, preventing over-amplification in already bright regions. Better for real-world images.
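A quick side-by-side makes the difference visible (assumes matplotlib and the `gray`, `equalized`, and `clahe_gray` arrays from the snippets above):
import matplotlib.pyplot as plt
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, title, image in zip(axes, ['Original', 'equalizeHist', 'CLAHE'],
                            [gray, equalized, clahe_gray]):
    ax.imshow(image, cmap='gray')
    ax.set_title(title)
    ax.axis('off')
plt.show()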
import cv2
import numpy as np
# Gaussian blur (simple denoising)
blurred = cv2.GaussianBlur(img, (5, 5), 0) # (#1:Kernel size must be odd)
# Non-local means denoising (better quality)
denoised = cv2.fastNlMeansDenoisingColored(
    img, None, h=10, hColor=10, # (#2:Filter strength; hColor is the color-component strength)
    templateWindowSize=7, searchWindowSize=21 # (#3:Window sizes)
)
# Gamma correction
def adjust_gamma(img, gamma=1.0):
    inv_gamma = 1.0 / gamma
    table = np.array([((i / 255.0) ** inv_gamma) * 255
                      for i in range(256)]).astype(np.uint8) # (#4:Lookup table)
    return cv2.LUT(img, table) # (#5:Apply LUT)
brightened = adjust_gamma(img, gamma=2.0) # (#6:gamma > 1 brightens with this formula)
darkened = adjust_gamma(img, gamma=0.5) # (#7:gamma < 1 darkens)
Key insight: Good augmentation can be worth more than additional training data!
import albumentations as A
from albumentations.pytorch import ToTensorV2 # (#1:PyTorch integration)
# Basic setup
transform = A.Compose([
    A.Resize(224, 224), # (#2:Always resize first)
    A.HorizontalFlip(p=0.5), # (#3:50% probability)
    A.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]), # (#4:ImageNet stats)
    ToTensorV2() # (#5:NumPy to PyTorch tensor)
])
# Apply transformation
transformed = transform(image=image)
img_tensor = transformed['image'] # (#6:Access result)
Why Albumentations? Faster than torchvision, supports bboxes/masks, rich augmentation library.
import albumentations as A
geometric_transform = A.Compose([
    A.HorizontalFlip(p=0.5), # (#1:Mirror horizontally)
    A.VerticalFlip(p=0.3), # (#2:Mirror vertically)
    A.Rotate(limit=30, p=0.7), # (#3:Random rotation +/- 30 degrees)
    A.ShiftScaleRotate(
        shift_limit=0.1, # (#4:Shift by 10% max)
        scale_limit=0.2, # (#5:Scale by +/- 20%)
        rotate_limit=15, # (#6:Rotate +/- 15 degrees)
        p=0.8
    ),
    A.ElasticTransform(
        alpha=120, sigma=120 * 0.05, # (#7:Elastic deformation)
        alpha_affine=120 * 0.03, p=0.3
    ),
    A.GridDistortion(p=0.2), # (#8:Grid-based distortion)
    A.Perspective(scale=(0.05, 0.1), p=0.3) # (#9:Perspective change)
])
import albumentations as A
photometric_transform = A.Compose([
    A.RandomBrightnessContrast(
        brightness_limit=0.2, # (#1:Brightness change)
        contrast_limit=0.2, # (#2:Contrast change)
        p=0.7
    ),
    A.HueSaturationValue(
        hue_shift_limit=20, # (#3:Hue shift)
        sat_shift_limit=30, # (#4:Saturation change)
        val_shift_limit=20, # (#5:Value/brightness)
        p=0.5
    ),
    A.GaussNoise(var_limit=(10, 50), p=0.3), # (#6:Add Gaussian noise)
    A.GaussianBlur(blur_limit=(3, 7), p=0.3), # (#7:Apply blur)
    A.MotionBlur(blur_limit=7, p=0.2), # (#8:Motion blur effect)
    A.CLAHE(clip_limit=4.0, p=0.3), # (#9:Adaptive histogram eq)
    A.ColorJitter(p=0.4) # (#10:Random color changes)
])
import albumentations as A
# Cutout: randomly mask regions
cutout = A.CoarseDropout(
    max_holes=8, # Number of patches
    max_height=32,
    max_width=32,
    fill_value=0, # Black patches
    p=0.5
)
Forces model to use all features, not just dominant ones.
import numpy as np
def mixup(img1, img2, label1, label2, alpha=0.2):
    lam = np.random.beta(alpha, alpha)
    mixed_img = lam * img1 + (1 - lam) * img2
    mixed_label = lam * label1 + (1 - lam) * label2
    return mixed_img, mixed_label
Blends two images and their labels for smoother decision boundaries.
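During training, the mixed label is usually consumed as a weighted loss rather than a literal soft target; a sketch assuming PyTorch and integer class labels y1, y2:
import torch.nn.functional as F
def mixup_criterion(logits, y1, y2, lam):
    """Cross-entropy weighted by the same lambda used to mix the images."""
    return lam * F.cross_entropy(logits, y1) + (1 - lam) * F.cross_entropy(logits, y2)
The same weighted criterion applies to CutMix below.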
def cutmix(img1, img2, label1, label2):
    lam = np.random.beta(1.0, 1.0)
    h, w = img1.shape[:2]
    # Random box
    cut_w = int(w * np.sqrt(1 - lam))
    cut_h = int(h * np.sqrt(1 - lam))
    cx, cy = np.random.randint(w), np.random.randint(h)
    # Paste region from img2
    mixed = img1.copy()
    x1, y1 = max(cx-cut_w//2, 0), max(cy-cut_h//2, 0)
    x2, y2 = min(cx+cut_w//2, w), min(cy+cut_h//2, h)
    mixed[y1:y2, x1:x2] = img2[y1:y2, x1:x2]
    lam = 1 - ((x2 - x1) * (y2 - y1)) / (w * h)  # Adjust lambda to the actual pasted area (box may be clipped)
    return mixed, lam*label1 + (1-lam)*label2
from torchvision.transforms import autoaugment
# Learned augmentation policies
transform = autoaugment.AutoAugment(
    policy=autoaugment.AutoAugmentPolicy.IMAGENET
)
# RandAugment (simpler alternative)
from torchvision.transforms import RandAugment
transform = RandAugment(
    num_ops=2, # Operations per image
    magnitude=9 # Intensity (0-30)
)
Policies learned by searching for best augmentations
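Either policy drops into a standard torchvision pipeline before ToTensor; a sketch (both AutoAugment and RandAugment expect PIL images or uint8 tensors):
from torchvision import transforms
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandAugment(num_ops=2, magnitude=9),  # or autoaugment.AutoAugment(...)
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])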
import albumentations as A
from albumentations.pytorch import ToTensorV2
def get_train_transforms(img_size=224):
    return A.Compose([
        A.RandomResizedCrop(img_size, img_size, scale=(0.8, 1.0)), # (#1:Random crop)
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.15, rotate_limit=15, p=0.5),
        A.OneOf([ # (#2:Apply ONE of these)
            A.GaussNoise(var_limit=(10, 50)),
            A.GaussianBlur(blur_limit=3),
            A.MotionBlur(blur_limit=3),
        ], p=0.3),
        A.OneOf([
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2),
            A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10),
        ], p=0.5),
        A.CoarseDropout(max_holes=8, max_height=16, max_width=16, p=0.3), # (#3:Cutout)
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # (#4:ImageNet)
        ToTensorV2() # (#5:To tensor)
    ])

def get_val_transforms(img_size=224): # (#6:No augmentation for validation!)
    return A.Compose([
        A.Resize(img_size, img_size),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])
You're training a model to classify documents (letters, receipts, forms). Which augmentations make sense?
Would flipping a document horizontally be realistic?
Think: Do documents get flipped in real use?
What rotation range would be appropriate?
Think: Scanner misalignment range
Would strong color changes be appropriate?
Think: Most documents are grayscale/B&W
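One reasonable answer to the questions above, expressed as a hedged Albumentations pipeline for scanned documents (the exact limits depend on your scanner data):
import albumentations as A
document_transform = A.Compose([
    A.Rotate(limit=5, p=0.5),  # small rotations mimic scanner misalignment
    A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.5),  # mild exposure variation
    A.GaussNoise(var_limit=(5, 20), p=0.3),  # light sensor/scan noise
    # No horizontal/vertical flips or strong color shifts: mirrored or tinted text is unrealistic
])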
import cv2
# Create SIFT detector
sift = cv2.SIFT_create() # (#1:Initialize)
# Detect and compute
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
keypoints, descriptors = sift.detectAndCompute(
    gray, None # (#2:No mask)
)
# Draw keypoints
img_kp = cv2.drawKeypoints(
    img, keypoints, None,
    flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS # (#3:Show scale/orientation)
)
print(f"Found {len(keypoints)} keypoints")
print(f"Descriptor shape: {descriptors.shape}") # (#4:(N, 128))
import cv2
# Create ORB detector
orb = cv2.ORB_create(
    nfeatures=500, # (#1:Max keypoints)
    scaleFactor=1.2, # (#2:Pyramid scale)
    nlevels=8 # (#3:Pyramid levels)
)
# Detect and compute
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
keypoints, descriptors = orb.detectAndCompute(gray, None)
# Binary descriptors - use Hamming distance
print(f"Descriptor type: {descriptors.dtype}") # (#4:uint8)
print(f"Descriptor shape: {descriptors.shape}") # (#5:(N, 32) = 256 bits)
from skimage.feature import hog
from skimage import exposure
import cv2
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
resized = cv2.resize(gray, (64, 128)) # (#1:Standard 64x128 (width x height) detection window)
# Compute HOG features
features, hog_image = hog(
    resized,
    orientations=9, # (#2:Orientation bins)
    pixels_per_cell=(8, 8), # (#3:Cell size)
    cells_per_block=(2, 2), # (#4:Block size)
    visualize=True, # (#5:Return visualization)
    feature_vector=True # (#6:Flatten output)
)
# Enhance visualization
hog_image = exposure.rescale_intensity(hog_image, in_range=(0, 10))
print(f"HOG feature vector: {features.shape}") # (#7:(3780,) for a 64x128 window)
from sklearn.decomposition import PCA
import numpy as np
# Flatten images to feature vectors
# images shape: (n_samples, height, width, channels)
n_samples = images.shape[0]
features = images.reshape(n_samples, -1) # (#1:Flatten to (n, h*w*c))
# Normalize features
features = features.astype(np.float32) / 255.0
features = (features - features.mean(axis=0)) / (features.std(axis=0) + 1e-8) # (#2:Standardize; epsilon guards against zero-variance pixels)
# Apply PCA
pca = PCA(n_components=50) # (#3:Reduce to 50 dimensions)
features_reduced = pca.fit_transform(features)
# Explained variance
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.2%}") # (#4:Info retained)
# For visualization (2D)
pca_2d = PCA(n_components=2)
features_2d = pca_2d.fit_transform(features) # (#5:For plotting)
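If you would rather choose the dimensionality by retained variance than by a fixed number, scikit-learn accepts a fraction:
pca_95 = PCA(n_components=0.95)               # keep enough components to explain 95% of the variance
features_95 = pca_95.fit_transform(features)
print(f"Components kept: {pca_95.n_components_}")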
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# t-SNE for visualization
tsne = TSNE(
    n_components=2,
    perplexity=30, # Balance local/global
    random_state=42
)
features_tsne = tsne.fit_transform(features)
# Plot
plt.scatter(
    features_tsne[:, 0],
    features_tsne[:, 1],
    c=labels, cmap='tab10', alpha=0.6
)
plt.title('t-SNE Visualization')
import umap
# UMAP - faster and better preserves global structure
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42
)
features_umap = reducer.fit_transform(features)
# Plot
plt.scatter(
    features_umap[:, 0],
    features_umap[:, 1],
    c=labels, cmap='tab10', alpha=0.6
)
plt.title('UMAP Visualization')
Tip: Apply PCA first to reduce to ~50 dimensions, then t-SNE/UMAP for better results.
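That tip as a short sketch (the same 50-dimensional input also works for UMAP):
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
features_50 = PCA(n_components=50).fit_transform(features)  # coarse reduction first
features_2d = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(features_50)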
Proper normalization and color handling are critical for model performance
Smart augmentation can dramatically improve generalization
Understanding classical features helps interpret deep learning
Remember: Always use different transforms for training vs validation!
Preparation: Review the augmentation pipeline code and ensure your Colab environment has GPU enabled.
| Type | Resource |
|---|---|
| Library | Albumentations Documentation |
| Tutorial | OpenCV-Python Tutorials |
| Paper | CutMix: Training with Localizable Features |
| Paper | AutoAugment: Learning Augmentation Policies |
| Tool | UMAP Documentation |
| Classic Paper | SIFT: Distinctive Image Features |
Start the preprocessing and augmentation exercises in Colab
Try different augmentation combinations on your dataset
Use t-SNE/UMAP to explore your feature space