Building CV applications with cloud APIs from Google, AWS, Azure, and Anthropic
Today's goals (3 hours)
Effectively leverage cloud-based CV services
Understand offerings from major providers
Create functional CV-powered tools
Our journey today
Benefits over building from scratch
Pre-trained models, production-ready from day one
No need to hire data scientists or ML engineers
Minutes to integrate, instant scaling
Focus on business logic, not infrastructure
Pay per use, no infrastructure costs
Start small, scale automatically
SLAs, security certifications
Compliance and support included
The request-response cycle
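Every provider follows the same basic loop: encode the image, POST it over HTTPS, parse the JSON that comes back. A minimal sketch against a hypothetical endpoint (API_URL and the payload shape are illustrative; real providers differ):

import base64
import requests

API_URL = "https://vision.example.com/v1/analyze"  # hypothetical endpoint
API_KEY = "YOUR_API_KEY"

# 1. Request: encode the image and send it
with open("receipt.jpg", "rb") as f:
    payload = {"image": base64.b64encode(f.read()).decode("utf-8")}
response = requests.post(
    API_URL,
    json=payload,
    headers={"Authorization": f"Bearer {API_KEY}"},
    timeout=30,
)

# 2. Response: structured results come back as JSON
result = response.json()
print(result)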
Major players in the ecosystem
Traditional CV APIs: specialized, predefined tasks with structured outputs; purpose-built for specific use cases
Multimodal LLMs: open-ended reasoning, natural language interaction
Google Cloud Vision: comprehensive traditional CV API
Pricing: ~$1.50 per 1,000 images
Amazon Rekognition: AWS's computer vision service
Pricing: tier-based, with volume discounts
Azure Computer Vision: Microsoft's vision intelligence
Pricing: competitive, with a good free tier
The new generation: Claude, GPT-4V, Gemini
Side-by-side comparison
| Provider | Type | Best Features | Pricing Model |
|---|---|---|---|
| Google Cloud Vision | Traditional | OCR, labels, web entities | Per image |
| Amazon Rekognition | Traditional | Face, video, custom labels | Tiered |
| Azure CV | Traditional | Captioning, spatial, reading | Per transaction |
| Claude/GPT-4V/Gemini | Multimodal LLM | Open-ended reasoning | Per token |
Match your needs to the right API
Text and document OCR
Recommended: Google Cloud Vision, Azure Read API
Best for receipts, invoices, forms, ID cards
Face detection and analysis
Recommended: Amazon Rekognition
Best for security, user verification, demographics
Open-ended visual reasoning
Recommended: Claude Vision, GPT-4 Vision
Best for complex reasoning, custom tasks, products
Video analysis
Recommended: Amazon Rekognition Video
Best for surveillance, live streams, media analysis
Real-world applications matched to APIs
| Use Case | Recommended Service |
|---|---|
| Receipt/Invoice OCR | Google Document AI, Azure Form Recognizer |
| Face verification | Amazon Rekognition Face |
| Product analysis | Claude Vision, GPT-4 Vision |
| Content moderation | Google SafeSearch, Amazon Moderation |
| Retail foot traffic | Azure Spatial Analysis |
| Complex reasoning | Claude Vision, Gemini Pro Vision |
What we'll build today
Receipt Processing → Extract text → Parse structure
Product Cataloging → Analyze attributes → Generate metadata
Retail Analysis → Understand context → Business insights
ExpenseTracker → Combine skills → Production code
Environment preparation
# Install required packages
# pip install anthropic pillow requests
import os
from pathlib import Path
import base64
# Create working directory
WORK_DIR = Path("cv_workshop")
WORK_DIR.mkdir(exist_ok=True)
# Sample images for testing
SAMPLE_IMAGES = [
    "receipt.jpg",     # For OCR
    "products.jpg",    # For object detection
    "storefront.jpg",  # For scene analysis
    "document.pdf"     # For document processing
]
Converting images to Base64 for APIs
def encode_image(image_path: str) -> str:
    """Encode image to base64 string for API calls."""
    with open(image_path, "rb") as image_file:
        return base64.standard_b64encode(
            image_file.read()
        ).decode("utf-8")

# Usage
image_data = encode_image("receipt.jpg")
# Returns: "/9j/4AAQSkZJRg..." (base64 string)
Extract structured data from receipts
import anthropic
from pathlib import Path
def extract_receipt_data(image_path: str) -> str:
    """Extract receipt data; returns the model's JSON string."""
    client = anthropic.Anthropic()
    # Determine media type from extension
    ext = Path(image_path).suffix.lower()
    media_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp'
    }
    media_type = media_types.get(ext, 'image/jpeg')
    # Encode image to base64
    image_data = encode_image(image_path)
    # Call Claude Vision API
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_data
                    }
                },
                {
                    "type": "text",
                    "text": "..."  # Prompt on next slide
                }
            ]
        }]
    )
    return response.content[0].text
Structured prompts get structured results
# The prompt that extracts structured data
prompt = """
Analyze this receipt and extract as JSON:
{
"vendor_name": "",
"date": "",
"items": [
{"name": "", "price": 0.00}
],
"subtotal": 0.00,
"tax": 0.00,
"total": 0.00
}
Return only valid JSON, no additional text.
If any field is unclear, use null.
"""
What Claude returns from a restaurant receipt
{
"vendor_name": "Café de Flore",
"date": "2024-03-15",
"items": [
{"name": "Espresso", "price": 4.50},
{"name": "Croissant", "price": 3.20},
{"name": "Orange Juice", "price": 5.00}
],
"subtotal": 12.70,
"tax": 1.27,
"total": 13.97
}
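One practical note: despite the "JSON only" instruction, models occasionally wrap the reply in a markdown code fence. A small illustrative helper (parse_json_response is our own, not part of any SDK) keeps parsing robust:

import json

def parse_json_response(text: str) -> dict:
    """Parse model output into a dict, tolerating markdown fences."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Keep only the content between the fences
        cleaned = cleaned.split("```")[1]
        if cleaned.startswith("json"):
            cleaned = cleaned[len("json"):]
    return json.loads(cleaned)

# Usage
receipt = parse_json_response(extract_receipt_data("receipt.jpg"))
print(receipt["total"])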
Automate e-commerce product metadata
Automatic product categorization
Attribute extraction: colors, materials, sizes
Compelling marketing descriptions
Search keyword optimization
def analyze_product(image_path: str) -> str:
    """Analyze a product image for e-commerce cataloging."""
    client = anthropic.Anthropic()
    image_data = encode_image(image_path)
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1500,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": image_data
                }},
                {"type": "text", "text": """..."""}  # Next slide
            ]
        }]
    )
    return response.content[0].text
prompt = """
Analyze this product for e-commerce listing.
Return JSON with:
{
"category": "Main category",
"subcategory": "Specific subcategory",
"attributes": {
"colors": [],
"materials": [],
"size_estimate": ""
},
"marketing_title": "Max 10 words, compelling",
"description": "2-3 sentences, benefit-focused",
"search_keywords": ["10", "relevant", "keywords"]
}
Be accurate about visible attributes.
Don't guess what you can't see clearly.
"""
{
"category": "Fashion",
"subcategory": "Women's Handbags",
"attributes": {
"colors": ["cognac brown", "gold accents"],
"materials": ["leather", "metal hardware"],
"size_estimate": "medium, ~30cm width"
},
"marketing_title": "Elegant Cognac Leather Tote with Gold Hardware",
"description": "A sophisticated everyday tote crafted from
premium cognac leather. Features secure zip closure and
elegant gold-tone hardware for timeless style.",
"search_keywords": ["leather tote", "brown handbag",
"cognac bag", "gold hardware", "women's purse", ...]
}
Process entire catalogs efficiently
def process_product_catalog(image_folder: str) -> list:
    """Process all product images in a folder."""
    results = []
    image_extensions = {'.jpg', '.jpeg', '.png', '.webp'}
    for image_path in Path(image_folder).iterdir():
        if image_path.suffix.lower() in image_extensions:
            print(f"Processing: {image_path.name}")
            analysis = analyze_product(str(image_path))
            results.append({
                "filename": image_path.name,
                "analysis": analysis
            })
    return results
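To use the results downstream, persist them to disk; a minimal sketch (product_images and catalog.json are assumed names):

import json
from pathlib import Path

results = process_product_catalog("product_images")
Path("catalog.json").write_text(
    json.dumps(results, indent=2, ensure_ascii=False)
)
print(f"Processed {len(results)} products")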
Analyze retail environments for business insights
Scene: store type, size, layout, ambiance
Customers: count, demographics, behaviors
Merchandising: displays, signage, product visibility
Operations: cleanliness, safety, staff, queues
def analyze_retail_scene(image_path: str) -> str:
    """Analyze a retail environment for insights."""
    client = anthropic.Anthropic()
    image_data = encode_image(image_path)
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2000,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": image_data
                }},
                {"type": "text", "text": """..."""}
            ]
        }]
    )
    return response.content[0].text
prompt = """
Analyze this retail environment comprehensively:
1. SCENE: Store type, estimated size, time of day,
overall ambiance
2. CUSTOMERS: Approximate count, visible demographics,
current activities and behaviors
3. MERCHANDISING: Display quality, signage effectiveness,
product visibility, promotional materials
4. OPERATIONS: Cleanliness score (1-10), safety hazards,
staff visibility, queue management
5. RECOMMENDATIONS: List 3 specific, actionable
improvements with expected business impact
Be specific, quantitative where possible, and
business-focused in your analysis.
"""
How retail scene analysis drives value
ExpenseTracker: CV-powered expense management
Class-based design for maintainability
Properties: expenses: List, client: Anthropic
Public: add_receipt(), get_summary()
Private: _extract_receipt(), _categorize()
Expense record: id, timestamp, image_path, category, extracted data dict
import json
from datetime import datetime
from typing import Optional

class ExpenseTracker:
    """Expense tracking using computer vision."""

    def __init__(self):
        self.expenses = []
        self.client = anthropic.Anthropic()

    def add_receipt(self, image_path: str,
                    category: Optional[str] = None) -> dict:
        """Process a receipt and add to expenses."""
        # Extract data using CV
        extracted = self._extract_receipt(image_path)
        # Parse JSON response
        try:
            receipt_data = json.loads(extracted)
        except json.JSONDecodeError:
            receipt_data = {"raw": extracted, "error": True}
        # Create expense record
        expense = {
            "id": len(self.expenses) + 1,
            "timestamp": datetime.now().isoformat(),
            "image_path": image_path,
            "category": category or self._categorize(receipt_data),
            "data": receipt_data
        }
        self.expenses.append(expense)
        return expense

    def _extract_receipt(self, image_path: str) -> str:
        """Use CV to extract receipt data."""
        # Reuse our extract_receipt_data function
        return extract_receipt_data(image_path)
Keyword-based category assignment
    def _categorize(self, data: dict) -> str:
        """Auto-categorize based on vendor name."""
        vendor = data.get("vendor_name", "").lower()
        categories = {
            "restaurant": ["restaurant", "cafe", "pizza",
                           "burger", "sushi", "bistro"],
            "grocery": ["supermarket", "grocery", "carrefour",
                        "auchan", "monoprix"],
            "transport": ["uber", "taxi", "sncf", "ratp"],
            "office": ["staples", "office", "amazon"],
        }
        for cat, keywords in categories.items():
            if any(kw in vendor for kw in keywords):
                return cat
        return "other"
    def get_summary(self) -> dict:
        """Generate expense summary by category."""
        by_category = {}
        for expense in self.expenses:
            cat = expense["category"]
            total = expense["data"].get("total", 0) or 0
            if cat not in by_category:
                by_category[cat] = {"count": 0, "total": 0}
            by_category[cat]["count"] += 1
            by_category[cat]["total"] += float(total)
        return {
            "total_expenses": len(self.expenses),
            "by_category": by_category,
            "grand_total": sum(
                c["total"] for c in by_category.values()
            )
        }
# Initialize tracker
tracker = ExpenseTracker()
# Add receipts
tracker.add_receipt("lunch_receipt.jpg")
tracker.add_receipt("office_supplies.jpg")
tracker.add_receipt("taxi_receipt.jpg")
# Get summary
summary = tracker.get_summary()
print(f"Total expenses: {summary['total_expenses']}")
print(f"Grand total: €{summary['grand_total']:.2f}")
print("By category:")
for cat, stats in summary["by_category"].items():
    print(f"  {cat}: {stats['count']} expense(s), €{stats['total']:.2f}")

# Output:
# Total expenses: 3
# Grand total: €127.45
# By category:
#   restaurant: 1 expense(s), €23.50
#   office: 1 expense(s), €89.95
#   transport: 1 expense(s), €14.00
Measuring and comparing API performance
Latency: response time per image
Accuracy: correctness of results
Cost: total cost at expected volume
import time
from typing import Callable
def benchmark_api(
    api_function: Callable,
    test_images: list,
    runs: int = 3
) -> dict:
    """Benchmark an API across multiple images."""
    results = {
        "timings": [], "successes": 0, "failures": 0
    }
    for image in test_images:
        for _ in range(runs):
            start = time.time()
            try:
                api_function(image)
                results["successes"] += 1
            except Exception:
                results["failures"] += 1
            results["timings"].append(time.time() - start)
    results["avg_time"] = sum(results["timings"]) / len(results["timings"])
    return results
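Example usage (filenames are placeholders):

stats = benchmark_api(extract_receipt_data, ["receipt.jpg", "invoice.jpg"])
print(f"Avg: {stats['avg_time']:.2f}s | "
      f"{stats['successes']} ok, {stats['failures']} failed")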
Reducing API costs at scale
Compress: reduce file size
Crop: focus on relevant areas
Deduplicate: skip duplicate calls
Cache: store results for future use
Practical strategies for production
Resize to optimal dimensions - many APIs charge by size
Can reduce costs by 50-90% with minimal quality loss
Hash images, store results - avoid duplicate processing
Especially valuable for user-uploaded duplicate content
Use batch endpoints for volume discounts when available
Process during off-peak hours for lower rates
Use cheap APIs for filtering, premium for analysis (see the sketch after the cost table)
| Approach | Cost per 1,000 images |
|---|---|
| All images through Tier 2 (premium) | $10.00 |
| Tiered processing (only 20% pass to Tier 2) | $3.00 |

Assuming a $1.00/1,000 Tier 1 filter: $1.00 + 20% × $10.00 = $3.00.
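A sketch of that tiered pattern, assuming a hypothetical cheap quick_filter() (e.g. a traditional label-detection call) in front of the premium analyze_product() from earlier:

def process_tiered(image_paths: list) -> list:
    """Tier 1 filter first; premium Tier 2 only for images that pass."""
    results = []
    for path in image_paths:
        # Tier 1: quick_filter() is a hypothetical cheap classifier
        # (e.g. label detection) returning True for relevant images
        if not quick_filter(path):
            continue  # ~80% of images stop here, at Tier 1 cost only
        # Tier 2: premium multimodal analysis for the ~20% that pass
        results.append(analyze_product(path))
    return results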
from PIL import Image
import io
def optimize_image(image_path: str,
                   max_size: int = 1024,
                   quality: int = 85) -> bytes:
    """Optimize image size before API call."""
    with Image.open(image_path) as img:
        # Resize if too large
        if max(img.size) > max_size:
            ratio = max_size / max(img.size)
            new_size = tuple(int(d * ratio) for d in img.size)
            img = img.resize(new_size, Image.LANCZOS)
        # Convert to RGB if needed
        if img.mode in ('RGBA', 'P'):
            img = img.convert('RGB')
        # Save to bytes with compression
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG', quality=quality)
        return buffer.getvalue()

# Can reduce file size by 70-90% while maintaining quality
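The returned bytes can be base64-encoded directly, with no temporary file:

import base64

optimized = optimize_image("products.jpg")
image_data = base64.standard_b64encode(optimized).decode("utf-8")
# image_data drops into the API payload exactly as before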
import hashlib
import json
from pathlib import Path
CACHE_DIR = Path("api_cache")
CACHE_DIR.mkdir(exist_ok=True)
def get_cached_or_call(image_path: str, api_func) -> dict:
    """Check cache before calling API."""
    # Generate hash of image content
    with open(image_path, "rb") as f:
        image_hash = hashlib.md5(f.read()).hexdigest()
    cache_file = CACHE_DIR / f"{image_hash}.json"
    # Return cached if exists
    if cache_file.exists():
        return json.loads(cache_file.read_text())
    # Call API and cache result
    result = api_func(image_path)
    cache_file.write_text(json.dumps(result))
    return result
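Usage is a drop-in wrapper around any of our earlier functions:

# First call hits the API; repeat calls for the same image hit the cache
data = get_cached_or_call("receipt.jpg", extract_receipt_data)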
Production-ready API calls
import time
from typing import Optional
def call_api_with_retry(
    func,
    max_retries: int = 3,
    base_delay: float = 1.0
) -> Optional[dict]:
    """Call API with exponential backoff retry."""
    for attempt in range(max_retries):
        try:
            return func()
        except anthropic.RateLimitError:
            delay = base_delay * (2 ** attempt)
            print(f"Rate limited. Waiting {delay}s...")
            time.sleep(delay)
        except anthropic.APIError as e:
            print(f"API error: {e}")
            if attempt == max_retries - 1:
                raise
    return None
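Wrap any call in a zero-argument lambda to reuse the helper:

result = call_api_with_retry(
    lambda: extract_receipt_data("receipt.jpg")
)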
Protecting your API integration
Network: HTTPS only, VPC endpoints, IP allowlisting
Monitoring: usage alerts, anomaly detection, audit logging
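And never hardcode keys in source: the Anthropic SDK, for example, reads ANTHROPIC_API_KEY from the environment by default. A minimal sketch:

import os
import anthropic

# Fail fast if the key isn't configured
if "ANTHROPIC_API_KEY" not in os.environ:
    raise RuntimeError("Set ANTHROPIC_API_KEY before running")

# The client picks the key up from the environment automatically
client = anthropic.Anthropic()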
Duration: 1.5 hours | Deadline: Before Session 4
Deliverable: GitHub repository or zip file with code and documentation
Inspiration for your CV application
Business card scanner: contact extraction, CRM integration
Menu analyzer: price extraction, allergen detection
Plant identifier: species recognition, disease detection
Fashion assistant: style classification, similar-item search
Key takeaways from today
No ML expertise needed, production-ready from day one, multiple providers
Traditional vs Multimodal, match use case to API, consider cost structure
Structured prompts, error handling, caching & optimization
Start simple, iterate quickly, document everything
Custom Models & Transfer Learning
Decision criteria for custom models
Leverage pre-trained models
Hands-on model training
No-code alternatives