Building with Cloud Vision APIs
Hands-on integration with Google Cloud Vision and Claude Vision
Objectives
By the end of this practical work, you will be able to:
- Set up and authenticate with cloud vision API providers
- Extract text from documents using OCR
- Analyze images for objects and scenes
- Use Claude Vision for complex visual reasoning
- Compare results and choose the right API for different tasks
Prerequisites
- Python 3.8+ installed
- Google Cloud account with Vision API enabled (free tier available)
- Anthropic API key for Claude (free credits available)
- Sample images for testing (receipts, products, scenes)
Install required packages:
pip install google-cloud-vision anthropic pillow python-dotenv
API Keys: Never commit API keys to version control. Use environment variables or a .env file.
Instructions
Step 1: Set Up Your Environment
Create a project structure and configure API credentials:
# Create project directory
mkdir cv-api-lab
cd cv-api-lab
# Create virtual environment
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Create .env file for credentials
touch .env
Add your credentials to .env:
# .env
GOOGLE_APPLICATION_CREDENTIALS=path/to/your/service-account.json
ANTHROPIC_API_KEY=sk-ant-your-key-here
Create a utility file to load credentials:
# utils.py
import os

from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment
# as soon as this module is imported.
load_dotenv()


def get_anthropic_key():
    """Return the Anthropic API key from the environment, or None if unset."""
    return os.environ.get("ANTHROPIC_API_KEY")
Step 2: Extract Text with Google Cloud Vision OCR
Create a script to extract text from receipts or documents:
# ocr_google.py
from google.cloud import vision
import io
def extract_text(image_path: str) -> dict:
    """Run Google Cloud Vision text detection on a local image file.

    Returns a dict with:
      - "full_text":  the whole-image transcription ("" when nothing found)
      - "words":      one entry per detected word with its bounding box
      - "confidence": placeholder (always None with this detection mode)

    Raises Exception when the Vision API reports an error.
    """
    client = vision.ImageAnnotatorClient()

    with open(image_path, "rb") as fh:
        payload = fh.read()

    response = client.text_detection(image=vision.Image(content=payload))
    annotations = response.text_annotations
    if response.error.message:
        raise Exception(f"API Error: {response.error.message}")

    # annotations[0] is the full-image text; the rest are individual words.
    words = [
        {
            "text": ann.description,
            "bounds": [(v.x, v.y) for v in ann.bounding_poly.vertices],
        }
        for ann in annotations[1:]
    ]
    return {
        "full_text": annotations[0].description if annotations else "",
        "words": words,
        "confidence": None,
    }
# Test it
if __name__ == "__main__":
    ocr_result = extract_text("receipt.jpg")
    print("Extracted Text:")
    print(ocr_result["full_text"])
    word_total = len(ocr_result["words"])
    print(f"\nFound {word_total} words")
Step 3: Analyze Images with Claude Vision
Use Claude for more complex visual understanding:
# analyze_claude.py
import anthropic
import base64
from pathlib import Path
def encode_image(image_path: str) -> tuple[str, str]:
    """Base64-encode an image file and infer its MIME type from the extension.

    Unrecognized extensions fall back to "image/jpeg".
    Returns (base64_string, media_type).
    """
    extension_to_mime = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    source = Path(image_path)
    mime = extension_to_mime.get(source.suffix.lower(), "image/jpeg")
    encoded = base64.standard_b64encode(source.read_bytes()).decode("utf-8")
    return encoded, mime
def analyze_image(image_path: str, prompt: str) -> str:
    """Send an image plus a text prompt to Claude and return the reply text."""
    image_data, media_type = encode_image(image_path)

    # Image first, then the question about it.
    content_blocks = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": media_type,
                "data": image_data,
            },
        },
        {"type": "text", "text": prompt},
    ]

    client = anthropic.Anthropic()
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[{"role": "user", "content": content_blocks}],
    )
    return response.content[0].text
# Test it
if __name__ == "__main__":
    description_prompt = (
        "Describe this product in detail. Include: category, brand if visible, "
        "key features, estimated price range, and suggested uses."
    )
    print(analyze_image("product.jpg", description_prompt))
Step 4: Build a Receipt Parser
Combine OCR with intelligent parsing:
# receipt_parser.py
import json
from analyze_claude import analyze_image
def parse_receipt(image_path: str) -> dict:
    """Parse a receipt image into structured purchase data via Claude Vision.

    Returns the parsed JSON dict, or a dict with "raw_response" and "error"
    keys when the model's reply is not valid JSON.
    """
    prompt = """Analyze this receipt image and extract the following information
in JSON format:
{
"store_name": "Name of the store",
"date": "Date of purchase (YYYY-MM-DD format)",
"items": [
{"name": "Item name", "quantity": 1, "price": 0.00}
],
"subtotal": 0.00,
"tax": 0.00,
"total": 0.00,
"payment_method": "cash/card/other"
}
If any field cannot be determined, use null.
Return ONLY the JSON, no additional text."""
    response = analyze_image(image_path, prompt)

    # Models sometimes wrap JSON in a ```json ... ``` fence; unwrap it.
    payload = response.strip()
    if payload.startswith("```"):
        payload = payload.split("```")[1]
        if payload.startswith("json"):
            payload = payload[4:]

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return {"raw_response": response, "error": "Could not parse JSON"}
if __name__ == "__main__":
    # Parse a sample receipt and pretty-print the structured result.
    print(json.dumps(parse_receipt("receipt.jpg"), indent=2))
Step 5: Build a Product Analyzer
Create a tool to analyze product images for e-commerce:
# product_analyzer.py
from analyze_claude import analyze_image
import json
def analyze_product(image_path: str) -> dict:
    """Analyze a product photo and return e-commerce catalog fields.

    Returns the parsed JSON dict, or {"raw_response": ...} when the model's
    reply is not valid JSON.
    """
    prompt = """Analyze this product image for an e-commerce catalog.
Provide the following in JSON format:
{
"category": "Main product category",
"subcategory": "Specific subcategory",
"title": "Suggested product title (max 80 chars)",
"description": "Product description (2-3 sentences)",
"key_features": ["Feature 1", "Feature 2", "Feature 3"],
"colors": ["Primary color", "Secondary color if any"],
"materials": ["Material if identifiable"],
"condition": "new/used/refurbished",
"suggested_tags": ["tag1", "tag2", "tag3"],
"quality_issues": ["Any visible defects or concerns"]
}
Return ONLY valid JSON."""
    reply = analyze_image(image_path, prompt)

    # Unwrap a markdown ```json fence if the model added one.
    cleaned = reply.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.split("```")[1]
        if cleaned.startswith("json"):
            cleaned = cleaned[4:]

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return {"raw_response": reply}
if __name__ == "__main__":
    # Analyze a sample product image and pretty-print the catalog data.
    print(json.dumps(analyze_product("product.jpg"), indent=2))
Step 6: Compare API Results
Create a comparison script to evaluate different APIs:
# compare_apis.py
import time
from ocr_google import extract_text
from analyze_claude import analyze_image
def compare_ocr(image_path: str) -> dict:
    """Run the same image through both OCR backends and time each one.

    Returns {"google": {...}, "claude": {...}} where each value holds
    "text", "word_count", and "time_ms" on success, or {"error": str}
    when that backend raised.
    """
    results = {}

    # --- Google Cloud Vision ---
    started = time.time()
    try:
        ocr = extract_text(image_path)
        results["google"] = {
            "text": ocr["full_text"],
            "word_count": len(ocr["words"]),
            "time_ms": (time.time() - started) * 1000,
        }
    except Exception as exc:
        results["google"] = {"error": str(exc)}

    # --- Claude Vision ---
    started = time.time()
    try:
        transcript = analyze_image(
            image_path,
            "Extract ALL text from this image. Return only the text, "
            "preserving the original layout as much as possible."
        )
        results["claude"] = {
            "text": transcript,
            "word_count": len(transcript.split()),
            "time_ms": (time.time() - started) * 1000,
        }
    except Exception as exc:
        results["claude"] = {"error": str(exc)}

    return results
if __name__ == "__main__":
    comparison = compare_ocr("receipt.jpg")
    # Print each provider's stats (or its error) under a header.
    for header, key in (
        ("=== Google Cloud Vision ===", "google"),
        ("\n=== Claude Vision ===", "claude"),
    ):
        print(header)
        stats = comparison[key]
        if "error" in stats:
            print(f"Error: {stats['error']}")
        else:
            print(f"Time: {stats['time_ms']:.0f}ms")
            print(f"Words: {stats['word_count']}")
            print(stats["text"][:500])
Step 7: Build a Mini Application
Combine everything into a simple command-line application:
# cv_app.py
import argparse
import json
from receipt_parser import parse_receipt
from product_analyzer import analyze_product
from compare_apis import compare_ocr
def main():
    """Command-line entry point: route a subcommand to its analyzer."""
    parser = argparse.ArgumentParser(description="CV API Tools")
    parser.add_argument("command", choices=["receipt", "product", "compare"])
    parser.add_argument("image", help="Path to image file")
    parser.add_argument("--output", "-o", help="Output file (JSON)")
    args = parser.parse_args()

    # argparse's `choices` guarantees the key is present.
    handlers = {
        "receipt": parse_receipt,
        "product": analyze_product,
        "compare": compare_ocr,
    }
    result = handlers[args.command](args.image)

    output = json.dumps(result, indent=2)
    print(output)
    if args.output:
        with open(args.output, "w") as f:
            f.write(output)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()
Usage:
# Parse a receipt
python cv_app.py receipt receipt.jpg -o receipt_data.json
# Analyze a product
python cv_app.py product product.jpg -o product_data.json
# Compare APIs
python cv_app.py compare document.jpg
Expected Output
After completing this practical work, you should have:
- A working project with API integrations
- Receipt parser that extracts structured data
- Product analyzer for e-commerce cataloging
- Comparison results showing API performance differences
- A command-line tool combining all functionality
Deliverables
- Complete project folder with all Python files
- Sample outputs for at least 3 different images
- Brief report (1 page) comparing the APIs: when to use each, pros/cons
Bonus Challenges
- Challenge 1: Add AWS Rekognition as a third comparison option
- Challenge 2: Build a simple web interface using Streamlit or Gradio
- Challenge 3: Add error handling and retry logic for API failures
- Challenge 4: Implement caching to avoid re-processing the same images