"""
Oculus Object Detection Demo

Demonstrates object detection, captioning, and VQA on images using the
extended training model.
"""
| |
|
| | import sys |
| | import requests |
| | from io import BytesIO |
| | from PIL import Image, ImageDraw, ImageFont |
| | import torch |
| | import numpy as np |
| |
|
| | |
| | from pathlib import Path |
| | sys.path.insert(0, str(Path(__file__).parent)) |
| |
|
| | from oculus_unified_model import OculusForConditionalGeneration |
| |
|
def visualize_results(image, output, filename="output_car_parts.png"):
    """Draw detection bounding boxes and class labels on *image* and save it.

    Args:
        image: PIL Image to annotate (modified in place).
        output: detection result with parallel sequences ``boxes``
            (normalized ``[x1, y1, x2, y2]`` in 0..1), ``labels``
            (class indices, presumably COCO — TODO confirm against the
            training pipeline), and ``confidences``.
        filename: path the annotated image is written to.
    """
    draw = ImageDraw.Draw(image)

    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
    except OSError:
        # Helvetica path is macOS-specific; fall back elsewhere.
        font = ImageFont.load_default()

    width, height = image.size

    COCO_CLASSES = [
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
        'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
        'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
        'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
        'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
        'toothbrush'
    ]

    for box, label, conf in zip(output.boxes, output.labels, output.confidences):
        x1, y1, x2, y2 = box

        # Clamp normalized coordinates into [0, 1].
        x1 = max(0.0, min(1.0, x1))
        y1 = max(0.0, min(1.0, y1))
        x2 = max(0.0, min(1.0, x2))
        y2 = max(0.0, min(1.0, y2))

        # Skip degenerate (zero- or negative-area) boxes after clamping.
        if x2 <= x1 or y2 <= y1:
            continue

        # Scale normalized coordinates to pixel space.
        x1 *= width
        y1 *= height
        x2 *= width
        y2 *= height

        # Red = low confidence, green = high confidence.
        color = "red" if conf < 0.5 else "green"

        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        # Resolve the class name; fall back to the raw label for
        # out-of-range ids.  (A plain index would let negative labels
        # silently alias end-of-list classes, and a bare except would
        # hide real errors.)
        try:
            idx = int(label)
        except (TypeError, ValueError):
            idx = -1
        if 0 <= idx < len(COCO_CLASSES):
            class_name = COCO_CLASSES[idx]
        else:
            class_name = str(label)

        label_text = f"{class_name} ({conf:.2f})"

        # Filled background behind the text so it stays legible.
        text_bbox = draw.textbbox((x1, y1), label_text, font=font)
        draw.rectangle(text_bbox, fill=color)
        draw.text((x1, y1), label_text, fill="white", font=font)

    image.save(filename)
    # BUG FIX: original printed the literal "(unknown)" instead of the
    # actual output path.
    print(f"Saved visualization to {filename}")
| |
|
def _resolve_model_path():
    """Pick the newest V2 ``step_<N>`` checkpoint, else V2 ``final``,
    else fall back to the V1 model directory.

    Returns:
        str: path to the model directory to load (may not exist if the
        V1 fallback is also missing; ``main`` reports that as a load error).
    """
    checkpoint_dir = Path("checkpoints/oculus_detection_v2")
    model_path = None

    if checkpoint_dir.exists():
        steps = []
        for d in checkpoint_dir.iterdir():
            if d.is_dir() and d.name.startswith("step_"):
                try:
                    steps.append((int(d.name.split("_")[1]), d))
                except (ValueError, IndexError):
                    # Ignore directories that don't follow "step_<N>".
                    pass

        if steps:
            steps.sort(key=lambda x: x[0], reverse=True)
            model_path = str(steps[0][1])
            print(f"✨ Found latest checkpoint: {model_path}")

    if model_path is None:
        model_path = str(checkpoint_dir / "final")

    # Applies to both the step checkpoint and the "final" default: if the
    # chosen V2 path is missing, fall back to V1.
    if not Path(model_path).exists():
        model_path = "checkpoints/oculus_detection/final"
        print(f"⚠️ Extended V2 model not found, falling back to V1: {model_path}")

    return model_path


def _load_image(image_path):
    """Open *image_path* as RGB, or download a sample image if it is missing.

    Raises:
        requests.RequestException: if the sample download fails.
    """
    if Path(image_path).exists():
        return Image.open(image_path).convert('RGB')

    url = "https://upload.wikimedia.org/wikipedia/commons/thumb/8/8d/President_Barack_Obama.jpg/800px-President_Barack_Obama.jpg"
    print(f"Image not found, downloading sample {url}...")
    # Timeout + status check: the original could hang forever and would
    # feed an HTTP error page into Image.open on a failed download.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert('RGB')


def main():
    """CLI entry point: parse args, load the model, run the chosen mode."""
    import argparse
    parser = argparse.ArgumentParser(description="Oculus General Object Detection Demo")
    parser.add_argument("--image", type=str, help="Path to image file to test")
    parser.add_argument("--prompt", type=str, default="Detect objects", help="Text prompt for the model")
    parser.add_argument("--mode", type=str, default="box", choices=["box", "vqa", "caption"], help="Inference mode")
    parser.add_argument("--threshold", type=float, default=0.2, help="Detection threshold")
    parser.add_argument("--output", type=str, default="detection_result.png", help="Output filename")
    args = parser.parse_args()

    model_path = _resolve_model_path()

    print(f"Loading model from {model_path}...")
    try:
        model = OculusForConditionalGeneration.from_pretrained(model_path)

        # Detection heads are stored separately from the base weights.
        heads_path = Path(model_path) / "heads.pth"
        if heads_path.exists():
            heads = torch.load(heads_path, map_location="cpu")
            model.detection_head.load_state_dict(heads['detection'])
            print("✓ Loaded detection heads")
    except Exception as e:
        # Top-level boundary: report and exit rather than traceback.
        print(f"Error loading model: {e}")
        return

    if args.image:
        image_path = args.image
        print(f"\nProcessing Custom Image: {image_path}...")
    else:
        image_path = "data/coco/images/000000071345.jpg"
        print(f"\nProcessing Default Image: {image_path}...")

    try:
        image = _load_image(image_path)

        if args.mode == "box":
            print(f"Running detection with prompt: '{args.prompt}'...")
            output = model.generate(
                image,
                mode="box",
                prompt=args.prompt,
                threshold=args.threshold
            )
            print(f"Found {len(output.boxes)} objects")
            visualize_results(image, output, args.output)

        elif args.mode == "caption":
            print("Generating caption...")
            output = model.generate(image, mode="text", prompt="A photo of")
            print(f"\n📝 Caption: {output.text}\n")

        elif args.mode == "vqa":
            # Default detection prompt isn't a question; substitute one.
            question = args.prompt if args.prompt != "Detect objects" else "What is in this image?"
            print(f"Thinking about question: '{question}'...")
            output = model.generate(image, mode="text", prompt=question)
            print(f"\n🤔 Answer: {output.text}\n")

    except Exception as e:
        # Top-level boundary: report and exit rather than traceback.
        print(f"Error processing image: {e}")
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|