from pathlib import Path
import modal
from encord.objects.coordinates import (
PointCoordinate,
)
from encord.objects.ontology_labels_impl import LabelRowV2
from encord.objects.ontology_object_instance import ObjectInstance
from fastapi import Depends
from typing_extensions import Annotated
from encord_agents.fastapi.dependencies import (
FrameData,
dep_asset,
dep_label_row,
dep_objects,
)
# 1. Define the Modal image.
# This specifies the base environment, installs system dependencies,
# downloads the CoTracker3 model weights, and installs Python packages.
image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("libgl1", "libglib2.0-0", "wget")
.run_commands(
"wget https://huggingface.co/facebook/cotracker3/resolve/main/scaled_offline.pth",
)
.pip_install(
"fastapi[standard]",
"encord-agents",
"torch",
"torchvision",
"tqdm",
"imageio[ffmpeg]",
)
.add_local_python_source("cotracker") # Assuming 'cotracker' is a local directory with CoTracker source
)
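# NOTE: the 'cotracker' package is not published on PyPI. One way to obtain it
# (an assumption, adapt to your setup) is to clone the reference implementation
# so that its 'cotracker' directory sits next to this file:
#   git clone https://github.com/facebookresearch/co-tracker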
# 2. Define the Modal app.
# This creates the Modal application instance, linking it to the defined image.
app = modal.App(name="encord-agents-cotracker-3-with-model", image=image)
# Helper function to read video frames from a given path using imageio.
def read_video_from_path(path):
    import imageio
    import numpy as np

    try:
        reader = imageio.get_reader(path)
    except Exception as e:
        print("Error opening video file:", e)
        return None
    # Stack all frames into one array along a new leading (time) axis.
    frames = [np.array(im) for im in reader]
    return np.stack(frames)
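# Usage sketch (hypothetical local file):
#   frames = read_video_from_path("example.mp4")  # (num_frames, H, W, 3) array, or None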
# 3. Define the endpoint and CoTracker3 usage.
# This is the main function that runs on Modal, exposed as a web endpoint.
@app.function(
secrets=[modal.Secret.from_name("encord-ssh-key")], # Accesses a Modal secret for Encord authentication
gpu="L4" # Specifies that the function requires a GPU (L4 type)
)
@modal.web_endpoint(method="POST")
def cotracker3(
    frame_data: FrameData,  # Frame metadata posted by the Encord label editor
lr: Annotated[LabelRowV2, Depends(dep_label_row)], # Injected label row object
object_instances: Annotated[list[ObjectInstance], Depends(dep_objects)], # Injected list of selected objects
asset: Annotated[Path, Depends(dep_asset)], # Injected path to the local video asset
):
    import torch
    from cotracker.predictor import CoTrackerPredictor

    # Initialize the CoTracker3 model from the checkpoint baked into the image
    # (the wget above saved it to /scaled_offline.pth), moving it to the GPU if available
    model = CoTrackerPredictor(checkpoint="/scaled_offline.pth")
    if torch.cuda.is_available():
        model = model.cuda()
    # This agent expects exactly one selected object instance to track
    assert len(object_instances) == 1, "Expected exactly one selected object instance"
    obj_inst = object_instances[0]

    # Read the video from the asset path; fail loudly if it cannot be decoded
    video = read_video_from_path(asset)
    assert video is not None, f"Failed to read video from {asset}"
    # Convert to a float tensor of shape (1, num_frames, channels, height, width)
    video_tensor = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float()
# Move video tensor to GPU if available
if torch.cuda.is_available():
video_tensor = video_tensor.cuda()
    # Extract the query point from the selected object instance's annotation.
    # This agent only handles keypoint (point) annotations.
    annotation = obj_inst.get_annotation(frame_data.frame)
    assert isinstance(annotation.coordinates, PointCoordinate)
    # Label row dimensions are needed to convert between normalized and pixel coordinates
    assert lr.width and lr.height
# Prepare the query tensor for CoTracker (frame number, x-coordinate, y-coordinate)
query = torch.tensor(
[
[
frame_data.frame,
annotation.coordinates.x * lr.width, # Convert normalized x to pixel x
annotation.coordinates.y * lr.height, # Convert normalized y to pixel y
],
]
)
if torch.cuda.is_available():
query = query.cuda()
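    # CoTracker expects queries of shape (batch, num_queries, 3), where each row
    # is (frame_index, x_pixel, y_pixel); query[None] below adds the batch dimension.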
    # Run CoTracker to predict tracks for the query point. The second return
    # value (per-point visibility) is not used here.
    pred_tracks, _ = model(video_tensor, queries=query[None])  # (1, num_frames, 1, 2) in pixels
    # Write the predicted track back onto the object instance, converting the
    # pixel coordinates back to the normalized [0, 1] range Encord expects.
    for frame_num, coord in enumerate(pred_tracks.reshape(-1, 2)):
        try:
            obj_inst.set_for_frames(
                coordinates=PointCoordinate(
                    x=float(coord[0]) / lr.width,
                    y=float(coord[1]) / lr.height,
                ),
                frames=frame_num,
            )
        except Exception:
            # Skip frames where updating fails (e.g., coordinates out of bounds
            # or a frame that already has coordinates set).
            continue
# Save the updated label row with the new tracked object instance
lr.save()
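# To serve this agent, deploy the app with Modal (the file name here is an assumption):
#   modal deploy cotracker_agent.py
# Modal prints the endpoint URL on deployment; register that URL as an editor agent
# in the Encord platform so the agent can be triggered from the label editor.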