sam3_local/sam3/eval/conversion_util.py

# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

# pyre-unsafe
import json
import os
from collections import defaultdict

from tqdm import tqdm


def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS annotations to a COCO-style video dataset dict.

    Every entry of ``annotations`` in the input becomes one track, every
    entry of a video's ``file_names`` becomes one image, and every non-``None``
    per-frame bbox of a track becomes one annotation. Image and annotation ids
    are assigned sequentially starting at 1, in video / frame / track order.

    Args:
        ann_json (str): Path to YouTube VIS annotation JSON file
        save_path (str): Optional path to save the converted COCO-style JSON.
            Missing parent directories are created. When ``None`` nothing is
            written to disk.

    Returns:
        dict: The converted dataset with the keys ``info``, ``images``,
        ``videos``, ``tracks``, ``annotations``, ``categories`` and
        ``licenses``.
    """
    # Load original annotations; the context manager closes the handle promptly.
    with open(ann_json) as ann_file:
        official_anns = json.load(ann_file)

    # Initialize COCO structure (key order is preserved in the dumped JSON)
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": official_anns["categories"],  # Direct copy categories
        "licenses": [],
    }

    # Group the per-track annotations by the video they belong to
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)

    # Each YouTube VIS annotation describes one object track
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]

    # Running ids for the generated image / annotation entries
    next_img_id = 1
    next_ann_id = 1

    for video_info in tqdm(official_anns["videos"]):
        video_id = video_info["id"]
        file_names = video_info["file_names"]

        VIS["videos"].append(
            {
                "id": video_id,
                "name": os.path.dirname(file_names[0]),
                "width": video_info["width"],
                "height": video_info["height"],
                "length": video_info["length"],
                "neg_category_ids": [],
                "not_exhaustive_category_ids": [],
            }
        )

        # `.get` avoids inserting empty lists into the defaultdict for
        # videos that have no annotations.
        video_anns = vid_to_anns.get(video_id, ())

        for frame_idx, file_name in enumerate(file_names):
            VIS["images"].append(
                {
                    "id": next_img_id,
                    "video_id": video_id,
                    "file_name": file_name,
                    "width": video_info["width"],
                    "height": video_info["height"],
                    "frame_index": frame_idx,
                    "frame_id": frame_idx,
                }
            )

            for ann in video_anns:
                bbox = ann["bboxes"][frame_idx]
                if bbox is None:
                    # The track has no box in this frame
                    continue

                VIS["annotations"].append(
                    {
                        "id": next_ann_id,
                        "video_id": video_id,
                        "image_id": next_img_id,
                        "track_id": ann["id"],
                        "category_id": ann["category_id"],
                        "bbox": bbox,
                        "area": ann["areas"][frame_idx],
                        "segmentation": ann["segmentations"][frame_idx],
                        "iscrowd": ann["iscrowd"],
                    }
                )
                next_ann_id += 1

            next_img_id += 1

    # Print summary
    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")

    if save_path is not None:
        save_dir = os.path.dirname(save_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        # Close the handle explicitly so the JSON is fully flushed on return.
        with open(save_path, "w") as out_file:
            json.dump(VIS, out_file)

    return VIS


def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """
    Flatten YouTubeVIS track predictions into per-frame COCO-style records.

    Each prediction is treated as one track and receives a 1-based track id in
    input order. For every frame of that track with a usable bbox, one record
    carrying ``image_id``, ``video_id``, ``track_id``, ``category_id``,
    ``bbox``, ``area``, ``iscrowd`` and ``score`` is written (plus
    ``segmentation`` when the prediction supplies one for that frame).

    Args:
        youtubevis_pred_path: Path to YouTubeVIS prediction JSON
        converted_dataset_path: Path to converted COCO dataset JSON
        output_path: Path to save COCO format predictions

    Raises:
        RuntimeError: If a predicted (video_id, frame index) pair has no
            matching image in the converted dataset.
    """
    with open(youtubevis_pred_path) as pred_file:
        ytv_predictions = json.load(pred_file)

    with open(converted_dataset_path) as dataset_file:
        coco_dataset = json.load(dataset_file)

    # Lookup table: (video_id, frame_index) -> image id of the converted dataset
    image_id_map = {}
    for img in coco_dataset["images"]:
        frame_key = (img["video_id"], img["frame_index"])
        image_id_map[frame_key] = img["id"]

    coco_annotations = []

    # The enumeration index doubles as the unique track id of each prediction
    for track_id, pred in enumerate(tqdm(ytv_predictions), start=1):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        segmentations = pred.get("segmentations", [])  # optional field
        areas = pred.get("areas", [])  # optional field
        score = pred["score"]

        # Missing optional lists are replaced by per-frame None placeholders
        if len(segmentations) == 0:
            segmentations = [None] * len(bboxes)
        if len(areas) == 0:
            areas = [None] * len(bboxes)

        per_frame = zip(bboxes, segmentations, areas)
        for frame_idx, (bbox, segmentation, area_from_pred) in enumerate(per_frame):
            # A None bbox or an all-zero bbox marks a frame without the object
            if bbox is None or not any(value != 0 for value in bbox):
                continue

            image_id = image_id_map.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
                )

            x, y, w, h = bbox

            # Prefer a positive predicted area; otherwise fall back to the bbox area
            has_pred_area = area_from_pred is not None and area_from_pred > 0
            area = area_from_pred if has_pred_area else w * h

            coco_annotation = {
                "image_id": int(image_id),
                "video_id": video_id,
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }
            if segmentation is not None:
                coco_annotation["segmentation"] = segmentation

            coco_annotations.append(coco_annotation)

    with open(output_path, "w") as out_file:
        json.dump(coco_annotations, out_file)

    print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")