Initial commit

fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
2025-11-18 23:07:42 -08:00
commit a13e358df4
504 changed files with 122758 additions and 0 deletions
--- a/sam3/eval/conversion_util.py
+++ b/sam3/eval/conversion_util.py
@@ -0,0 +1,211 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+import json
+import os
+from collections import defaultdict
+
+from tqdm import tqdm
+
+
+def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
+    """Convert YouTube VIS dataset to COCO-style video instance segmentation format.
+
+    Every input annotation becomes one track, and each frame in which it has a
+    non-null bbox becomes one per-image annotation. Image and annotation ids
+    are assigned sequentially starting from 1.
+
+    Args:
+        ann_json (str): Path to YouTube VIS annotation JSON file
+        save_path (str): path to save converted COCO-style JSON; missing parent
+            directories are created. Nothing is written when None.
+
+    Returns:
+        dict: The converted COCO-style dataset with "info", "images", "videos",
+            "tracks", "annotations", "categories" and "licenses" keys.
+    """
+    # Load original annotations (the context manager closes the file handle)
+    with open(ann_json) as f:
+        official_anns = json.load(f)
+
+    # Initialize COCO structure
+    VIS = {
+        "info": {},
+        "images": [],
+        "videos": [],
+        "tracks": [],
+        "annotations": [],
+        "categories": official_anns["categories"],  # Direct copy categories
+        "licenses": [],
+    }
+
+    # Create video-to-annotations mapping
+    vid_to_anns = defaultdict(list)
+    for ann in official_anns["annotations"]:
+        vid_to_anns[ann["video_id"]].append(ann)
+
+    # Create tracks directly: one track per YouTube VIS annotation
+    VIS["tracks"] = [
+        {
+            "id": ann["id"],
+            "category_id": ann["category_id"],
+            "video_id": ann["video_id"],
+        }
+        for ann in official_anns["annotations"]
+    ]
+
+    # Initialize counters
+    img_id = 1
+    ann_id = 1
+
+    # Process videos
+    for video_info in tqdm(official_anns["videos"]):
+        video_id = video_info["id"]
+        file_names = video_info["file_names"]
+
+        # Create video entry; its name is the directory of the first frame
+        video = {
+            "id": video_id,
+            "name": os.path.dirname(file_names[0]) if file_names else "",
+            "width": video_info["width"],
+            "height": video_info["height"],
+            "length": video_info["length"],
+            "neg_category_ids": [],
+            "not_exhaustive_category_ids": [],
+        }
+        VIS["videos"].append(video)
+
+        # Fetch this video's annotations once instead of once per frame;
+        # .get() does not insert an empty entry into the defaultdict
+        video_anns = vid_to_anns.get(video_id, ())
+
+        # Process frames
+        for frame_idx, file_name in enumerate(file_names):
+            # Create image entry
+            image = {
+                "id": img_id,
+                "video_id": video_id,
+                "file_name": file_name,
+                "width": video_info["width"],
+                "height": video_info["height"],
+                "frame_index": frame_idx,
+                "frame_id": frame_idx,
+            }
+            VIS["images"].append(image)
+
+            # Process annotations for this frame
+            for ann in video_anns:
+                bbox = ann["bboxes"][frame_idx]
+                if bbox is None:
+                    continue  # this track has no box in this frame
+
+                # Create annotation entry
+                annotation = {
+                    "id": ann_id,
+                    "video_id": video_id,
+                    "image_id": img_id,
+                    "track_id": ann["id"],
+                    "category_id": ann["category_id"],
+                    "bbox": bbox,
+                    "area": ann["areas"][frame_idx],
+                    "segmentation": ann["segmentations"][frame_idx],
+                    "iscrowd": ann["iscrowd"],
+                }
+                VIS["annotations"].append(annotation)
+                ann_id += 1
+
+            img_id += 1
+
+    # Print summary
+    print(f"Converted {len(VIS['videos'])} videos")
+    print(f"Converted {len(VIS['images'])} images")
+    print(f"Created {len(VIS['tracks'])} tracks")
+    print(f"Created {len(VIS['annotations'])} annotations")
+
+    # Save output
+    if save_path is not None:
+        # dirname is "" for a bare file name, and os.makedirs("") would raise
+        save_dir = os.path.dirname(save_path)
+        if save_dir:
+            os.makedirs(save_dir, exist_ok=True)
+        with open(save_path, "w") as f:
+            json.dump(VIS, f)
+
+    return VIS
+
+
+def convert_ytbvis_to_cocovid_pred(
+    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
+) -> None:
+    """
+    Convert YouTubeVIS predictions to COCO format with video_id preservation
+
+    Each prediction is treated as one track and expanded into one COCO
+    annotation per frame whose bbox is neither None nor all zeros. Track ids
+    are assigned sequentially from 1 in prediction order.
+
+    Args:
+        youtubevis_pred_path: Path to YouTubeVIS prediction JSON
+        converted_dataset_path: Path to converted COCO dataset JSON
+        output_path: Path to save COCO format predictions; missing parent
+            directories are created
+
+    Raises:
+        RuntimeError: If a kept frame has no image with the same
+            (video_id, frame_index) in the converted dataset
+    """
+
+    # Load YouTubeVIS predictions
+    with open(youtubevis_pred_path) as f:
+        ytv_predictions = json.load(f)
+
+    # Load converted dataset for image ID mapping
+    with open(converted_dataset_path) as f:
+        coco_dataset = json.load(f)
+
+    # Create (video_id, frame_idx) -> image_id mapping
+    image_id_map = {
+        (img["video_id"], img["frame_index"]): img["id"]
+        for img in coco_dataset["images"]
+    }
+
+    coco_annotations = []
+
+    # Enumerating from 1 assigns a unique track ID to every prediction
+    for track_id, pred in enumerate(tqdm(ytv_predictions), start=1):
+        video_id = pred["video_id"]
+        category_id = pred["category_id"]
+        bboxes = pred["bboxes"]
+        score = pred["score"]
+        # Optional fields: a missing key and an explicit null both mean "absent"
+        segmentations = pred.get("segmentations") or []
+        areas = pred.get("areas") or []
+
+        for frame_idx, bbox in enumerate(bboxes):
+            # Skip frames with missing objects (None or zero bbox)
+            if bbox is None or all(x == 0 for x in bbox):
+                continue
+
+            # Get corresponding image ID from mapping
+            image_id = image_id_map.get((video_id, frame_idx))
+            if image_id is None:
+                raise RuntimeError(
+                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
+                )
+
+            # Index per frame rather than zipping the three lists, so that a
+            # segmentations/areas list shorter than bboxes cannot silently
+            # drop the trailing frames
+            segmentation = (
+                segmentations[frame_idx] if frame_idx < len(segmentations) else None
+            )
+            area_from_pred = areas[frame_idx] if frame_idx < len(areas) else None
+
+            # Extract bbox coordinates
+            x, y, w, h = bbox
+
+            # Calculate area - use area from prediction if available, otherwise from bbox
+            if area_from_pred is not None and area_from_pred > 0:
+                area = area_from_pred
+            else:
+                area = w * h
+
+            # Create COCO annotation with video_id
+            coco_annotation = {
+                "image_id": int(image_id),
+                "video_id": video_id,
+                "track_id": track_id,
+                "category_id": category_id,
+                "bbox": [float(x), float(y), float(w), float(h)],
+                "area": float(area),
+                "iscrowd": 0,
+                "score": float(score),
+            }
+
+            # Add segmentation if available
+            if segmentation is not None:
+                coco_annotation["segmentation"] = segmentation
+
+            coco_annotations.append(coco_annotation)
+
+    # Save output; dirname is "" for a bare file name, which os.makedirs rejects
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(coco_annotations, f)
+
+    print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")