Differential Revision: D90237984 fbshipit-source-id: 526fd760f303bf31be4f743bdcd77760496de0de
214 lines
7.0 KiB
Python
214 lines
7.0 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
|
|
|
# pyre-unsafe
|
|
import json
|
|
import os
|
|
from collections import defaultdict
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS dataset to COCO-style video instance segmentation format.

    Every frame listed in a video's ``file_names`` becomes one ``images`` entry
    with a sequential id starting at 1. Every source annotation becomes one
    ``tracks`` entry, and each frame where that annotation's bbox is not None
    becomes one ``annotations`` entry with a sequential id starting at 1.

    Args:
        ann_json (str): Path to YouTube VIS annotation JSON file
        save_path (str): path to save converted COCO-style JSON. When None,
            nothing is written and the converted structure is only returned.

    Returns:
        dict: The converted structure with keys ``info``, ``images``,
        ``videos``, ``tracks``, ``annotations``, ``categories`` and
        ``licenses``.
    """
    # Load original annotations; the context manager closes the handle promptly.
    with open(ann_json) as f:
        official_anns = json.load(f)

    # Initialize COCO structure (categories are copied over directly)
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": official_anns["categories"],
        "licenses": [],
    }

    # Running ids for the generated image / annotation entries
    next_img_id = 1
    next_ann_id = 1

    # Create video-to-annotations mapping
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)

    # Create tracks directly: one track per source annotation, reusing its id
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]

    # Process videos
    for video_info in tqdm(official_anns["videos"]):
        video_id = video_info["id"]
        file_names = video_info["file_names"]

        # Create video entry; the name is the directory of the first frame
        VIS["videos"].append(
            {
                "id": video_id,
                "name": os.path.dirname(file_names[0]),
                "width": video_info["width"],
                "height": video_info["height"],
                "length": video_info["length"],
                "neg_category_ids": [],
                "not_exhaustive_category_ids": [],
            }
        )

        # Look the tracks up once per video instead of once per frame.
        # .get() avoids inserting empty lists into the defaultdict.
        video_anns = vid_to_anns.get(video_id, ())

        # Process frames
        for frame_idx, file_name in enumerate(file_names):
            # Create image entry
            VIS["images"].append(
                {
                    "id": next_img_id,
                    "video_id": video_id,
                    "file_name": file_name,
                    "width": video_info["width"],
                    "height": video_info["height"],
                    "frame_index": frame_idx,
                    "frame_id": frame_idx,
                }
            )

            # Process annotations for this frame
            for ann in video_anns:
                bbox = ann["bboxes"][frame_idx]
                if bbox is None:
                    # No box for this track in this frame -> no annotation
                    continue

                # Create annotation entry
                VIS["annotations"].append(
                    {
                        "id": next_ann_id,
                        "video_id": video_id,
                        "image_id": next_img_id,
                        "track_id": ann["id"],
                        "category_id": ann["category_id"],
                        "bbox": bbox,
                        "area": ann["areas"][frame_idx],
                        "segmentation": ann["segmentations"][frame_idx],
                        "iscrowd": ann["iscrowd"],
                    }
                )
                next_ann_id += 1

            next_img_id += 1

    # Print summary
    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")

    if save_path is None:
        return VIS

    # Save output. os.path.dirname() is "" for a bare filename, and
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually has one.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(VIS, f)

    return VIS
|
|
|
|
|
|
def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """
    Convert YouTubeVIS predictions to COCO format with video_id preservation

    Each prediction is treated as one track and receives a sequential
    ``track_id`` starting at 1. One COCO annotation is emitted for every frame
    of the prediction whose bbox is neither None nor all zeros.

    Args:
        youtubevis_pred_path: Path to YouTubeVIS prediction JSON
        converted_dataset_path: Path to converted COCO dataset JSON
        output_path: Path to save COCO format predictions

    Raises:
        RuntimeError: If a predicted (video_id, frame index) pair has no
            matching image in the converted dataset.
    """
    # Load YouTubeVIS predictions
    with open(youtubevis_pred_path) as f:
        ytv_predictions = json.load(f)

    # Load converted dataset for image ID mapping
    with open(converted_dataset_path) as f:
        coco_dataset = json.load(f)

    # Create (video_id, frame_idx) -> image_id mapping
    image_id_map = {
        (img["video_id"], img["frame_index"]): img["id"]
        for img in coco_dataset["images"]
    }

    coco_annotations = []

    # enumerate(..., start=1) assigns a unique track ID to each prediction
    for track_id, pred in enumerate(tqdm(ytv_predictions), start=1):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        score = pred["score"]

        # Ensure segmentations and areas have the same length as bboxes.
        # A missing key, an explicit null, or a too-short list is padded with
        # None so that zip() below never drops a frame that has a bbox.
        num_frames = len(bboxes)
        segmentations = list(pred.get("segmentations") or [])
        segmentations += [None] * (num_frames - len(segmentations))
        areas = list(pred.get("areas") or [])
        areas += [None] * (num_frames - len(areas))

        for frame_idx, (bbox, segmentation, area_from_pred) in enumerate(
            zip(bboxes, segmentations, areas)
        ):
            # Skip frames with missing objects (None or zero bbox)
            if bbox is None or all(x == 0 for x in bbox):
                continue

            # Get corresponding image ID from mapping
            image_id = image_id_map.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
                )

            # Extract bbox coordinates
            x, y, w, h = bbox

            # Calculate area - use area from prediction if available, otherwise from bbox
            if area_from_pred is not None and area_from_pred > 0:
                area = area_from_pred
            else:
                area = w * h

            # Create COCO annotation with video_id
            coco_annotation = {
                "image_id": int(image_id),
                "video_id": video_id,
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }

            # Add segmentation if available
            if segmentation is not None:
                coco_annotation["segmentation"] = segmentation

            coco_annotations.append(coco_annotation)

    # Save output, creating the parent directory first (as the GT converter
    # does). Skipped for a bare filename, where dirname is "".
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(coco_annotations, f)

    print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")
|