Initial commit
fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
411
sam3/eval/ytvis_eval.py
Normal file
411
sam3/eval/ytvis_eval.py
Normal file
@@ -0,0 +1,411 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
import copy
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from operator import xor
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import pycocotools.mask as mask_util
|
||||
import torch
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
from sam3.eval.cgf1_eval import CGF1Eval
|
||||
from sam3.eval.coco_eval_offline import convert_to_xywh
|
||||
from sam3.model.box_ops import box_xywh_inter_union
|
||||
from sam3.train.masks_ops import rle_encode
|
||||
from sam3.train.utils import distributed as dist
|
||||
from typing_extensions import override
|
||||
|
||||
try:
|
||||
import rapidjson as json
|
||||
except ModuleNotFoundError:
|
||||
import json
|
||||
|
||||
from iopath.common.file_io import g_pathmgr
|
||||
|
||||
|
||||
class YTVISevalMixin:
    """
    Identical to COCOeval but adapts computeIoU to compute IoU between tracklets/masklets.

    Intended to be mixed into a COCOeval-style evaluator whose annotations are in
    YT-VIS video format, i.e. each annotation carries per-frame lists under
    "bboxes" (xywh per frame) and "segmentations" (RLE dict or None per frame),
    rather than a single per-image box/mask.

    Subclasses must define the class attribute `sort_inds_by_scores_in_iou`
    (see the assert in `computeIoU`).
    """

    @override
    def _prepare(self):
        """
        Copied from cocoeval.py but doesn't convert masks to RLEs (we assume they already are RLEs)

        Populates `self._gts` / `self._dts` as dicts keyed by
        (image_id, category_id) — here image_id plays the role of video_id —
        and resets `self.evalImgs` / `self.eval`.
        """
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # set ignore flag
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            # NOTE(review): this second assignment overwrites the first, so any
            # pre-existing per-annotation "ignore" flag is discarded unless the
            # annotation is a crowd. This mirrors upstream pycocotools
            # cocoeval.py verbatim — confirm it is intended here as well.
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
        for dt in dts:
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results

    def computeIoU(self, imgId, catId):
        """
        Compute IoU between tracklets. Copied from cocoeval.py but adapted for videos (in YT-VIS format)

        Returns an array of shape (num_dt, num_gt) of spatio-temporal IoUs:
        per-frame intersections and unions are summed over time before the
        final division, so short/empty frames are weighted by area rather
        than averaged per frame.
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 or len(dt) == 0:
            return []

        # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
        # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
        assert hasattr(self, "sort_inds_by_scores_in_iou"), (
            "subclasses that inherits YTVISevalMixin should set `self.sort_inds_by_scores_in_iou` "
            "(True for class mAP and phrase AP, False for demo F1)"
        )
        if self.sort_inds_by_scores_in_iou:
            inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
            dt = [dt[i] for i in inds]
            if len(dt) > p.maxDets[-1]:
                dt = dt[0 : p.maxDets[-1]]

        if p.iouType == "segm":
            # per-track lists of per-frame RLEs (or None on frames without a mask)
            g = [g["segmentations"] for g in gt]
            d = [d["segmentations"] for d in dt]
        elif p.iouType == "bbox":
            # per-track lists of per-frame xywh boxes
            g = [g["bboxes"] for g in gt]
            d = [d["bboxes"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        def iou_tracklets(preds, gts):
            # preds/gts: per-track lists of per-frame xywh boxes
            # (assumes every tracklet covers the same frames — TODO confirm)
            preds = torch.tensor(preds)
            gts = torch.tensor(gts)
            inter, union = box_xywh_inter_union(
                preds.unsqueeze(1), gts.unsqueeze(0)
            )  # Num preds x Num GTS x Num frames
            # aggregate over time before dividing -> spatio-temporal IoU
            inter = inter.sum(-1)
            union = union.sum(-1)
            assert (
                union > 0
            ).all(), (
                "There exists a tracklet with zero GTs across time. This is suspicious"
            )
            return inter / union

        def iou_masklets(preds, gts):
            # preds/gts: per-frame RLEs for a single pred/gt track pair;
            # falsy entries (None / empty) are treated as empty masks.
            inter = 0
            union = 0
            for p_i, gt_i in zip(preds, gts):
                if p_i and gt_i:
                    # Compute areas of intersection and union
                    inter += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=True)
                    )
                    union += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=False)
                    )
                elif gt_i:
                    # only GT present on this frame: contributes to union only
                    union += mask_util.area(gt_i)
                elif p_i:
                    # only prediction present on this frame: contributes to union only
                    union += mask_util.area(p_i)
            if union > 0:
                iou = inter / union
                assert iou >= 0 and iou <= 1, "Encountered an error in IoU computation"
            else:
                # both tracks empty on every frame: define IoU as a perfect match
                assert np.isclose(inter, 0) and np.isclose(
                    union, 0
                ), "Encountered an error in IoU computation"
                iou = 1
            return iou

        if p.iouType == "segm":
            ious = [[iou_masklets(d_i, g_i) for g_i in g] for d_i in d]
        else:
            ious = iou_tracklets(d, g)
        return np.array(ious)
|
||||
|
||||
|
||||
class YTVISeval(YTVISevalMixin, COCOeval):
    """Tracklet-aware COCOeval for class mAP and phrase AP evaluation.

    Detections are sorted in descending score order inside ``computeIoU``,
    exactly as in the standard COCOeval protocol.
    """

    # Sorting flag consumed by YTVISevalMixin.computeIoU.
    sort_inds_by_scores_in_iou: bool = True
|
||||
|
||||
|
||||
class VideoDemoF1Eval(YTVISevalMixin, CGF1Eval):
    """Tracklet-aware CGF1Eval for demo F1 evaluation.

    Detections are NOT score-sorted inside ``computeIoU``; they are instead
    matched to ground truths via Hungarian matching downstream.
    """

    # Sorting flag consumed by YTVISevalMixin.computeIoU.
    sort_inds_by_scores_in_iou: bool = False
|
||||
|
||||
|
||||
class YTVISResultsWriter:
    """
    Gather and dumps predictions in YT-VIS format.
    Expected flow of API calls: reset() -> N * update() -> compute_synced()
    """

    def __init__(
        self,
        dump_file: str,
        postprocessor,
        gather_pred_via_filesys: bool = False,
        pred_file_evaluators: Optional[List] = None,
        save_per_frame_scores: bool = False,
        write_eval_metrics_file: bool = True,
        eval_metrics_file_suffix: str = ".sam3_eval_metrics",
    ):
        """
        Args:
            dump_file: path of the output JSON file of YT-VIS-format predictions.
            postprocessor: object with a `process_results(*args, **kwargs)` method
                that turns raw model outputs into a {video_id: prediction} dict.
            gather_pred_via_filesys: gather cross-GPU predictions through the
                filesystem instead of collective all_gather.
            pred_file_evaluators: evaluation hooks run on the dumped prediction
                file in `compute_synced`.
            save_per_frame_scores: also store per-frame scores in each result.
            write_eval_metrics_file: additionally write a metrics JSON next to
                the prediction file (prediction file stays pure YT-VIS format).
            eval_metrics_file_suffix: suffix appended to `dump_file` to form the
                metrics file path.
        """
        self.dump_file = dump_file
        self.dump = []  # accumulated YT-VIS result dicts on this rank
        self.postprocessor = postprocessor
        self.gather_pred_via_filesys = gather_pred_via_filesys
        if dist.is_main_process():
            # NOTE(review): if dump_file has no directory component, dirname is ""
            # and os.makedirs("") would raise — assumes dump_file always includes
            # a directory; confirm against callers.
            dirname = os.path.dirname(self.dump_file)
            if not os.path.exists(dirname):
                os.makedirs(dirname, exist_ok=True)
                logging.info(f"Creating folder: {dirname}")

        # the evaluation hooks to be applied to the prediction files
        self.pred_file_evaluators = pred_file_evaluators or []
        self.save_per_frame_scores = save_per_frame_scores
        # in addition to the prediction file, we also write the evaluation metrics
        # for easier debugging and analysis (stored in another eval_metrics_file
        # so that we can keep the dumped prediction file under YT-VIS format)
        self.write_eval_metrics_file = write_eval_metrics_file
        if self.write_eval_metrics_file:
            self.eval_metrics_file = self.dump_file + eval_metrics_file_suffix
            # NOTE(review): unlike the dump-file dir above, this makedirs runs on
            # every rank, not just the main process — confirm this asymmetry is
            # intentional.
            os.makedirs(os.path.dirname(self.eval_metrics_file), exist_ok=True)

    def _dump_vid_preds(self, results):
        """Append a deep copy of `results` to this rank's accumulated dump."""
        # deepcopy so later mutation of the caller's结构 cannot alias self.dump
        dumped_results = copy.deepcopy(results)
        self.dump.extend(dumped_results)

    def prepare(self, predictions):
        """
        Convert a {video_id: prediction} dict into a flat list of YT-VIS result
        dicts (one per predicted track).

        Each prediction must contain "boxes", "scores", "labels" and exactly one
        of "masks" (dense, shape (N_preds, T_frames, H, W) after squeezing) or
        "masks_rle" (per-track lists of per-frame RLEs). Mutates `predictions`
        in place: dense masks are deleted and "masks_rle" is popped (with the
        "area" field popped out of each RLE).
        """
        ytvis_results = []
        for video_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue
            for k in ["boxes", "scores", "labels"]:
                assert (
                    k in prediction
                ), f"Expected predictions to have `{k}` key, available keys are {prediction.keys()}"
            if self.save_per_frame_scores:
                assert (
                    "per_frame_scores" in prediction
                ), f"Expected predictions to have `per_frame_scores` key, available keys are {prediction.keys()}"
            assert xor(
                "masks" in prediction, "masks_rle" in prediction
            ), f"Expected predictions to have either `masks` key or `masks_rle` key, available keys are {prediction.keys()}"

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            if "masks" in prediction:
                # drop the singleton channel dim; result must be (N, T, H, W)
                masks = prediction["masks"].squeeze(2)
                assert (
                    masks.ndim == 4
                ), "Expected masks to be of shape(N_preds,T_frames,H,W)"

                # per-track list of per-frame pixel areas
                areas = [mask.flatten(1).sum(1).tolist() for mask in masks]
                rles = [rle_encode(masklet) for masklet in masks]

                # memory clean
                del masks
                del prediction["masks"]
            elif "masks_rle" in prediction:
                rles = prediction.pop("masks_rle")
                # pop "area" out of each RLE so the dumped RLEs stay pure
                # YT-VIS segmentations; None frames get area 0
                areas = [
                    [0 if rle is None else rle.pop("area") for rle in rles_per_obj]
                    for rles_per_obj in rles
                ]
            else:
                # unreachable given the xor assert above; kept as a safeguard
                raise ValueError(
                    "Expected either `masks` or `masks_rle` key in the predictions."
                )

            # one YT-VIS record per predicted track
            new_results = [
                {
                    "video_id": video_id,
                    "category_id": track_label,
                    "bboxes": track_boxes,
                    "score": track_score,
                    "segmentations": track_masks,
                    "areas": track_areas,
                }
                for (
                    track_boxes,
                    track_masks,
                    track_areas,
                    track_score,
                    track_label,
                ) in zip(boxes, rles, areas, scores, labels)
            ]
            # Optionally, save per-frame scores
            if self.save_per_frame_scores:
                per_frame_scores = prediction["per_frame_scores"].tolist()
                for res, track_per_frame_scores in zip(new_results, per_frame_scores):
                    res["per_frame_scores"] = track_per_frame_scores

            ytvis_results.extend(new_results)

        return ytvis_results

    def set_sync_device(self, device: torch.device):
        """Record the device to be used for cross-process synchronization."""
        self._sync_device = device

    def update(self, *args, **kwargs):
        """Postprocess one batch of model outputs and accumulate the results."""
        predictions = self.postprocessor.process_results(*args, **kwargs)
        results = self.prepare(predictions)
        self._dump_vid_preds(results)

    def _dump_preds(self):
        """
        Write the (already gathered) predictions to `self.dump_file` on the main
        process and clear the in-memory buffer on every rank.

        Returns:
            The dump file path (str) on the main process, None elsewhere.
        """
        if not dist.is_main_process():
            self.dump = []
            gc.collect()
            return
        dumped_file = Path(self.dump_file)
        logging.info(f"YTVIS evaluator: Dumping predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(self.dump, f)
        # free the (potentially large) prediction buffer eagerly
        self.dump = []
        gc.collect()
        return str(dumped_file)

    def synchronize_between_processes(self):
        """Gather predictions from all ranks into `self.dump`, deduplicated."""
        logging.info("YT-VIS evaluator: Synchronizing between processes")
        dump_dict = self._dedup_pre_gather(self.dump)
        if self.gather_pred_via_filesys:
            dump_dict_all_gpus = dist.gather_to_rank_0_via_filesys(dump_dict)
        else:
            dump_dict_all_gpus = dist.all_gather(dump_dict, force_cpu=True)
        self.dump = self._dedup_post_gather(dump_dict_all_gpus)
        logging.info(f"Gathered all {len(self.dump)} predictions")

    def _dedup_pre_gather(self, predictions):
        """
        Organize the predictions as a dict-of-list using (video_id, category_id) as keys
        for deduplication after gathering them across GPUs.

        During evaluation, PyTorch data loader under `drop_last: False` would wrap
        around the dataset length to be a multiple of world size (GPU num) and duplicate
        the remaining batches. This causes the same test sample to appear simultaneously
        in multiple GPUs, resulting in duplicated predictions being saved into prediction
        files. These duplicates are then counted as false positives under detection mAP
        metrics (since a ground truth can be matched with only one prediction).

        For example, if there are 4 GPUs and 6 samples [A1, A2, B1, B2, C1, C2], the data
        loader (under `drop_last: False`) would load it by wrapping it around like
        `[A1, A2, B1, B2, C1, C2, *A1*, *A2*]` to make a multiple of 4 and then split it as

        - GPU 0: A1, C1
        - GPU 1: A2, C2
        - GPU 2: B1, **A1**
        - GPU 3: B2, **A2**
        (as in DistributedSampler in https://github.com/pytorch/pytorch/blob/521588519da9f4876d90ddd7a17c10d0eca89dc6/torch/utils/data/distributed.py#L116-L124)

        so the predictions on A1 and A2 will occur twice in the final gathered outputs
        in the prediction file (and counted as false positives). This also affects our
        YT-VIS official val evaluation, but to a lesser extent than YT-VIS dev since
        the latter is much smaller and more susceptible to false positives.

        So we need to deduplicate this. The tricky part is that we cannot deduplicate
        them simply using video id, given that we are sharding the classes in each video
        across multiple batches (with 20 prompts per batch) in our "orig_cats" eval dbs.

        The solution is to deduplicate based on (video_id, category_id) tuple as keys.
        We organize the predictions as a dict-of-list using (video_id, category_id) as
        keys on each GPU, with the list of masklets under this (video_id, category_id)
        on this GPU as values. Then, we all-gather this dict-of-list across GPUs and
        if a key (video_id, category_id) appears in multiple GPUs, we only take the
        prediction masklet list from one GPU.
        """
        prediction_dict = defaultdict(list)
        for p in predictions:
            prediction_dict[(p["video_id"], p["category_id"])].append(p)
        return prediction_dict

    def _dedup_post_gather(self, list_of_prediction_dict):
        """
        Deduplicate the predictions from all GPUs. See `_dedup_pre_gather` for details.
        """
        dedup_prediction_dict = {}
        duplication_keys = []
        for prediction_dict in list_of_prediction_dict:
            for k, v in prediction_dict.items():
                # keep the first GPU's list for each (video_id, category_id)
                if k not in dedup_prediction_dict:
                    dedup_prediction_dict[k] = v
                else:
                    duplication_keys.append(k)

        logging.info(
            f"skipped {len(duplication_keys)} duplicated predictions in YTVISResultsWriter "
            f"with the following (video_id, category_id) tuples: {duplication_keys}"
        )
        # flatten the dict-of-list back to a flat list of result dicts
        dedup_predictions = sum(dedup_prediction_dict.values(), [])
        return dedup_predictions

    def compute_synced(
        self,
    ):
        """
        Gather predictions across ranks, dump them, and run the evaluation hooks.

        Returns:
            dict of dataset-level metrics on the main process;
            {"": 0.0} on non-main processes (or when no evaluator produced metrics).
        """
        self.synchronize_between_processes()
        dumped_file = self._dump_preds()
        if not dist.is_main_process():
            return {"": 0.0}

        # run evaluation hooks on the prediction file
        meters = {}
        all_video_np_level_results = defaultdict(dict)
        for evaluator in self.pred_file_evaluators:
            gc.collect()
            results, video_np_level_results = evaluator.evaluate(dumped_file)
            meters.update(results)
            # merge per-(video, category) metrics across evaluators
            for (video_id, category_id), res in video_np_level_results.items():
                all_video_np_level_results[(video_id, category_id)].update(res)

        gc.collect()
        if self.write_eval_metrics_file:
            # convert the nested dict of {(video_id, category_id): per_sample_metric_dict}
            # to a list of per-sample metric dicts (with video_id and category_id) for JSON,
            # as JSON doesn't allow using tuples like (video_id, category_id) as dict keys
            video_np_level_metrics = [
                {"video_id": video_id, "category_id": category_id, **res}
                for (video_id, category_id), res in all_video_np_level_results.items()
            ]
            eval_metrics = {
                "dataset_level_metrics": meters,
                "video_np_level_metrics": video_np_level_metrics,
            }
            with g_pathmgr.open(self.eval_metrics_file, "w") as f:
                json.dump(eval_metrics, f)
            logging.info(
                f"YTVIS evaluator: Dumped evaluation metrics to {self.eval_metrics_file}"
            )

        if len(meters) == 0:
            meters = {"": 0.0}
        return meters

    def compute(self):
        """Unsynced compute is a no-op; real work happens in compute_synced()."""
        return {"": 0.0}

    def reset(self, *args, **kwargs):
        """Clear the accumulated predictions (start of a new evaluation run)."""
        self.dump = []
|
||||
Reference in New Issue
Block a user