# Differential Revision: D90237984
# fbshipit-source-id: 526fd760f303bf31be4f743bdcd77760496de0de
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
|
|
|
# pyre-unsafe
|
|
import copy
|
|
import gc
|
|
import logging
|
|
import os
|
|
from collections import defaultdict
|
|
from operator import xor
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
|
|
import numpy as np
|
|
import pycocotools.mask as mask_util
|
|
import torch
|
|
from pycocotools.cocoeval import COCOeval
|
|
from sam3.eval.cgf1_eval import CGF1Eval
|
|
from sam3.eval.coco_eval_offline import convert_to_xywh
|
|
from sam3.model.box_ops import box_xywh_inter_union
|
|
from sam3.train.masks_ops import rle_encode
|
|
from sam3.train.utils import distributed as dist
|
|
from typing_extensions import override
|
|
|
|
try:
|
|
import rapidjson as json
|
|
except ModuleNotFoundError:
|
|
import json
|
|
|
|
from iopath.common.file_io import g_pathmgr
|
|
|
|
|
|
class YTVISevalMixin:
    """
    Identical to COCOeval but adapts computeIoU to compute IoU between tracklets/masklets.

    Intended to be mixed in ahead of COCOeval (or CGF1Eval) in the MRO so that
    `_prepare` and `computeIoU` below override the per-image COCO versions with
    per-video (YT-VIS format) versions. Subclasses must define the class attribute
    `sort_inds_by_scores_in_iou` (see the assert in `computeIoU`).
    """

    @override
    def _prepare(self):
        """
        Copied from cocoeval.py but doesn't convert masks to RLEs (we assume they already are RLEs)
        """
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # set ignore flag
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            # NOTE: mirrors upstream cocoeval.py — a crowd GT is always ignored;
            # this line intentionally overwrites the value set just above.
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        for gt in gts:
            # keyed by (image_id, category_id); for YT-VIS, "image_id" is a video id
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
        for dt in dts:
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results

    def computeIoU(self, imgId, catId):
        """
        Compute IoU between tracklets. Copied from cocoeval.py but adapted for videos (in YT-VIS format)

        Args:
            imgId: video id (named `imgId` to keep the COCOeval interface).
            catId: category id (or ignored when `p.useCats` is false).

        Returns:
            A numpy array of pairwise IoUs shaped (num detections, num GTs),
            or an empty list when either side has no annotations.
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 or len(dt) == 0:
            return []

        # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
        # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
        assert hasattr(self, "sort_inds_by_scores_in_iou"), (
            "subclasses that inherits YTVISevalMixin should set `self.sort_inds_by_scores_in_iou` "
            "(True for class mAP and phrase AP, False for demo F1)"
        )
        if self.sort_inds_by_scores_in_iou:
            # mergesort is stable, matching upstream COCOeval tie-breaking
            inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
            dt = [dt[i] for i in inds]
            if len(dt) > p.maxDets[-1]:
                dt = dt[0 : p.maxDets[-1]]

        if p.iouType == "segm":
            # per-track lists of per-frame RLEs (already encoded; see _prepare)
            g = [g["segmentations"] for g in gt]
            d = [d["segmentations"] for d in dt]
        elif p.iouType == "bbox":
            # per-track lists of per-frame xywh boxes
            g = [g["bboxes"] for g in gt]
            d = [d["bboxes"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        def iou_tracklets(preds, gts):
            # Spatio-temporal box IoU: per-frame intersections/unions are summed
            # over time before dividing, so the result is one IoU per track pair.
            preds = torch.tensor(preds)
            gts = torch.tensor(gts)
            inter, union = box_xywh_inter_union(
                preds.unsqueeze(1), gts.unsqueeze(0)
            )  # Num preds x Num GTS x Num frames
            inter = inter.sum(-1)
            union = union.sum(-1)
            assert (
                union > 0
            ).all(), (
                "There exists a tracklet with zero GTs across time. This is suspicious"
            )
            return inter / union

        def iou_masklets(preds, gts):
            # Spatio-temporal mask IoU for a single (pred, gt) track pair.
            # `preds`/`gts` are per-frame RLE lists; a falsy entry means the
            # object is absent in that frame.
            inter = 0
            union = 0
            for p_i, gt_i in zip(preds, gts):
                if p_i and gt_i:
                    # Compute areas of intersection and union
                    inter += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=True)
                    )
                    union += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=False)
                    )
                elif gt_i:
                    union += mask_util.area(gt_i)
                elif p_i:
                    union += mask_util.area(p_i)
            if union > 0:
                iou = inter / union
                assert iou >= 0 and iou <= 1, "Encountered an error in IoU computation"
            else:
                # Both tracks empty everywhere: define IoU as 1 (perfect match).
                assert np.isclose(inter, 0) and np.isclose(
                    union, 0
                ), "Encountered an error in IoU computation"
                iou = 1
            return iou

        if p.iouType == "segm":
            ious = [[iou_masklets(d_i, g_i) for g_i in g] for d_i in d]
        else:
            ious = iou_tracklets(d, g)
        return np.array(ious)
|
|
|
|
|
|
class YTVISeval(YTVISevalMixin, COCOeval):
    """COCOeval adapted to YT-VIS videos (tracklet/masklet IoU via YTVISevalMixin)."""

    # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
    sort_inds_by_scores_in_iou = True
|
|
|
|
|
|
class VideoDemoF1Eval(YTVISevalMixin, CGF1Eval):
    """CGF1Eval adapted to YT-VIS videos (tracklet/masklet IoU via YTVISevalMixin)."""

    # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
    sort_inds_by_scores_in_iou = False
|
|
|
|
|
|
class YTVISResultsWriter:
    """
    Gather and dumps predictions in YT-VIS format.
    Expected flow of API calls: reset() -> N * update() -> compute_synced()
    """

    def __init__(
        self,
        dump_file: str,
        postprocessor,
        gather_pred_via_filesys=False,
        pred_file_evaluators: Optional[List] = None,
        save_per_frame_scores: bool = False,
        write_eval_metrics_file: bool = True,
        eval_metrics_file_suffix: str = ".sam3_eval_metrics",
    ):
        """
        Args:
            dump_file: path where the YT-VIS-format prediction JSON is written.
            postprocessor: object with `process_results(*args, **kwargs)` that turns
                raw model outputs into a {video_id: prediction_dict} mapping.
            gather_pred_via_filesys: if True, gather predictions to rank 0 through
                the filesystem instead of collective all_gather.
            pred_file_evaluators: optional list of evaluators, each with an
                `evaluate(pred_file)` method run on the dumped file.
            save_per_frame_scores: if True, also persist per-frame scores per track.
            write_eval_metrics_file: if True, also write a metrics JSON next to the
                prediction file (kept separate so the dump stays YT-VIS format).
            eval_metrics_file_suffix: suffix appended to `dump_file` to name the
                metrics file.
        """
        self.dump_file = dump_file
        self.dump = []  # accumulated per-video results on this rank
        self.postprocessor = postprocessor
        self.gather_pred_via_filesys = gather_pred_via_filesys
        if dist.is_main_process():
            dirname = os.path.dirname(self.dump_file)
            if not os.path.exists(dirname):
                os.makedirs(dirname, exist_ok=True)
                logging.info(f"Creating folder: {dirname}")

        # the evaluation hooks to be applied to the prediction files
        self.pred_file_evaluators = pred_file_evaluators or []
        self.save_per_frame_scores = save_per_frame_scores
        # in addition to the prediction file, we also write the evaluation metrics
        # for easier debugging and analysis (stored in another eval_metrics_file
        # so that we can keep the dumped prediction file under YT-VIS format)
        self.write_eval_metrics_file = write_eval_metrics_file
        if self.write_eval_metrics_file:
            self.eval_metrics_file = self.dump_file + eval_metrics_file_suffix
            os.makedirs(os.path.dirname(self.eval_metrics_file), exist_ok=True)

    def _dump_vid_preds(self, results):
        """Append a deep copy of `results` to this rank's in-memory dump buffer."""
        dumped_results = copy.deepcopy(results)
        self.dump.extend(dumped_results)

    def prepare(self, predictions):
        """
        Convert postprocessed predictions into YT-VIS result records.

        Args:
            predictions: mapping of video_id -> prediction dict with keys
                "boxes", "scores", "labels" and exactly one of "masks"
                (dense tensor) or "masks_rle" (per-frame RLEs per object).
                "per_frame_scores" is required iff `self.save_per_frame_scores`.

        Returns:
            A list of dicts, one per predicted track, with keys
            video_id / category_id / bboxes / score / segmentations / areas
            (plus per_frame_scores when enabled).
        """
        ytvis_results = []
        for video_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue
            for k in ["boxes", "scores", "labels"]:
                assert (
                    k in prediction
                ), f"Expected predictions to have `{k}` key, available keys are {prediction.keys()}"
            if self.save_per_frame_scores:
                assert (
                    "per_frame_scores" in prediction
                ), f"Expected predictions to have `per_frame_scores` key, available keys are {prediction.keys()}"
            assert xor(
                "masks" in prediction, "masks_rle" in prediction
            ), f"Expected predictions to have either `masks` key or `masks_rle` key, available keys are {prediction.keys()}"

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            if "masks" in prediction:
                masks = prediction["masks"].squeeze(2)
                assert (
                    masks.ndim == 4
                ), "Expected masks to be of shape(N_preds,T_frames,H,W)"

                # per-object, per-frame mask areas (pixel counts)
                areas = [mask.flatten(1).sum(1).tolist() for mask in masks]
                rles = [rle_encode(masklet) for masklet in masks]

                # memory clean
                del masks
                del prediction["masks"]
            elif "masks_rle" in prediction:
                rles = prediction.pop("masks_rle")
                # NOTE: pop("area") mutates the RLE dicts in place so the dumped
                # segmentations stay in plain RLE format; None means "no mask
                # in this frame" and contributes area 0.
                areas = [
                    [0 if rle is None else rle.pop("area") for rle in rles_per_obj]
                    for rles_per_obj in rles
                ]
            else:
                raise ValueError(
                    "Expected either `masks` or `masks_rle` key in the predictions."
                )

            new_results = [
                {
                    "video_id": video_id,
                    "category_id": track_label,
                    "bboxes": track_boxes,
                    "score": track_score,
                    "segmentations": track_masks,
                    "areas": track_areas,
                }
                for (
                    track_boxes,
                    track_masks,
                    track_areas,
                    track_score,
                    track_label,
                ) in zip(boxes, rles, areas, scores, labels)
            ]
            # Optionally, save per-frame scores
            if self.save_per_frame_scores:
                per_frame_scores = prediction["per_frame_scores"].tolist()
                for res, track_per_frame_scores in zip(new_results, per_frame_scores):
                    res["per_frame_scores"] = track_per_frame_scores

            ytvis_results.extend(new_results)

        return ytvis_results

    def set_sync_device(self, device: torch.device):
        # device used for cross-process synchronization collectives
        self._sync_device = device

    def update(self, *args, **kwargs):
        """Postprocess one batch of raw outputs and buffer the YT-VIS records."""
        predictions = self.postprocessor.process_results(*args, **kwargs)
        results = self.prepare(predictions)
        self._dump_vid_preds(results)

    def _dump_preds(self):
        """
        Write the gathered predictions to `self.dump_file` (main process only).

        Returns the dumped file path on the main process, None elsewhere.
        The in-memory buffer is cleared on every rank to release memory.
        """
        if not dist.is_main_process():
            self.dump = []
            gc.collect()
            return
        dumped_file = Path(self.dump_file)
        logging.info(f"YTVIS evaluator: Dumping predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(self.dump, f)
        self.dump = []
        gc.collect()
        return str(dumped_file)

    def synchronize_between_processes(self):
        """Gather (and deduplicate) predictions from all ranks into `self.dump`."""
        logging.info("YT-VIS evaluator: Synchronizing between processes")
        dump_dict = self._dedup_pre_gather(self.dump)
        if self.gather_pred_via_filesys:
            dump_dict_all_gpus = dist.gather_to_rank_0_via_filesys(dump_dict)
        else:
            dump_dict_all_gpus = dist.all_gather(dump_dict, force_cpu=True)
        self.dump = self._dedup_post_gather(dump_dict_all_gpus)
        logging.info(f"Gathered all {len(self.dump)} predictions")

    def _dedup_pre_gather(self, predictions):
        """
        Organize the predictions as a dict-of-list using (video_id, category_id) as keys
        for deduplication after gathering them across GPUs.

        During evaluation, PyTorch data loader under `drop_last: False` would wrap
        around the dataset length to be a multiple of world size (GPU num) and duplicate
        the remaining batches. This causes the same test sample to appear simultaneously
        in multiple GPUs, resulting in duplicated predictions being saved into prediction
        files. These duplicates are then counted as false positives under detection mAP
        metrics (since a ground truth can be matched with only one prediction).

        For example, if there are 4 GPUs and 6 samples [A1, A2, B1, B2, C1, C2], the data
        loader (under `drop_last: False`) would load it by wrapping it around like
        `[A1, A2, B1, B2, C1, C2, *A1*, *A2*]` to make a multiple of 4 and then split it as

        - GPU 0: A1, C1
        - GPU 1: A2, C2
        - GPU 2: B1, **A1**
        - GPU 3: B2, **A2**
        (as in DistributedSampler in https://github.com/pytorch/pytorch/blob/521588519da9f4876d90ddd7a17c10d0eca89dc6/torch/utils/data/distributed.py#L116-L124)

        so the predictions on A1 and A2 will occur twice in the final gathered outputs
        in the prediction file (and counted as false positives). This also affects our
        YT-VIS official val evaluation, but to a lesser extent than YT-VIS dev since
        the latter is much smaller and more susceptible to false positives.

        So we need to deduplicate this. The tricky part is that we cannot deduplicate them
        simply using video id, given that we are sharding the classes in each video
        across multiple batches (with 20 prompts per batch) in our "orig_cats" eval dbs.

        The solution is to deduplicate based on (video_id, category_id) tuple as keys.
        We organize the predictions as a dict-of-list using (video_id, category_id) as
        keys on each GPU, with the list of masklets under this (video_id, category_id)
        on this GPU as values. Then, we all-gather this dict-of-list across GPUs and
        if a key (video_id, category_id) appears in multiple GPUs, we only take the
        prediction masklet list from one GPU.
        """
        prediction_dict = defaultdict(list)
        for p in predictions:
            prediction_dict[(p["video_id"], p["category_id"])].append(p)
        return prediction_dict

    def _dedup_post_gather(self, list_of_prediction_dict):
        """
        Deduplicate the predictions from all GPUs. See `_dedup_pre_gather` for details.
        """
        dedup_prediction_dict = {}
        duplication_keys = []
        for prediction_dict in list_of_prediction_dict:
            for k, v in prediction_dict.items():
                # first rank wins for each (video_id, category_id) key
                if k not in dedup_prediction_dict:
                    dedup_prediction_dict[k] = v
                else:
                    duplication_keys.append(k)

        logging.info(
            f"skipped {len(duplication_keys)} duplicated predictions in YTVISResultsWriter "
            f"with the following (video_id, category_id) tuples: {duplication_keys}"
        )
        dedup_predictions = sum(dedup_prediction_dict.values(), [])
        return dedup_predictions

    def compute_synced(
        self,
    ):
        """
        Gather predictions across ranks, dump them, run the evaluation hooks on
        the dumped file, and (optionally) write the metrics file.

        Returns a dict of metric name -> value on the main process; a dummy
        {"": 0.0} on all other ranks (and when no evaluator produced metrics).
        """
        self.synchronize_between_processes()
        dumped_file = self._dump_preds()
        if not dist.is_main_process():
            return {"": 0.0}

        # run evaluation hooks on the prediction file
        meters = {}
        all_video_np_level_results = defaultdict(dict)
        for evaluator in self.pred_file_evaluators:
            gc.collect()
            results, video_np_level_results = evaluator.evaluate(dumped_file)
            meters.update(results)
            for (video_id, category_id), res in video_np_level_results.items():
                all_video_np_level_results[(video_id, category_id)].update(res)

        gc.collect()
        if self.write_eval_metrics_file:
            # convert the nested dict of {(video_id, category_id): per_sample_metric_dict}
            # to a list of per-sample metric dicts (with video_id and category_id) for JSON,
            # as JSON doesn't allow using tuples like (video_id, category_id) as dict keys
            video_np_level_metrics = [
                {"video_id": video_id, "category_id": category_id, **res}
                for (video_id, category_id), res in all_video_np_level_results.items()
            ]
            eval_metrics = {
                "dataset_level_metrics": meters,
                "video_np_level_metrics": video_np_level_metrics,
            }
            with g_pathmgr.open(self.eval_metrics_file, "w") as f:
                json.dump(eval_metrics, f)
            logging.info(
                f"YTVIS evaluator: Dumped evaluation metrics to {self.eval_metrics_file}"
            )

        if len(meters) == 0:
            meters = {"": 0.0}
        return meters

    def compute(self):
        # per-rank compute is a no-op; real work happens in compute_synced()
        return {"": 0.0}

    def reset(self, *args, **kwargs):
        # clear the per-rank prediction buffer before a new evaluation pass
        self.dump = []
|