Initial commit

fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
facebook-github-bot
2025-11-18 23:07:42 -08:00
commit a13e358df4
504 changed files with 122758 additions and 0 deletions

1
sam3/eval/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

703
sam3/eval/cgf1_eval.py Normal file
View File

@@ -0,0 +1,703 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import contextlib
import copy
import json
import os
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import pycocotools.mask as maskUtils
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from scipy.optimize import linear_sum_assignment
from tqdm import tqdm
@dataclass
class Metric:
    """Descriptor for one scalar metric reported by the cgF1 evaluator."""

    # Identifier of the metric, e.g. "cgF1" or "IL_precision".
    name: str
    # whether the metric is computed at the image level or the box level
    image_level: bool
    # iou threshold (None is used for image level metrics or to indicate averaging over all thresholds in [0.5:0.95])
    iou_threshold: Union[float, None]
# Box-level metric names; each is reported three times: averaged over all IoU
# thresholds (iou_threshold=None), at IoU=0.5, and at IoU=0.75.
_BOX_LEVEL_NAMES = (
    "cgF1",
    "precision",
    "recall",
    "F1",
    "positive_macro_F1",
    "positive_micro_F1",
    "positive_micro_precision",
)
# Image-level metric names; reported once (no IoU threshold applies).
_IMAGE_LEVEL_NAMES = ("IL_precision", "IL_recall", "IL_F1", "IL_FPR", "IL_MCC")

# Ordered list of all metrics the evaluator reports. The order here must match
# the order of the stats array produced by CGF1Eval.summarize().
CGF1_METRICS = (
    [Metric(name=n, image_level=False, iou_threshold=None) for n in _BOX_LEVEL_NAMES]
    + [Metric(name=n, image_level=True, iou_threshold=None) for n in _IMAGE_LEVEL_NAMES]
    + [Metric(name=n, image_level=False, iou_threshold=0.5) for n in _BOX_LEVEL_NAMES]
    + [Metric(name=n, image_level=False, iou_threshold=0.75) for n in _BOX_LEVEL_NAMES]
)
class COCOCustom(COCO):
    """COCO class from pycocotools with tiny modifications for speed"""

    def createIndex(self):
        # Build the lookup tables (anns, imgs, cats, imgToAnns, catToImgs)
        # from self.dataset. Identical to pycocotools except where marked.
        # create index
        print("creating index...")
        anns, cats, imgs = {}, {}, {}
        imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
        if "annotations" in self.dataset:
            for ann in self.dataset["annotations"]:
                imgToAnns[ann["image_id"]].append(ann)
                anns[ann["id"]] = ann
        if "images" in self.dataset:
            # MODIFICATION: do not reload imgs if they are already there
            # (loadRes below shares self.imgs with the result object, so this
            # avoids re-indexing the image list on every loadRes call)
            if self.imgs:
                imgs = self.imgs
            else:
                for img in self.dataset["images"]:
                    imgs[img["id"]] = img
            # END MODIFICATION
        if "categories" in self.dataset:
            for cat in self.dataset["categories"]:
                cats[cat["id"]] = cat
        if "annotations" in self.dataset and "categories" in self.dataset:
            for ann in self.dataset["annotations"]:
                catToImgs[ann["category_id"]].append(ann["image_id"])
        print("index created!")
        # create class members
        self.anns = anns
        self.imgToAnns = imgToAnns
        self.catToImgs = catToImgs
        self.imgs = imgs
        self.cats = cats

    def loadRes(self, resFile):
        """
        Load result file and return a result api object.
        :param resFile (str) : file name of result file
        :return: res (obj) : result api object
        """
        res = COCOCustom()
        res.dataset["info"] = copy.deepcopy(self.dataset.get("info", {}))
        # MODIFICATION: no copy
        # res.dataset['images'] = [img for img in self.dataset['images']]
        res.dataset["images"] = self.dataset["images"]
        # END MODIFICATION
        print("Loading and preparing results...")
        tic = time.time()
        if type(resFile) == str:
            with open(resFile) as f:
                anns = json.load(f)
        elif type(resFile) == np.ndarray:
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert type(anns) == list, "results in not an array of objects"
        annsImgIds = [ann["image_id"] for ann in anns]
        # MODIFICATION: faster and cached subset check
        # (getImgIds() builds a list each call; cache it as a set once)
        if not hasattr(self, "img_id_set"):
            self.img_id_set = set(self.getImgIds())
        assert set(annsImgIds).issubset(
            self.img_id_set
        ), "Results do not correspond to current coco set"
        # END MODIFICATION
        if "caption" in anns[0]:
            # Caption results: keep only images that actually have predictions.
            imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
                [ann["image_id"] for ann in anns]
            )
            res.dataset["images"] = [
                img for img in res.dataset["images"] if img["id"] in imgIds
            ]
            for id, ann in enumerate(anns):
                ann["id"] = id + 1
        elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
            # Box results: synthesize segmentation polygons and areas from boxes.
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                bb = ann["bbox"]
                x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
                if not "segmentation" in ann:
                    ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
                ann["area"] = bb[2] * bb[3]
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "segmentation" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                # now only support compressed RLE format as segmentation results
                ann["area"] = maskUtils.area(ann["segmentation"])
                if not "bbox" in ann:
                    ann["bbox"] = maskUtils.toBbox(ann["segmentation"])
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "keypoints" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                s = ann["keypoints"]
                x = s[0::3]
                y = s[1::3]
                x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
                ann["area"] = (x1 - x0) * (y1 - y0)
                ann["id"] = id + 1
                ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
        print("DONE (t={:0.2f}s)".format(time.time() - tic))
        res.dataset["annotations"] = anns
        # MODIFICATION: inherit images
        # (shares the already-built image index so createIndex skips rebuilding it)
        res.imgs = self.imgs
        # END MODIFICATION
        res.createIndex()
        return res
class CGF1Eval(COCOeval):
    """
    This evaluator is based upon COCO evaluation, but evaluates the model in a more realistic setting
    for downstream applications.
    See SAM3 paper for the details on the CGF1 metric.
    Do not use this evaluator directly. Prefer the CGF1Evaluator wrapper.
    Notes:
    - This evaluator does not support per-category evaluation (in the way defined by pyCocotools)
    - In open vocabulary settings, we have different noun-phrases for each image. What we call an "image_id" here is actually an (image, noun-phrase) pair. So in every "image_id" there is only one category, implied by the noun-phrase. Thus we can ignore the usual coco "category" field of the predictions
    """

    def __init__(
        self,
        coco_gt=None,
        coco_dt=None,
        iouType="segm",
        threshold=0.5,
    ):
        """
        Args:
            coco_gt (COCO): ground truth COCO API
            coco_dt (COCO): detections COCO API
            iouType (str): type of IoU to evaluate ("segm" or "bbox")
            threshold (float): score threshold; only predictions with
                score >= threshold are kept during evaluation
        """
        super().__init__(coco_gt, coco_dt, iouType)
        self.threshold = threshold
        # Per-category evaluation is not supported (see class docstring).
        self.params.useCats = False
        # Single "all" area range; keep effectively every detection.
        self.params.areaRng = [[0**2, 1e5**2]]
        self.params.areaRngLbl = ["all"]
        self.params.maxDets = [1000000]

    def computeIoU(self, imgId, catId):
        # Same as the original COCOeval.computeIoU, but without sorting
        # (matching is done with the Hungarian algorithm in evaluateImg,
        # so the score-based sort and maxDets truncation are unnecessary).
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []
        if p.iouType == "segm":
            g = [g["segmentation"] for g in gt]
            d = [d["segmentation"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bbox"] for g in gt]
            d = [d["bbox"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")
        # compute iou between each dt and gt region
        iscrowd = [int(o["iscrowd"]) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious

    def evaluateImg(self, imgId, catId, aRng, maxDet):
        """
        perform evaluation for single category and image
        :return: dict (single image results)
        """
        p = self.params
        assert not p.useCats, "This evaluator does not support per-category evaluation."
        assert catId == -1
        # Gather GTs/DTs across the dummy category; drop ignored GTs and
        # low-scoring DTs (hard score threshold instead of score sweeping).
        all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
        keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
        gt = [g for g in all_gts if not g["ignore"]]
        all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
        dt = [d for d in all_dts if d["score"] >= self.threshold]
        if len(gt) == 0 and len(dt) == 0:
            # This is a "true negative" case, where there are no GTs and no predictions
            # The box-level metrics are ill-defined, so we don't add them to this dict
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 1,
                "IL_FP": 0,
                "IL_FN": 0,
                "num_dt": len(dt),
            }
        if len(gt) > 0 and len(dt) == 0:
            # This is a "false negative" case, where there are GTs but no predictions
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 0,
                "IL_FP": 0,
                "IL_FN": 1,
                "TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
                "local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }
        # Load pre-computed ious
        ious = self.ious[(imgId, catId)]
        # compute matching (one-to-one, maximizing total IoU via Hungarian
        # assignment, instead of the greedy score-ordered COCO matching)
        if len(ious) == 0:
            ious = np.zeros((len(dt), len(gt)))
        else:
            # self.ious was computed on ALL gts/dts; subset to kept ones.
            ious = ious[keep_dt, :][:, keep_gt]
        assert ious.shape == (len(dt), len(gt))
        matched_dt, matched_gt = linear_sum_assignment(-ious)
        match_scores = ious[matched_dt, matched_gt]
        TPs, FPs, FNs = [], [], []
        IL_perfect = []
        for thresh in p.iouThrs:
            # A matched pair counts as TP only if its IoU clears the threshold.
            TP = (match_scores >= thresh).sum()
            FP = len(dt) - TP
            FN = len(gt) - TP
            assert (
                FP >= 0 and FN >= 0
            ), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
            TPs.append(TP)
            FPs.append(FP)
            FNs.append(FN)
            if FP == FN and FP == 0:
                IL_perfect.append(1)
            else:
                IL_perfect.append(0)
        TPs = np.array(TPs, dtype=np.int64)
        FPs = np.array(FPs, dtype=np.int64)
        FNs = np.array(FNs, dtype=np.int64)
        IL_perfect = np.array(IL_perfect, dtype=np.int64)
        # compute precision recall and F1 (epsilon avoids division by zero)
        precision = TPs / (TPs + FPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        result = {
            "image_id": imgId,
            "TPs": TPs,
            "FPs": FPs,
            "FNs": FNs,
            "local_F1s": F1,
            # Image-level confusion entries: "positive" = at least one GT.
            "IL_TP": (len(gt) > 0) and (len(dt) > 0),
            "IL_FP": (len(gt) == 0) and (len(dt) > 0),
            "IL_TN": (len(gt) == 0) and (len(dt) == 0),
            "IL_FN": (len(gt) > 0) and (len(dt) == 0),
            "num_dt": len(dt),
        }
        if len(gt) > 0 and len(dt) > 0:
            # Only defined when both GTs and predictions exist ("positive" images).
            result["local_positive_F1s"] = F1
        return result

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        """
        if self.evalImgs is None or len(self.evalImgs) == 0:
            print("Please run evaluate() first")
        # allows input customized parameters
        if p is None:
            p = self.params
        setImgIds = set(p.imgIds)
        # TPs, FPs, FNs (one entry per IoU threshold)
        TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        # FPs counted only on "positive" images (those with at least one GT).
        pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)
        # Image level metrics
        IL_TPs = 0
        IL_FPs = 0
        IL_TNs = 0
        IL_FNs = 0
        valid_img_count = 0
        valid_F1_count = 0
        evaledImgIds = set()
        for res in self.evalImgs:
            if res["image_id"] not in setImgIds:
                continue
            evaledImgIds.add(res["image_id"])
            IL_TPs += res["IL_TP"]
            IL_FPs += res["IL_FP"]
            IL_TNs += res["IL_TN"]
            IL_FNs += res["IL_FN"]
            if "TPs" not in res:
                # true-negative images carry no box-level counts
                continue
            TPs += res["TPs"]
            FPs += res["FPs"]
            FNs += res["FNs"]
            valid_img_count += 1
            if "local_positive_F1s" in res:
                local_F1s += res["local_positive_F1s"]
                pmFPs += res["FPs"]
            if res["num_dt"] > 0:
                valid_F1_count += 1
        assert len(setImgIds - evaledImgIds) == 0, (
            f"{len(setImgIds - evaledImgIds)} images not evaluated. "
            f"Here are the IDs of the first 3: {list(setImgIds - evaledImgIds)[:3]}"
        )
        # compute precision recall and F1
        precision = TPs / (TPs + FPs + 1e-4)
        positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        positive_micro_F1 = (
            2
            * positive_micro_precision
            * recall
            / (positive_micro_precision + recall + 1e-4)
        )
        IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
        IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
        IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
        IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
        # Matthews correlation coefficient on the image-level confusion matrix.
        IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
            (
                float(IL_TPs + IL_FPs)
                * float(IL_TPs + IL_FNs)
                * float(IL_TNs + IL_FPs)
                * float(IL_TNs + IL_FNs)
            )
            ** 0.5
            + 1e-6
        )
        self.eval = {
            "params": p,
            "TPs": TPs,
            "FPs": FPs,
            "positive_micro_FPs": pmFPs,
            "FNs": FNs,
            "precision": precision,
            "positive_micro_precision": positive_micro_precision,
            "recall": recall,
            "F1": F1,
            "positive_micro_F1": positive_micro_F1,
            # NOTE(review): the numerator sums F1 only over images with both GTs
            # and predictions, while the denominator also counts images with
            # predictions but no "local_positive_F1s" — confirm this is intended.
            "positive_macro_F1": local_F1s / valid_F1_count,
            "IL_recall": IL_rec,
            "IL_precision": IL_prec,
            "IL_F1": IL_F1,
            "IL_FPR": IL_FPR,
            "IL_MCC": IL_MCC,
        }
        # cgF1 = box-level positive micro F1 modulated by image-level MCC.
        self.eval["cgF1"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]

    def summarize(self):
        """
        Compute and display summary metrics for evaluation results.
        """
        if not self.eval:
            raise Exception("Please run accumulate() first")

        def _summarize(iouThr=None, metric=""):
            # Print and return a box-level metric, averaged over all IoU
            # thresholds when iouThr is None, else picked at that threshold.
            p = self.params
            iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
            titleStr = "Average " + metric
            iouStr = (
                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
                if iouThr is None
                else "{:0.2f}".format(iouThr)
            )
            s = self.eval[metric]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, iouStr, mean_s))
            return mean_s

        def _summarize_single(metric=""):
            # Print and return a scalar (image-level) metric.
            titleStr = "Average " + metric
            iStr = " {:<35} = {:0.3f}"
            s = self.eval[metric]
            print(iStr.format(titleStr, s))
            return s

        def _summarizeDets():
            # Emit stats in the exact order of CGF1_METRICS.
            stats = []
            for metric in CGF1_METRICS:
                if metric.image_level:
                    stats.append(_summarize_single(metric=metric.name))
                else:
                    stats.append(
                        _summarize(iouThr=metric.iou_threshold, metric=metric.name)
                    )
            return np.asarray(stats)

        summarize = _summarizeDets
        self.stats = summarize()
def _evaluate(self):
    """Run per-image evaluation on ``self`` (a CGF1Eval instance).

    Mirrors COCOeval.evaluate() but returns the results instead of only
    storing them, and reshapes them into [numCats, numAreas, numImgs].

    Returns:
        (imgIds, evalImgs) where imgIds is the deduplicated, sorted image id
        list and evalImgs is an object ndarray of per-image result dicts.
    """
    params = self.params
    # Deduplicate/sort image ids and force category-agnostic evaluation.
    params.imgIds = list(np.unique(params.imgIds))
    params.useCats = False
    params.maxDets = sorted(params.maxDets)
    self.params = params
    self._prepare()
    # Only the dummy category -1 is evaluated (useCats is False).
    cat_ids = [-1]
    if params.iouType not in ("segm", "bbox"):
        raise RuntimeError(f"Unsupported iou {params.iouType}")
    iou_fn = self.computeIoU
    self.ious = {}
    for img_id in params.imgIds:
        for cat_id in cat_ids:
            self.ious[(img_id, cat_id)] = iou_fn(img_id, cat_id)
    top_k = params.maxDets[-1]
    per_image = []
    for cat_id in cat_ids:
        for area_rng in params.areaRng:
            for img_id in params.imgIds:
                per_image.append(self.evaluateImg(img_id, cat_id, area_rng, top_k))
    # this is NOT in the pycocotools code, but could be done outside
    per_image = np.asarray(per_image).reshape(
        len(cat_ids), len(params.areaRng), len(params.imgIds)
    )
    return params.imgIds, per_image
class CGF1Evaluator:
    """
    Wrapper class for cgF1 evaluation.
    This supports the oracle setting (when several ground-truths are available per image)
    """

    def __init__(
        self,
        gt_path: Union[str, List[str]],
        iou_type="segm",
        verbose=False,
    ):
        """
        Args:
            gt_path (str or list of str): path(s) to ground truth COCO json file(s).
                Several paths enable the "oracle" setting: each image is scored
                against every GT and the best scoring is kept.
            iou_type (str): type of IoU to evaluate ("segm" or "bbox")
            verbose (bool): if True, print progress information
        """
        self.gt_paths = gt_path if isinstance(gt_path, list) else [gt_path]
        self.iou_type = iou_type
        self.coco_gts = [COCOCustom(gt) for gt in self.gt_paths]
        self.verbose = verbose
        # One CGF1Eval per ground truth (oracle setting).
        self.coco_evals = []
        for i, coco_gt in enumerate(self.coco_gts):
            self.coco_evals.append(
                CGF1Eval(
                    coco_gt=coco_gt,
                    iouType=iou_type,
                )
            )
            self.coco_evals[i].useCats = False
        exclude_img_ids = set()
        # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
        for coco_gt in self.coco_gts[1:]:
            exclude_img_ids = exclude_img_ids.union(
                {
                    img["id"]
                    for img in coco_gt.dataset["images"]
                    if not img["is_instance_exhaustive"]
                }
            )
        # we only eval on instance exhaustive queries
        self.eval_img_ids = [
            img["id"]
            for img in self.coco_gts[0].dataset["images"]
            if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
        ]

    def evaluate(self, pred_file: str):
        """
        Evaluate the detections using cgF1 metric.
        Args:
            pred_file: path to the predictions COCO json file
        Returns:
            dict mapping "cgF1_eval_{iou_type}_{metric}" to a float value,
            one entry per metric in CGF1_METRICS.
        """
        assert len(self.coco_gts) > 0, "No ground truth provided for evaluation."
        assert len(self.coco_gts) == len(
            self.coco_evals
        ), "Mismatch in number of ground truths and evaluators."
        if self.verbose:
            print(f"Loading predictions from {pred_file}")
        with open(pred_file, "r") as f:
            preds = json.load(f)
        if self.verbose:
            print(f"Loaded {len(preds)} predictions")
        # Group predictions by image for per-image evaluation.
        img2preds = defaultdict(list)
        for pred in preds:
            img2preds[pred["image_id"]].append(pred)
        all_eval_imgs = []
        for img_id in tqdm(self.eval_img_ids, disable=not self.verbose):
            results = img2preds[img_id]
            all_scorings = []
            # Score this image against every available ground truth.
            for cur_coco_gt, coco_eval in zip(self.coco_gts, self.coco_evals):
                # suppress pycocotools prints
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        coco_dt = (
                            cur_coco_gt.loadRes(results) if results else COCOCustom()
                        )
                        coco_eval.cocoDt = coco_dt
                        coco_eval.params.imgIds = [img_id]
                        coco_eval.params.useCats = False
                        # module-level helper defined above (not a method)
                        img_ids, eval_imgs = _evaluate(coco_eval)
                        all_scorings.append(eval_imgs)
            selected = self._select_best_scoring(all_scorings)
            all_eval_imgs.append(selected)
        # After this point, we have selected the best scoring per image among several ground truths
        # we can now accumulate and summarize, using only the first coco_eval
        self.coco_evals[0].evalImgs = list(
            np.concatenate(all_eval_imgs, axis=2).flatten()
        )
        self.coco_evals[0].params.imgIds = self.eval_img_ids
        self.coco_evals[0]._paramsEval = copy.deepcopy(self.coco_evals[0].params)
        if self.verbose:
            print(f"Accumulating results")
        self.coco_evals[0].accumulate()
        print("cgF1 metric, IoU type={}".format(self.iou_type))
        self.coco_evals[0].summarize()
        print()
        # Name the stats after CGF1_METRICS (same order as summarize()).
        out = {}
        for i, value in enumerate(self.coco_evals[0].stats):
            name = CGF1_METRICS[i].name
            if CGF1_METRICS[i].iou_threshold is not None:
                name = f"{name}@{CGF1_METRICS[i].iou_threshold}"
            out[f"cgF1_eval_{self.iou_type}_{name}"] = float(value)
        return out

    @staticmethod
    def _select_best_scoring(scorings):
        # This function is used for "oracle" type evaluation.
        # It accepts the evaluation results with respect to several ground truths, and picks the best
        if len(scorings) == 1:
            return scorings[0]
        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"
        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"
        selected_imgs = []
        for img_id in range(scorings[0].shape[-1]):
            best = scorings[0][:, :, img_id]
            for scoring in scorings[1:]:
                current = scoring[:, :, img_id]
                if "local_F1s" in best[0, 0] and "local_F1s" in current[0, 0]:
                    # we were able to compute a F1 score for this particular image in both evaluations
                    # best["local_F1s"] contains the results at various IoU thresholds. We simply take the average for comparison
                    best_score = best[0, 0]["local_F1s"].mean()
                    current_score = current[0, 0]["local_F1s"].mean()
                    if current_score > best_score:
                        best = current
                else:
                    # If we're here, it means that in some evaluation we were not able to get a valid local F1
                    # This happens when both the predictions and targets are empty. In that case, we can assume it's a perfect prediction
                    if "local_F1s" not in current[0, 0]:
                        best = current
            selected_imgs.append(best)
        result = np.stack(selected_imgs, axis=-1)
        assert result.shape == scorings[0].shape
        return result

916
sam3/eval/coco_eval.py Normal file
View File

@@ -0,0 +1,916 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
COCO evaluator that works in distributed mode.
Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
The difference is that there is less copy-pasting from pycocotools
in the end of the file, as python3 can suppress prints with contextlib
"""
import contextlib
import copy
import json
import logging
import os
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Any, List, Optional
import numpy as np
import pycocotools.mask as mask_utils
import torch
from iopath.common.file_io import g_pathmgr
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from sam3.train.masks_ops import rle_encode
from sam3.train.utils.distributed import (
all_gather,
gather_to_rank_0_via_filesys,
get_rank,
is_main_process,
)
# Maps the integer "rarity" value attached to GT images to a readable bucket name.
RARITY_BUCKETS = {0: "frequent", 1: "common", 2: "medium", 3: "rare"}
class CocoEvaluator:
    def __init__(
        self,
        coco_gt,
        iou_types: List[str],
        useCats: bool,
        dump_dir: Optional[str],
        postprocessor,
        average_by_rarity=False,
        metrics_dump_dir: Optional[str] = None,
        gather_pred_via_filesys=False,
        use_normalized_areas=True,
        # NOTE: mutable default; it is only read, never mutated in place.
        maxdets=[1, 10, 100],
        exhaustive_only=False,
        all_exhaustive_only=True,
    ):
        """Online coco evaluator. It will evaluate images as they are generated by the model, then accumulate/summarize at the end
        Args:
        - coco_gt: COCO api object containing the gt (or a path string, or a list
          of either for oracle-style evaluation against several ground truths)
        - iou_types: can be either "bbox" or "segm"
        - useCats: If true, categories will be used for evaluation
        - dump_dir: if non null, then the predictions will be dumped in that directory
        - postprocessor: Module to convert the model's output into the coco format
        - average_by_rarity: if true then we expect the images information in the gt dataset
          to have a "rarity" field. Then the AP will be computed on all rarity buckets
          individually, then averaged
        - metrics_dump_dir: if non null, directory where metrics can be dumped
        - gather_pred_via_filesys: if true, we use the filesystem for collective gathers
        - use_normalized_areas: if true, the areas of the objects in the GT are assumed to be
          normalized by the area of the image. In that case, the size buckets are adjusted
        - maxdets: maximal number of detections to be evaluated on each image.
        - exhaustive_only: If true, we restrict eval only to exhaustive annotations
        - all_exhaustive_only: If true, datapoints are restricted only to those with all exhaustive annotations
        """
        # coco_gt = copy.deepcopy(coco_gt)
        self.coco_gts = [coco_gt] if not isinstance(coco_gt, list) else coco_gt
        assert len(maxdets) == 3, f"expecting 3 detection threshold, got {len(maxdets)}"
        self.use_normalized_areas = use_normalized_areas
        self.iou_types = iou_types
        self.useCats = useCats
        self.maxdets = maxdets
        self.dump = None
        self.dump_dir = dump_dir
        if self.dump_dir is not None:
            self.dump = []
            if is_main_process():
                if not os.path.exists(self.dump_dir):
                    os.makedirs(self.dump_dir, exist_ok=True)
                    logging.info(f"Create the folder: {dump_dir}")
        # Heavy setup (loading GT json, building COCOeval objects) is deferred
        # to _lazy_init so constructing the evaluator stays cheap.
        self.initialized = False
        # Whether to gather predictions through filesystem (instead of torch
        # collective ops; requiring a shared filesystem across all ranks)
        self.gather_pred_via_filesys = gather_pred_via_filesys
        self.use_self_evaluate = True  # CPP version is disabled
        self.postprocessor = postprocessor
        self.average_by_rarity = average_by_rarity
        self.exhaustive_only = exhaustive_only
        self.all_exhaustive_only = all_exhaustive_only
        self.metrics_dump_dir = metrics_dump_dir
        if self.metrics_dump_dir is not None:
            if is_main_process():
                if not os.path.exists(self.metrics_dump_dir):
                    os.makedirs(self.metrics_dump_dir, exist_ok=True)
                    logging.info(f"Create the folder: {metrics_dump_dir}")
    def _lazy_init(self, coco_cls=COCO):
        """Load GT files, build COCOeval objects and eval-image filters.

        Idempotent: runs once, subsequent calls return immediately.
        """
        if self.initialized:
            return
        self.initialized = True
        # GT entries may be path strings; load them through coco_cls.
        self.coco_gts = [
            coco_cls(g_pathmgr.get_local_path(gt)) if isinstance(gt, str) else gt
            for gt in self.coco_gts
        ]
        self.reset()
        self.eval_img_ids = None
        if self.exhaustive_only:
            exclude_img_ids = set()
            # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
            if self.all_exhaustive_only:
                for coco_gt in self.coco_gts[1:]:
                    exclude_img_ids = exclude_img_ids.union(
                        {
                            img["id"]
                            for img in coco_gt.dataset["images"]
                            if not img["is_instance_exhaustive"]
                        }
                    )
            # we only eval on instance exhaustive queries
            self.eval_img_ids = [
                img["id"]
                for img in self.coco_gts[0].dataset["images"]
                if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
            ]
        self.rarity_buckets = None
        if self.average_by_rarity:
            # Group eligible image ids by their "rarity" field for per-bucket AP.
            self.rarity_buckets = defaultdict(list)
            eval_img_ids_set = (
                set(self.eval_img_ids) if self.eval_img_ids is not None else None
            )
            for img in self.coco_gts[0].dataset["images"]:
                if self.eval_img_ids is not None and img["id"] not in eval_img_ids_set:
                    continue
                self.rarity_buckets[img["rarity"]].append(img["id"])
            print("Rarity buckets sizes:")
            for k, v in self.rarity_buckets.items():
                print(f"{k}: {len(v)}")
    def set_sync_device(self, device: torch.device) -> None:
        # Remember the device used for synchronization/collective ops.
        self._sync_device = device
    def _evaluate(self, *args, **kwargs):
        # Thin indirection over the module-level ``evaluate`` helper
        # (defined elsewhere in this file), overridable by subclasses.
        return evaluate(*args, **kwargs)
    def _loadRes(self, *args, **kwargs):
        # Thin indirection over the module-level ``loadRes`` helper
        # (defined elsewhere in this file), overridable by subclasses.
        return loadRes(*args, **kwargs)
    def update(self, *args, **kwargs):
        """Evaluate one batch of model outputs and buffer the per-image results.

        Arguments are forwarded to the postprocessor, which returns a
        {image_id: prediction} mapping in coco format.
        """
        self._lazy_init()
        predictions = self.postprocessor.process_results(*args, **kwargs)
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)
        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)
            self._dump(results)
            assert len(self.coco_gts) == len(self.coco_evals)
            all_scorings = []
            # Score against every ground truth (oracle setting).
            for cur_coco_gt, cur_coco_eval in zip(self.coco_gts, self.coco_evals):
                # suppress pycocotools prints
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        coco_dt = (
                            self._loadRes(cur_coco_gt, results) if results else COCO()
                        )
                        coco_eval = cur_coco_eval[iou_type]
                        coco_eval.cocoDt = coco_dt
                        coco_eval.params.imgIds = list(img_ids)
                        coco_eval.params.useCats = self.useCats
                        coco_eval.params.maxDets = self.maxdets
                        # NOTE: rebinds ``img_ids`` with the (deduplicated)
                        # ids returned by the evaluation helper.
                        img_ids, eval_imgs = self._evaluate(coco_eval, self.use_self_evaluate)
                        all_scorings.append(eval_imgs)
            selected = self.select_best_scoring(all_scorings)
            self.eval_imgs[iou_type].append(selected)
def select_best_scoring(self, scorings):
# This function is used for "oracle" type evaluation.
# It accepts the evaluation results with respect to several ground truths, and picks the best
if len(scorings) == 1:
return scorings[0]
# Currently we don't support Oracle Phrase AP.
# To implement it, we likely need to modify the cpp code since the eval_image type is opaque
raise RuntimeError("Not implemented")
def _dump(self, results):
if self.dump is not None:
dumped_results = copy.deepcopy(results)
for r in dumped_results:
if "bbox" not in self.iou_types and "bbox" in r:
del r["bbox"]
elif "bbox" in r:
r["bbox"] = [round(coord, 5) for coord in r["bbox"]]
r["score"] = round(r["score"], 5)
self.dump.extend(dumped_results)
    def synchronize_between_processes(self):
        """Merge per-rank eval results into rank 0 and dump local predictions.

        Concatenates this rank's buffered eval images along the image axis,
        then hands them to ``create_common_coco_eval`` (module-level helper)
        which gathers across ranks.
        """
        self._lazy_init()
        logging.info("Coco evaluator: Synchronizing between processes")
        for iou_type in self.iou_types:
            if len(self.eval_imgs[iou_type]) > 0:
                # axis 2 is the image axis of the [numCats, numAreas, numImgs] arrays
                self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            else:
                num_areas = len(self.coco_evals[0][iou_type].params.areaRng)
                # assuming 1 class
                assert not self.useCats
                self.eval_imgs[iou_type] = np.empty((1, num_areas, 0))
            create_common_coco_eval(
                self.coco_evals[0][iou_type],
                self.img_ids,
                self.eval_imgs[iou_type],
                use_self_evaluate=self.use_self_evaluate,
                gather_pred_via_filesys=self.gather_pred_via_filesys,
                metrics_dump_dir=self.metrics_dump_dir,
            )
        if self.dump is not None:
            # Each rank writes its own prediction file (no cross-rank gather).
            dumped_file = Path(self.dump_dir) / f"coco_predictions_{get_rank()}.json"
            logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
            with g_pathmgr.open(str(dumped_file), "w") as f:
                json.dump(self.dump, f)
            # if self.gather_pred_via_filesys:
            #     dump = gather_to_rank_0_via_filesys(self.dump)
            # else:
            #     dump = all_gather(self.dump, force_cpu=True)
            # self.dump = sum(dump, [])
    def accumulate(self, imgIds=None):
        """Accumulate the per-image results, optionally restricted to ``imgIds``.

        Only runs on the main process. When ``imgIds`` is given, the evaluator's
        image list and eval images are temporarily masked down to that subset,
        then restored after accumulation.
        """
        self._lazy_init()
        logging.info(
            f"Coco evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
        )
        if not is_main_process():
            return
        if imgIds is None:
            for coco_eval in self.coco_evals[0].values():
                # module-level accumulate helper (defined elsewhere in this file)
                accumulate(coco_eval, use_self_eval=self.use_self_evaluate)
        if imgIds is not None:
            imgIds = set(imgIds)
            for coco_eval in self.coco_evals[0].values():
                p = coco_eval.params
                # Boolean mask over the evaluator's full image list.
                id_mask = np.array([(i in imgIds) for i in p.imgIds], dtype=bool)
                old_img_ids = p.imgIds
                coco_eval.params.imgIds = np.asarray(p.imgIds)[id_mask]
                old_img_evals = coco_eval.evalImgs
                catIds = p.catIds if p.useCats else [-1]
                # evalImgs is flat in [cat, area, img] order; reshape so the
                # image axis can be masked, then flatten back.
                coco_eval.evalImgs = list(
                    np.asarray(coco_eval.evalImgs)
                    .reshape(len(catIds), len(p.areaRng), len(old_img_ids))[
                        ..., id_mask
                    ]
                    .flatten()
                )
                accumulate(coco_eval, use_self_eval=self.use_self_evaluate)
                # Restore the unmasked state so accumulate can be called again.
                coco_eval.evalImgs = old_img_evals
                coco_eval.params.imgIds = old_img_ids
    def summarize(self):
        """Accumulate (if needed) and print summary metrics; return them as a dict.

        Only the main process returns a populated dict; other ranks return {}.
        When rarity buckets are enabled, metrics are computed per bucket and
        also averaged across buckets.
        """
        self._lazy_init()
        logging.info("Coco evaluator: Summarizing")
        if not is_main_process():
            return {}
        outs = {}
        if self.rarity_buckets is None:
            self.accumulate(self.eval_img_ids)
            for iou_type, coco_eval in self.coco_evals[0].items():
                print("IoU metric: {}".format(iou_type))
                # module-level summarize helper (defined elsewhere in this file);
                # it fills coco_eval.stats with (keys, values)
                summarize(coco_eval)
            if "bbox" in self.coco_evals[0]:
                for key, value in zip(*self.coco_evals[0]["bbox"].stats):
                    outs[f"coco_eval_bbox_{key}"] = value
            if "segm" in self.coco_evals[0]:
                for key, value in zip(*self.coco_evals[0]["segm"].stats):
                    outs[f"coco_eval_masks_{key}"] = value
        else:
            # Per-rarity-bucket evaluation: accumulate/summarize once per bucket,
            # record bucket-prefixed metrics, and average the stat vectors.
            total_stats = {}
            all_keys = {}
            for bucket, img_list in self.rarity_buckets.items():
                self.accumulate(imgIds=img_list)
                bucket_name = RARITY_BUCKETS[bucket]
                for iou_type, coco_eval in self.coco_evals[0].items():
                    print(f"IoU metric: {iou_type}. Rarity bucket: {bucket_name}")
                    summarize(coco_eval)
                if "bbox" in self.coco_evals[0]:
                    if "bbox" not in total_stats:
                        total_stats["bbox"] = np.zeros_like(
                            self.coco_evals[0]["bbox"].stats[1]
                        )
                        all_keys["bbox"] = self.coco_evals[0]["bbox"].stats[0]
                    total_stats["bbox"] += self.coco_evals[0]["bbox"].stats[1]
                    for key, value in zip(*self.coco_evals[0]["bbox"].stats):
                        outs[f"coco_eval_bbox_{bucket_name}_{key}"] = value
                if "segm" in self.coco_evals[0]:
                    if "segm" not in total_stats:
                        total_stats["segm"] = np.zeros_like(
                            self.coco_evals[0]["segm"].stats[1]
                        )
                        all_keys["segm"] = self.coco_evals[0]["segm"].stats[0]
                    total_stats["segm"] += self.coco_evals[0]["segm"].stats[1]
                    for key, value in zip(*self.coco_evals[0]["segm"].stats):
                        outs[f"coco_eval_masks_{bucket_name}_{key}"] = value
            # Unprefixed metrics are the bucket averages.
            if "bbox" in total_stats:
                total_stats["bbox"] /= len(self.rarity_buckets)
                for key, value in zip(all_keys["bbox"], total_stats["bbox"]):
                    outs[f"coco_eval_bbox_{key}"] = value
            if "segm" in total_stats:
                total_stats["segm"] /= len(self.rarity_buckets)
                for key, value in zip(all_keys["segm"], total_stats["segm"]):
                    outs[f"coco_eval_masks_{key}"] = value
        # if self.dump is not None:
        #     assert self.dump_dir is not None
        #     logging.info("Coco evaluator: Dumping the global result file to disk")
        #     with g_pathmgr.open(str(Path(self.dump_dir) / "coco_eval.json"), "w") as f:
        #         json.dump(self.dump, f)
        return outs
    def compute_synced(self):
        """Synchronize results across ranks, then summarize and return metrics."""
        self._lazy_init()
        self.synchronize_between_processes()
        return self.summarize()
    def compute(self):
        # Placeholder metric dict; real results come from compute_synced().
        self._lazy_init()
        return {"": 0.0}
def reset(self, cocoeval_cls=COCOeval):
    """Re-create fresh COCOeval objects for every GT set and IoU type, and clear state.

    Args:
        cocoeval_cls: COCOeval-compatible class used to build the evaluators.
    """
    # Custom, image-relative area ranges used when use_normalized_areas is set.
    normalized_rngs = [
        [0, 1e5],
        [0, 0.001],
        [0.001, 0.01],
        [0.01, 0.1],
        [0.1, 0.5],
        [0.5, 0.95],
        [0.95, 1e5],
    ]
    normalized_lbls = [
        "all",
        "tiny",
        "small",
        "medium",
        "large",
        "huge",
        "whole_image",
    ]
    self.coco_evals = []
    for coco_gt in self.coco_gts:
        per_type = {}
        for iou_type in self.iou_types:
            evaluator = cocoeval_cls(coco_gt, iouType=iou_type)
            evaluator.params.useCats = self.useCats
            evaluator.params.maxDets = self.maxdets
            if self.use_normalized_areas:
                # Copy the lists so evaluators never share mutable params.
                evaluator.params.areaRng = [list(rng) for rng in normalized_rngs]
                evaluator.params.areaRngLbl = list(normalized_lbls)
            per_type[iou_type] = evaluator
        self.coco_evals.append(per_type)
    self.img_ids = []
    self.eval_imgs = {iou_type: [] for iou_type in self.iou_types}
    if self.dump is not None:
        self.dump = []
def write(self, stats):
    """Write the results in the stats dict"""
    # FIX: the docstring above was originally placed *after* the first
    # statement, where it is a no-op string expression rather than a
    # docstring; it has been moved to the top of the function.
    self._lazy_init()
    # NOTE(review): `.stats` is assumed to be an ndarray (as produced by
    # upstream COCOeval.summarize); the patched summarize in this file sets a
    # (keys, values) tuple instead, on which `.tolist()` would fail — confirm
    # which summarize is in effect before calling write().
    if "bbox" in self.coco_evals[0]:
        stats["coco_eval_bbox"] = self.coco_evals[0]["bbox"].stats.tolist()
    if "segm" in self.coco_evals[0]:
        stats["coco_eval_masks"] = self.coco_evals[0]["segm"].stats.tolist()
    return stats
def prepare(self, predictions, iou_type):
    """Dispatch `predictions` to the converter matching `iou_type`.

    Raises:
        ValueError: if `iou_type` is not one of "bbox", "segm", "keypoints".
    """
    self._lazy_init()
    method_by_type = {
        "bbox": "prepare_for_coco_detection",
        "segm": "prepare_for_coco_segmentation",
        "keypoints": "prepare_for_coco_keypoint",
    }
    try:
        method_name = method_by_type[iou_type]
    except KeyError:
        raise ValueError("Unknown iou type {}".format(iou_type)) from None
    return getattr(self, method_name)(predictions)
def prepare_for_coco_detection(self, predictions):
    """Convert per-image detection predictions into COCO-format result dicts.

    Args:
        predictions: mapping of image id -> dict with "boxes" (xyxy tensor),
            "scores" and "labels" tensors. Empty dicts are skipped.

    Returns:
        Flat list of {"image_id", "category_id", "bbox", "score"} dicts,
        with boxes in xywh format.
    """
    self._lazy_init()
    results = []
    for image_id, pred in predictions.items():
        if len(pred) == 0:
            continue
        # xyxy -> xywh, inlined here on the tensor before listification.
        xmin, ymin, xmax, ymax = pred["boxes"].unbind(-1)
        xywh = torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=-1).tolist()
        scores = pred["scores"].tolist()
        labels = pred["labels"].tolist()
        for idx, box in enumerate(xywh):
            results.append(
                {
                    "image_id": image_id,
                    "category_id": labels[idx],
                    "bbox": box,
                    "score": scores[idx],
                }
            )
    return results
@torch.no_grad()
def prepare_for_coco_segmentation(self, predictions):
    """Convert per-image segmentation predictions into COCO-format result dicts.

    Args:
        predictions: mapping of image id -> prediction dict with "scores" and
            "labels" tensors and either "masks_rle" (pre-encoded RLE dicts) or
            "masks" (dense scores, thresholded at 0.5 here). May also carry
            "boundaries"/"dilated_boundaries", which are forwarded per item.

    Returns:
        Flat list of COCO-style result dicts; "area" is normalized by the
        image size (h * w), i.e. a fraction in [0, 1].
    """
    self._lazy_init()
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        boundaries, dilated_boundaries = None, None
        if "boundaries" in prediction:
            boundaries = prediction["boundaries"]
            dilated_boundaries = prediction["dilated_boundaries"]
            assert dilated_boundaries is not None
            assert len(scores) == len(boundaries)
        if "masks_rle" in prediction:
            # Masks already RLE-encoded: normalized area comes from the RLE.
            rles = prediction["masks_rle"]
            areas = []
            for rle in rles:
                cur_area = mask_utils.area(rle)
                h, w = rle["size"]
                areas.append(cur_area / (h * w))
        else:
            # Dense masks: binarize at 0.5, compute areas, then RLE-encode.
            masks = prediction["masks"]
            masks = masks > 0.5
            h, w = masks.shape[-2:]
            areas = masks.flatten(1).sum(1) / (h * w)
            areas = areas.tolist()
            rles = rle_encode(masks.squeeze(1))
            # memory clean: dense masks can be large, free them eagerly
            # (note: this mutates the caller's prediction dict)
            del masks
            del prediction["masks"]
        assert len(areas) == len(rles) == len(scores)
        for k, rle in enumerate(rles):
            payload = {
                "image_id": original_id,
                "category_id": labels[k],
                "segmentation": rle,
                "score": scores[k],
                "area": areas[k],
            }
            if boundaries is not None:
                payload["boundary"] = boundaries[k]
                payload["dilated_boundary"] = dilated_boundaries[k]
            coco_results.append(payload)
    return coco_results
def prepare_for_coco_keypoint(self, predictions):
    """Convert per-image keypoint predictions into COCO-format result dicts.

    Args:
        predictions: mapping of image id -> dict with "scores", "labels" and
            "keypoints" tensors. Empty dicts are skipped.

    Returns:
        Flat list of {"image_id", "category_id", "keypoints", "score"} dicts,
        with keypoints flattened to a single list per instance.
    """
    self._lazy_init()
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        # FIX: the original also converted prediction["boxes"] to xywh here,
        # but the result was never used; that dead computation was removed.
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        keypoints = prediction["keypoints"]
        keypoints = keypoints.flatten(start_dim=1).tolist()
        coco_results.extend(
            [
                {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "keypoints": keypoint,
                    "score": scores[k],
                }
                for k, keypoint in enumerate(keypoints)
            ]
        )
    return coco_results
def convert_to_xywh(boxes):
    """Convert boxes from (xmin, ymin, xmax, ymax) to (x, y, width, height)."""
    top_left = boxes[..., :2]
    sizes = boxes[..., 2:] - top_left
    return torch.cat((top_left, sizes), dim=-1)
def merge(img_ids, eval_imgs, gather_pred_via_filesys=False):
    """Gather per-rank COCO eval results and merge them on the main process.

    Args:
        img_ids: image ids evaluated by this rank.
        eval_imgs: this rank's per-image evaluation array (images on axis 2).
        gather_pred_via_filesys: if True, gather through a shared filesystem
            instead of torch collective ops.

    Returns:
        (merged_img_ids, merged_eval_imgs) on the main process — deduplicated
        and sorted by image id; (None, None) on every other rank.
    """
    if gather_pred_via_filesys:
        # only gather the predictions to rank 0 (other ranks will receive empty
        # lists for `all_img_ids` and `all_eval_imgs`, which should be OK as
        # merging and evaluation are only done on rank 0)
        all_img_ids = gather_to_rank_0_via_filesys(img_ids)
        all_eval_imgs = gather_to_rank_0_via_filesys(eval_imgs)
    else:
        all_img_ids = all_gather(img_ids, force_cpu=True)
        all_eval_imgs = all_gather(eval_imgs, force_cpu=True)
    if not is_main_process():
        return None, None
    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)
    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)
    merged_img_ids = np.array(merged_img_ids)
    # per-rank arrays are stacked per-image along axis 2
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]
    return merged_img_ids, merged_eval_imgs
def create_common_coco_eval(
    coco_eval,
    img_ids,
    eval_imgs,
    use_self_evaluate,
    gather_pred_via_filesys=False,
    metrics_dump_dir=None,
):
    """Merge distributed eval results into `coco_eval` on the main process.

    Gathers per-rank (img_ids, eval_imgs), optionally dumps per-image metrics
    to `metrics_dump_dir`, evaluates empty (dummy) detections for any GT
    images that no rank predicted, and installs the merged results onto
    `coco_eval` so that `accumulate()` can run. No-op on non-main ranks.
    """
    img_ids, eval_imgs = merge(img_ids, eval_imgs, gather_pred_via_filesys)
    if not is_main_process():
        return
    if metrics_dump_dir is not None:
        dumped_file = (
            Path(metrics_dump_dir) / f"coco_eval_img_metrics_{get_rank()}.json"
        )
        logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(eval_imgs.squeeze(), f, default=lambda x: x.tolist())
    img_ids = list(img_ids)
    # If some images were not predicted, we need to create dummy detections for them
    missing_img_ids = set(coco_eval.cocoGt.getImgIds()) - set(img_ids)
    if len(missing_img_ids) > 0:
        print(f"WARNING: {len(missing_img_ids)} images were not predicted!")
        # An empty COCO() acts as a detection set with zero predictions.
        coco_eval.cocoDt = COCO()
        coco_eval.params.imgIds = list(missing_img_ids)
        new_img_ids, new_eval_imgs = evaluate(coco_eval, use_self_evaluate)
        img_ids.extend(new_img_ids)
        eval_imgs = np.concatenate((eval_imgs, new_eval_imgs), axis=2)
    eval_imgs = list(eval_imgs.flatten())
    # Every GT image must now be covered exactly once.
    assert len(img_ids) == len(coco_eval.cocoGt.getImgIds())
    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################
# Copy of COCO prepare, but doesn't convert anntoRLE
def segmentation_prepare(self):
    """
    Prepare ._gts and ._dts for evaluation based on params.
    Copy of COCOeval._prepare without the annotation->RLE mask conversion.
    :return: None
    """
    p = self.params
    if p.useCats:
        gts = self.cocoGt.loadAnns(
            self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
        dts = self.cocoDt.loadAnns(
            self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
    else:
        gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
        dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
    for gt in gts:
        # NOTE(review): the first assignment is immediately overwritten by the
        # next line — the same quirk exists in upstream pycocotools. "ignore"
        # effectively means iscrowd (or zero keypoints for keypoint eval).
        gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
        gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
        if p.iouType == "keypoints":
            gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
    self._gts = defaultdict(list)  # gt for evaluation
    self._dts = defaultdict(list)  # dt for evaluation
    for gt in gts:
        self._gts[gt["image_id"], gt["category_id"]].append(gt)
    for dt in dts:
        self._dts[dt["image_id"], dt["category_id"]].append(dt)
    self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
    self.eval = {}  # accumulated evaluation results
def evaluate(self, use_self_evaluate):
    """
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs

    When `use_self_evaluate` is True, returns (imgIds, evalImgs) with evalImgs
    reshaped to (num_cats, num_area_ranges, num_images). When False, the
    function currently returns None (the C++ evaluation path is disabled —
    see the commented-out code that follows this function).
    :return: None
    """
    # tic = time.time()
    # print('Running per image evaluation...', use_self_evaluate)
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = "segm" if p.useSegm == 1 else "bbox"
        print(
            "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType)
        )
    # print('Evaluate annotation type *{}*'.format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p
    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]
    if p.iouType == "segm" or p.iouType == "bbox":
        computeIoU = self.computeIoU
    elif p.iouType == "keypoints":
        computeIoU = self.computeOks
    # Precompute IoUs for every (image, category) pair.
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds
    }
    maxDet = p.maxDets[-1]
    if use_self_evaluate:
        evalImgs = [
            self.evaluateImg(imgId, catId, areaRng, maxDet)
            for catId in catIds
            for areaRng in p.areaRng
            for imgId in p.imgIds
        ]
        # this is NOT in the pycocotools code, but could be done outside
        evalImgs = np.asarray(evalImgs).reshape(
            len(catIds), len(p.areaRng), len(p.imgIds)
        )
        return p.imgIds, evalImgs
# <<<< Beginning of code differences with original COCO API
# def convert_instances_to_cpp(instances, is_det=False):
# # Convert annotations for a list of instances in an image to a format that's fast
# # to access in C++
# instances_cpp = []
# for instance in instances:
# instance_cpp = _CPP.InstanceAnnotation(
# int(instance["id"]),
# instance["score"] if is_det else instance.get("score", 0.0),
# instance["area"],
# bool(instance.get("iscrowd", 0)),
# bool(instance.get("ignore", 0)),
# )
# instances_cpp.append(instance_cpp)
# return instances_cpp
# # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
# ground_truth_instances = [
# [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
# for imgId in p.imgIds
# ]
# detected_instances = [
# [
# convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
# for catId in p.catIds
# ]
# for imgId in p.imgIds
# ]
# ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
# if not p.useCats:
# # For each image, flatten per-category lists into a single list
# ground_truth_instances = [
# [[o for c in i for o in c]] for i in ground_truth_instances
# ]
# detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
# # Call C++ implementation of self.evaluateImgs()
# _evalImgs_cpp = _CPP.COCOevalEvaluateImages(
# p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
# )
# self._paramsEval = copy.deepcopy(self.params)
# evalImgs = np.asarray(_evalImgs_cpp).reshape(
# len(catIds), len(p.areaRng), len(p.imgIds)
# )
# return p.imgIds, evalImgs
#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
#################################################################
# From pycocotools, but disabled mask->box conversion which is
# pointless
#################################################################
def loadRes(self, resFile):
    """
    Load result file and return a result api object.
    Copy of COCO.loadRes with the pointless mask->bbox conversion disabled.
    :param resFile (str) : file name of result file
    :return: res (obj) : result api object
    """
    res = COCO()
    res.dataset["images"] = [img for img in self.dataset["images"]]
    # Results can come as a JSON file path, a numpy array, or an already
    # loaded list of annotation dicts.
    if type(resFile) == str:
        anns = json.load(open(resFile))
    elif type(resFile) == np.ndarray:
        anns = self.loadNumpyAnnotations(resFile)
    else:
        anns = resFile
    assert type(anns) == list, "results in not an array of objects"
    annsImgIds = [ann["image_id"] for ann in anns]
    assert set(annsImgIds) == (
        set(annsImgIds) & set(self.getImgIds())
    ), "Results do not correspond to current coco set"
    if "caption" in anns[0]:
        # Caption results: keep only images present in both GT and results.
        imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
            [ann["image_id"] for ann in anns]
        )
        res.dataset["images"] = [
            img for img in res.dataset["images"] if img["id"] in imgIds
        ]
        for id, ann in enumerate(anns):
            ann["id"] = id + 1
    elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
        # Box results: synthesize a polygon segmentation and area from the box.
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            bb = ann["bbox"]
            x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
            if "segmentation" not in ann:
                ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
            ann["area"] = bb[2] * bb[3]
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "segmentation" in anns[0]:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # now only support compressed RLE format as segmentation results
            # ann["area"] = mask_util.area(ann["segmentation"])
            # The following lines are disabled because they are pointless
            # if not 'bbox' in ann:
            #     ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "keypoints" in anns[0]:
        # Keypoint results: derive a bbox/area from the keypoint extents.
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            s = ann["keypoints"]
            x = s[0::3]
            y = s[1::3]
            x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
            ann["area"] = (x1 - x0) * (y1 - y0)
            ann["id"] = id + 1
            ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
    res.dataset["annotations"] = anns
    res.createIndex()
    return res
#################################################################
# end of straight copy from pycocotools
#################################################################
#################################################################
# From pycocotools, but added handling of custom area rngs, and returns stat keys
#################################################################
def summarize(self):
    """
    Compute and display summary metrics for evaluation results.
    Note this functin can *only* be applied on the default parameter setting.
    Unlike upstream COCOeval, also returns metric names alongside values and
    handles custom area ranges (see reset()'s normalized area ranges).
    """

    def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
        # Prints one summary line and returns the mean precision/recall over
        # the selected IoU threshold(s), area-range slice and maxDets slice.
        p = self.params
        iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
        titleStr = "Average Precision" if ap == 1 else "Average Recall"
        typeStr = "(AP)" if ap == 1 else "(AR)"
        iouStr = (
            "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
            if iouThr is None
            else "{:0.2f}".format(iouThr)
        )
        aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
        mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
        if ap == 1:
            # dimension of precision: [TxRxKxAxM]
            s = self.eval["precision"]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, :, aind, mind]
        else:
            # dimension of recall: [TxKxAxM]
            s = self.eval["recall"]
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, aind, mind]
        # -1 marks empty cells; average only over populated ones.
        if len(s[s > -1]) == 0:
            mean_s = -1
        else:
            mean_s = np.mean(s[s > -1])
        print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
        return mean_s

    def _summarizeDets():
        # Builds (keys, stats): AP overall / at 0.5 / at 0.75, AP per
        # area-range label, then AR at each maxDets setting and per area.
        nb_results = 6 + (len(self.params.areaRng) - 1) * 2
        assert len(self.params.areaRng) == len(self.params.areaRngLbl)
        stats = np.zeros((nb_results,))
        keys = ["AP", "AP_50", "AP_75"]
        stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
        stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
        stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
        cur_id = 3
        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(1, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AP_{area}")
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[0])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[1])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[2])
        cur_id += 1
        # NOTE(review): these three values are AR at the three maxDets
        # settings, though the key names suggest IoU thresholds — verify
        # downstream consumers interpret "AR_50"/"AR_75" accordingly.
        keys += ["AR", "AR_50", "AR_75"]
        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(0, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AR_{area}")
        assert len(stats) == len(keys)
        return keys, stats

    if not self.eval:
        raise Exception("Please run accumulate() first")
    # NOTE(review): unlike upstream COCOeval (stats is an ndarray), here
    # self.stats is a (keys, values) tuple — callers must unpack accordingly.
    self.stats = _summarizeDets()
#################################################################
# end of straight copy from pycocotools
#################################################################
#################################################################
# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py
# with slight adjustments
#################################################################
def accumulate(self, use_self_eval=False):
    """
    Accumulate per image evaluation results and store the result in self.eval. Does not
    support changing parameter settings from those used by self.evaluate()
    """
    if use_self_eval:
        # NOTE(review): delegates to the instance's own `accumulate`. If this
        # function is ever installed on the object as its `accumulate` method,
        # this call recurses infinitely — verify it always resolves to the
        # original COCOeval.accumulate.
        self.accumulate()
        return
    # The C++ accumulation path is disabled (see the commented-out code that
    # follows), so when use_self_eval is False this is currently a no-op.
# CPP code is disabled
# self.eval = _CPP.COCOevalAccumulate(self.params, self.evalImgs)
# # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
# self.eval["recall"] = np.array(self.eval["recall"]).reshape(
# self.eval["counts"][:1] + self.eval["counts"][2:]
# )
# # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
# # num_area_ranges X num_max_detections
# self.eval["precision"] = np.array(self.eval["precision"]).reshape(
# self.eval["counts"]
# )
# self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])

View File

@@ -0,0 +1,181 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
This evaluator is meant for regular COCO mAP evaluation, for example on the COCO val set.
For Category mAP, we need the model to make predictions for all the categories on every single image.
In general, since the number of classes can be big, and the API model makes predictions individually for each pair (image, class),
we may need to split the inference process for a given image in several chunks.
"""
import logging
from collections import defaultdict
import torch
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from sam3.train.utils.distributed import is_main_process
try:
from tidecv import datasets, TIDE
HAS_TIDE = True
except ImportError:
HAS_TIDE = False
print("WARNING: TIDE not installed. Detailed analysis will not be available.")
# the COCO detection metrics (https://github.com/cocodataset/cocoapi/blob/8c9bcc3cf640524c4c20a9c40e89cb6a2f2fa0e9/PythonAPI/pycocotools/cocoeval.py#L460-L471)
COCO_METRICS = [
"AP",
"AP_50",
"AP_75",
"AP_small",
"AP_medium",
"AP_large",
"AR_maxDets@1",
"AR_maxDets@10",
"AR_maxDets@100",
"AR_small",
"AR_medium",
"AR_large",
]
def convert_to_xywh(boxes):
    """Convert bounding boxes from xyxy format to xywh format."""
    xmin, ymin, xmax, ymax = boxes.unbind(-1)
    width = xmax - xmin
    height = ymax - ymin
    return torch.stack((xmin, ymin, width, height), dim=-1)
class HeapElement:
    """Utility wrapper ordering heap entries by their "score" field."""

    def __init__(self, val):
        self.val = val

    def __lt__(self, other):
        own_score = self.val["score"]
        other_score = other.val["score"]
        return own_score < other_score
class COCOevalCustom(COCOeval):
    """
    This is a slightly modified version of the original COCO API with added support for positive split evaluation.
    """

    def __init__(
        self, cocoGt=None, cocoDt=None, iouType="segm", dt_only_positive=False
    ):
        # dt_only_positive=True keeps only detections whose category also has
        # at least one ground-truth instance in the same image.
        super().__init__(cocoGt, cocoDt, iouType)
        self.dt_only_positive = dt_only_positive

    def _prepare(self):
        """
        Prepare ._gts and ._dts for evaluation based on params.
        Identical to COCOeval._prepare except for the positive-split
        detection filtering marked below.
        :return: None
        """

        def _toMask(anns, coco):
            # modify ann['segmentation'] by reference
            for ann in anns:
                rle = coco.annToRLE(ann)
                ann["segmentation"] = rle

        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
        # convert ground truth to mask if iouType == 'segm'
        if p.iouType == "segm":
            _toMask(gts, self.cocoGt)
            _toMask(dts, self.cocoDt)
        # set ignore flag
        for gt in gts:
            # NOTE(review): the first assignment is immediately overwritten,
            # matching the same quirk in upstream pycocotools.
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        _gts_cat_ids = defaultdict(set)  # gt for evaluation on positive split
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
            _gts_cat_ids[gt["image_id"]].add(gt["category_id"])
        #### BEGIN MODIFICATION ####
        # Positive split: drop detections for categories absent from the
        # image's ground truth.
        for dt in dts:
            if (
                self.dt_only_positive
                and dt["category_id"] not in _gts_cat_ids[dt["image_id"]]
            ):
                continue
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        #### END MODIFICATION ####
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results
class CocoEvaluatorOfflineWithPredFileEvaluators:
    """Offline COCO evaluator that scores a dumped COCO-format prediction file.

    Loads ground truth from `gt_path`, runs COCOevalCustom on the prediction
    file, and optionally runs TIDE error analysis when the `tidecv` package
    is available.
    """

    def __init__(
        self,
        gt_path,
        tide: bool = True,
        iou_type: str = "bbox",
        positive_split=False,
    ):
        # positive_split: evaluate detections only for categories that have
        # ground truth in the same image (COCOevalCustom.dt_only_positive).
        self.gt_path = gt_path
        self.tide_enabled = HAS_TIDE and tide
        self.positive_split = positive_split
        self.iou_type = iou_type

    def evaluate(self, dumped_file):
        """Score `dumped_file` and return a flat metrics dict.

        Only the main process evaluates; other ranks return an empty dict.
        """
        if not is_main_process():
            return {}
        logging.info("OfflineCoco evaluator: Loading groundtruth")
        self.gt = COCO(self.gt_path)
        # Creating the result file
        logging.info("Coco evaluator: Creating the result file")
        cocoDt = self.gt.loadRes(str(dumped_file))
        # Run the evaluation
        logging.info("Coco evaluator: Running evaluation")
        coco_eval = COCOevalCustom(
            self.gt, cocoDt, iouType=self.iou_type, dt_only_positive=self.positive_split
        )
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        outs = {}
        # Map the 12 standard COCOeval stats to their metric names.
        for i, value in enumerate(coco_eval.stats):
            outs[f"coco_eval_{self.iou_type}_{COCO_METRICS[i]}"] = value
        if self.tide_enabled:
            logging.info("Coco evaluator: Loading TIDE")
            self.tide_gt = datasets.COCO(self.gt_path)
            self.tide = TIDE(mode="mask" if self.iou_type == "segm" else "bbox")
            # Run TIDE
            logging.info("Coco evaluator: Running TIDE")
            self.tide.evaluate(
                self.tide_gt, datasets.COCOResult(str(dumped_file)), name="coco_eval"
            )
            self.tide.summarize()
            for k, v in self.tide.get_main_errors()["coco_eval"].items():
                outs[f"coco_eval_{self.iou_type}_TIDE_{k}"] = v
            for k, v in self.tide.get_special_errors()["coco_eval"].items():
                outs[f"coco_eval_{self.iou_type}_TIDE_{k}"] = v
        return outs

230
sam3/eval/coco_reindex.py Normal file
View File

@@ -0,0 +1,230 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
Self-contained COCO JSON re-indexing function that creates temporary files.
"""
import json
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
def reindex_coco_to_temp(input_json_path: str) -> Optional[str]:
    """
    Convert 0-indexed COCO JSON file to 1-indexed and save to temporary location.

    Args:
        input_json_path: Path to the input COCO JSON file

    Returns:
        Path to a JSON file in a fresh temporary directory. If the input was
        already 1-indexed, the file is an unmodified copy; otherwise all
        0-indexed image, category and annotation ids (and the references
        between them) are shifted by +1.

    Raises:
        FileNotFoundError: If input file doesn't exist
        json.JSONDecodeError: If input file is not valid JSON
        ValueError: If input file is not a valid COCO format
    """

    def is_coco_json(data: Dict[str, Any]) -> bool:
        """Check if data appears to be a COCO format file."""
        if not isinstance(data, dict):
            return False
        # A COCO file should have at least one of these keys
        coco_keys = {"images", "annotations", "categories"}
        return any(key in data for key in coco_keys)

    def check_zero_indexed(data: Dict[str, Any]) -> Tuple[bool, bool, bool]:
        """
        Check if annotations, images, or categories start from index 0.

        Returns:
            Tuple of (annotations_zero_indexed, images_zero_indexed, categories_zero_indexed)
        """
        annotations = data.get("annotations", [])
        annotations_zero = bool(annotations) and any(
            ann.get("id", -1) == 0 for ann in annotations
        )
        images = data.get("images", [])
        images_zero = bool(images) and any(img.get("id", -1) == 0 for img in images)
        categories = data.get("categories", [])
        categories_zero = bool(categories) and any(
            cat.get("id", -1) == 0 for cat in categories
        )
        return annotations_zero, images_zero, categories_zero

    def reindex_coco_data(data: Dict[str, Any]) -> Dict[str, Any]:
        """Convert 0-indexed COCO data to 1-indexed (mutates nested dicts in place)."""
        modified_data = data.copy()
        annotations_zero, images_zero, categories_zero = check_zero_indexed(data)
        # ID mappings keep annotation references consistent with the shifted ids.
        image_id_mapping = {}
        category_id_mapping = {}
        # Process images first (since annotations reference image IDs)
        if images_zero and "images" in modified_data:
            for img in modified_data["images"]:
                new_id = img["id"] + 1
                image_id_mapping[img["id"]] = new_id
                img["id"] = new_id
        # Process categories (since annotations reference category IDs)
        if categories_zero and "categories" in modified_data:
            for cat in modified_data["categories"]:
                new_id = cat["id"] + 1
                category_id_mapping[cat["id"]] = new_id
                cat["id"] = new_id
        # Process annotations, remapping their references where needed.
        if "annotations" in modified_data:
            for ann in modified_data["annotations"]:
                if annotations_zero:
                    ann["id"] = ann["id"] + 1
                if images_zero and ann.get("image_id") is not None:
                    old_image_id = ann["image_id"]
                    if old_image_id in image_id_mapping:
                        ann["image_id"] = image_id_mapping[old_image_id]
                if categories_zero and ann.get("category_id") is not None:
                    old_category_id = ann["category_id"]
                    if old_category_id in category_id_mapping:
                        ann["category_id"] = category_id_mapping[old_category_id]
        return modified_data

    def write_to_temp(payload: Dict[str, Any]) -> str:
        """Write payload into a fresh temp dir as <stem>_1_indexed<suffix>."""
        input_path = Path(input_json_path)
        temp_dir = tempfile.mkdtemp()
        temp_filename = f"{input_path.stem}_1_indexed{input_path.suffix}"
        temp_path = os.path.join(temp_dir, temp_filename)
        with open(temp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        return temp_path

    # Validate input path
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input file not found: {input_json_path}")

    # Load and validate JSON data
    try:
        with open(input_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # FIX: json.JSONDecodeError requires (msg, doc, pos); the previous
        # code passed a single formatted string, which made the `raise`
        # itself fail with TypeError. Re-raise with the path in the message.
        raise json.JSONDecodeError(
            f"Invalid JSON in {input_json_path}: {e.msg}", e.doc, e.pos
        ) from None

    # Validate COCO format
    if not is_coco_json(data):
        raise ValueError(
            f"File does not appear to be in COCO format: {input_json_path}"
        )

    # Check if reindexing is needed
    annotations_zero, images_zero, categories_zero = check_zero_indexed(data)
    if not (annotations_zero or images_zero or categories_zero):
        # No conversion needed - just copy to temp location
        return write_to_temp(data)

    # Perform reindexing and write the result
    return write_to_temp(reindex_coco_data(data))
# Example usage and test function
def test_reindex_function():
"""Test the reindex function with a sample COCO file."""
# Create a test COCO file
test_data = {
"info": {"description": "Test COCO dataset", "version": "1.0", "year": 2023},
"images": [
{"id": 0, "width": 640, "height": 480, "file_name": "test1.jpg"},
{"id": 1, "width": 640, "height": 480, "file_name": "test2.jpg"},
],
"categories": [
{"id": 0, "name": "person", "supercategory": "person"},
{"id": 1, "name": "car", "supercategory": "vehicle"},
],
"annotations": [
{
"id": 0,
"image_id": 0,
"category_id": 0,
"bbox": [100, 100, 50, 75],
"area": 3750,
"iscrowd": 0,
},
{
"id": 1,
"image_id": 1,
"category_id": 1,
"bbox": [200, 150, 120, 80],
"area": 9600,
"iscrowd": 0,
},
],
}
# Create temporary test file
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(test_data, f, indent=2)
test_file_path = f.name
try:
# Test the function
result_path = reindex_coco_to_temp(test_file_path)
print(f"Original file: {test_file_path}")
print(f"Converted file: {result_path}")
# Load and display the result
with open(result_path, "r") as f:
result_data = json.load(f)
print("\nConverted data sample:")
print(f"First image ID: {result_data['images'][0]['id']}")
print(f"First category ID: {result_data['categories'][0]['id']}")
print(f"First annotation ID: {result_data['annotations'][0]['id']}")
print(f"First annotation image_id: {result_data['annotations'][0]['image_id']}")
print(
f"First annotation category_id: {result_data['annotations'][0]['category_id']}"
)
# Clean up
os.unlink(result_path)
os.rmdir(os.path.dirname(result_path))
finally:
# Clean up test file
os.unlink(test_file_path)
if __name__ == "__main__":
test_reindex_function()

352
sam3/eval/coco_writer.py Normal file
View File

@@ -0,0 +1,352 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
COCO prediction dumper for distributed training.
Handles collection and dumping of COCO-format predictions from models.
Supports distributed processing with multiple GPUs/processes.
"""
import copy
import gc
import heapq
import json
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Optional
import pycocotools.mask as mask_utils
import torch
from iopath.common.file_io import g_pathmgr
from sam3.eval.coco_eval_offline import convert_to_xywh
from sam3.train.masks_ops import rle_encode
from sam3.train.utils.distributed import (
all_gather,
gather_to_rank_0_via_filesys,
get_rank,
is_main_process,
)
### Helper functions and classes
class HeapElement:
"""Utility class to make a heap with a custom comparator based on score."""
def __init__(self, val):
self.val = val
def __lt__(self, other):
return self.val["score"] < other.val["score"]
class PredictionDumper:
"""
Handles collection and dumping of COCO-format predictions from a model.
This class processes model outputs through a postprocessor, converts them to COCO format,
and saves them to disk. It supports distributed processing with multiple GPUs/processes.
"""
def __init__(
    self,
    dump_dir: str,
    postprocessor,
    maxdets: int,
    iou_type: str,
    gather_pred_via_filesys: bool = False,
    merge_predictions: bool = False,
    pred_file_evaluators: Optional[Any] = None,
):
    """
    Initialize the PredictionDumper.

    Args:
        dump_dir: Directory to dump predictions.
        postprocessor: Module to convert the model's output into COCO format.
        maxdets: Maximum number of detections per image.
        iou_type: IoU type to evaluate. Can include "bbox", "segm"
        gather_pred_via_filesys: If True, use the filesystem for collective gathers across
            processes (requires a shared filesystem). Otherwise, use torch collective ops.
        merge_predictions: If True, merge predictions from all processes and dump to a single file.
        pred_file_evaluators: optional evaluators to run on the merged
            prediction file; require merge_predictions=True and a dump_dir.
    """
    self.iou_type = iou_type
    self.maxdets = maxdets
    self.dump_dir = dump_dir
    self.postprocessor = postprocessor
    self.gather_pred_via_filesys = gather_pred_via_filesys
    self.merge_predictions = merge_predictions
    self.pred_file_evaluators = pred_file_evaluators
    if self.pred_file_evaluators is not None:
        # File-based evaluators consume the single merged prediction file,
        # so merging (and a directory to write it to) is mandatory.
        assert (
            merge_predictions
        ), "merge_predictions must be True if pred_file_evaluators are provided"
        assert self.dump_dir is not None, "dump_dir must be provided"
    if is_main_process():
        # Only rank 0 creates the directory (assumed shared across ranks).
        os.makedirs(self.dump_dir, exist_ok=True)
        logging.info(f"Created prediction dump directory: {self.dump_dir}")
    # Initialize state
    self.reset()
def update(self, *args, **kwargs):
    """Post-process one batch of model outputs and append them to the dump.

    Args:
        *args, **kwargs: Forwarded to postprocessor.process_results().
    """
    processed = self.postprocessor.process_results(*args, **kwargs)
    coco_formatted = self.prepare(processed, self.iou_type)
    self._dump(coco_formatted)
def _dump(self, results):
"""
Add results to the dump list with precision rounding.
Args:
results: List of prediction dictionaries in COCO format.
"""
dumped_results = copy.deepcopy(results)
for r in dumped_results:
if "bbox" in r:
r["bbox"] = [round(coord, 5) for coord in r["bbox"]]
r["score"] = round(r["score"], 5)
self.dump.extend(dumped_results)
def synchronize_between_processes(self):
    """
    Synchronize predictions across all processes and save to disk.

    If gather_pred_via_filesys is True, uses filesystem for gathering.
    Otherwise, uses torch distributed collective operations.

    When ``merge_predictions`` is False, every rank writes its own JSON file
    (suffixed with the rank id). When it is True, predictions are gathered
    and merged and only the main process writes a single JSON file.

    Returns:
        Path of the JSON file this rank is responsible for (on non-main
        ranks in merged mode, the path is returned even though only the
        main process writes it).
    """
    logging.info("Prediction Dumper: Synchronizing between processes")
    if not self.merge_predictions:
        dumped_file = (
            Path(self.dump_dir)
            / f"coco_predictions_{self.iou_type}_{get_rank()}.json"
        )
        logging.info(
            f"Prediction Dumper: Dumping local predictions to {dumped_file}"
        )
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(self.dump, f)
    else:
        self.dump = self.gather_and_merge_predictions()
        dumped_file = Path(self.dump_dir) / f"coco_predictions_{self.iou_type}.json"
        if is_main_process():
            logging.info(
                f"Prediction Dumper: Dumping merged predictions to {dumped_file}"
            )
            with g_pathmgr.open(str(dumped_file), "w") as f:
                json.dump(self.dump, f)
    # Release accumulated predictions once they are persisted.
    self.reset()
    return dumped_file
def gather_and_merge_predictions(self):
    """
    Gather predictions from all processes and merge them, keeping top predictions per image.

    This method collects predictions from all processes, then keeps only the top maxdets
    predictions per image based on score. It also deduplicates predictions by
    (image_id, category_id): a pair already seen in a *previous* rank's dump is
    skipped, while duplicates within the same rank's dump are all kept.

    Returns:
        List of merged prediction dictionaries.
    """
    logging.info("Prediction Dumper: Gathering predictions from all processes")
    gc.collect()
    if self.gather_pred_via_filesys:
        dump = gather_to_rank_0_via_filesys(self.dump)
    else:
        dump = all_gather(self.dump, force_cpu=True)
    # Combine predictions, keeping only top maxdets per image
    preds_by_image = defaultdict(list)
    seen_img_cat = set()
    for cur_dump in dump:
        cur_seen_img_cat = set()
        for p in cur_dump:
            image_id = p["image_id"]
            cat_id = p["category_id"]
            # Skip if we've already seen this image/category pair in a previous dump
            if (image_id, cat_id) in seen_img_cat:
                continue
            cur_seen_img_cat.add((image_id, cat_id))
            # Use a min-heap to keep top predictions
            # (HeapElement presumably orders by score — TODO confirm)
            if len(preds_by_image[image_id]) < self.maxdets:
                heapq.heappush(preds_by_image[image_id], HeapElement(p))
            else:
                heapq.heappushpop(preds_by_image[image_id], HeapElement(p))
        # Only mark pairs as seen after finishing this rank's dump, so that
        # duplicates within a single rank are not dropped.
        seen_img_cat.update(cur_seen_img_cat)
    # Flatten the heap elements back to a list
    merged_dump = sum(
        [[h.val for h in cur_preds] for cur_preds in preds_by_image.values()], []
    )
    return merged_dump
def compute_synced(self):
    """
    Synchronize predictions across processes, then run any file evaluators.

    Returns:
        Dict of metrics produced by ``pred_file_evaluators`` on the dumped
        prediction file. Non-main processes — and runs that produce no
        metrics — return the placeholder ``{"": 0.0}``.
    """
    dumped_file = self.synchronize_between_processes()
    if not is_main_process():
        return {"": 0.0}
    metrics = {}
    if self.pred_file_evaluators is not None:
        for evaluator in self.pred_file_evaluators:
            metrics.update(evaluator.evaluate(dumped_file))
    return metrics if metrics else {"": 0.0}
def compute(self):
    """
    Compute metrics without cross-process synchronization.

    This dumper does not compute anything locally (see ``compute_synced``).

    Returns:
        Placeholder metric dictionary.
    """
    return {"": 0.0}
def reset(self):
    """Clear accumulated predictions so a new evaluation round can begin."""
    self.dump = []
def prepare(self, predictions, iou_type):
"""
Route predictions to the appropriate preparation method based on iou_type.
Args:
predictions: Dictionary mapping image IDs to prediction dictionaries.
iou_type: Type of evaluation ("bbox", "segm").
Returns:
List of COCO-format prediction dictionaries.
"""
if iou_type == "bbox":
return self.prepare_for_coco_detection(predictions)
elif iou_type == "segm":
return self.prepare_for_coco_segmentation(predictions)
else:
raise ValueError(f"Unknown iou type: {iou_type}")
def prepare_for_coco_detection(self, predictions):
    """
    Convert per-image detections to COCO detection-result format.

    Args:
        predictions: Dictionary mapping image IDs to prediction dictionaries
            containing "boxes", "scores", and "labels".

    Returns:
        List of COCO-format detection dictionaries (xywh boxes).
    """
    coco_results = []
    for image_id, pred in predictions.items():
        if len(pred) == 0:
            continue
        xywh_boxes = convert_to_xywh(pred["boxes"]).tolist()
        scores = pred["scores"].tolist()
        labels = pred["labels"].tolist()
        for idx, box in enumerate(xywh_boxes):
            coco_results.append(
                {
                    "image_id": image_id,
                    "category_id": labels[idx],
                    "bbox": box,
                    "score": scores[idx],
                }
            )
    return coco_results
@torch.no_grad()
def prepare_for_coco_segmentation(self, predictions):
    """
    Convert predictions to COCO segmentation format.

    Args:
        predictions: Dictionary mapping image IDs to prediction dictionaries
            containing "masks" or "masks_rle", "scores", and "labels".
            Optionally includes "boundaries" and "dilated_boundaries".

    Returns:
        List of COCO-format segmentation dictionaries with RLE-encoded masks.
        "area" is normalized by the image size (fraction of pixels).
    """
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        boxes = None
        if "boxes" in prediction:
            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            assert len(boxes) == len(scores)
        if "masks_rle" in prediction:
            # Masks already RLE-encoded: derive normalized areas from the RLEs.
            rles = prediction["masks_rle"]
            areas = []
            for rle in rles:
                cur_area = mask_utils.area(rle)
                h, w = rle["size"]
                areas.append(cur_area / (h * w))
        else:
            # Dense masks: binarize at 0.5, compute normalized areas, encode.
            masks = prediction["masks"]
            masks = masks > 0.5
            h, w = masks.shape[-2:]
            areas = masks.flatten(1).sum(1) / (h * w)
            areas = areas.tolist()
            rles = rle_encode(masks.squeeze(1))
            # Memory cleanup — dense masks can dominate memory use.
            del masks
            del prediction["masks"]
        assert len(areas) == len(rles) == len(scores)
        for k, rle in enumerate(rles):
            payload = {
                "image_id": original_id,
                "category_id": labels[k],
                "segmentation": rle,
                "score": scores[k],
                "area": areas[k],
            }
            if boxes is not None:
                payload["bbox"] = boxes[k]
            coco_results.append(payload)
    return coco_results

View File

@@ -0,0 +1,211 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import json
import os
from collections import defaultdict
from tqdm import tqdm
def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS dataset to COCO-style video instance segmentation format.

    Args:
        ann_json (str): Path to YouTube VIS annotation JSON file.
        save_path (str, optional): Path to save the converted COCO-style JSON.
            If None, the converted dict is returned without writing to disk.

    Returns:
        dict: The converted COCO-style annotation dictionary.
    """
    # Initialize COCO structure
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": [],
        "licenses": [],
    }
    # Load original annotations (context manager: don't leak the file handle)
    with open(ann_json) as f:
        official_anns = json.load(f)
    VIS["categories"] = official_anns["categories"]  # Direct copy categories
    # Initialize 1-based COCO id counters
    records = dict(img_id=1, ann_id=1)
    # Create video-to-annotations mapping
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)
    # Each YouTube-VIS annotation is one object track
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]
    # Process videos
    for video_info in tqdm(official_anns["videos"]):
        # Create video entry
        video = {
            "id": video_info["id"],
            "name": os.path.dirname(video_info["file_names"][0]),
            "width": video_info["width"],
            "height": video_info["height"],
            "length": video_info["length"],
            "neg_category_ids": [],
            "not_exhaustive_category_ids": [],
        }
        VIS["videos"].append(video)
        # Process frames
        num_frames = len(video_info["file_names"])
        for frame_idx in range(num_frames):
            # Create image entry (one per frame)
            image = {
                "id": records["img_id"],
                "video_id": video_info["id"],
                "file_name": video_info["file_names"][frame_idx],
                "width": video_info["width"],
                "height": video_info["height"],
                "frame_index": frame_idx,
                "frame_id": frame_idx,
            }
            VIS["images"].append(image)
            # Process annotations for this frame
            if video_info["id"] in vid_to_anns:
                for ann in vid_to_anns[video_info["id"]]:
                    bbox = ann["bboxes"][frame_idx]
                    if bbox is None:
                        # Object is not visible in this frame
                        continue
                    # Create annotation entry
                    annotation = {
                        "id": records["ann_id"],
                        "video_id": video_info["id"],
                        "image_id": records["img_id"],
                        "track_id": ann["id"],
                        "category_id": ann["category_id"],
                        "bbox": bbox,
                        "area": ann["areas"][frame_idx],
                        "segmentation": ann["segmentations"][frame_idx],
                        "iscrowd": ann["iscrowd"],
                    }
                    VIS["annotations"].append(annotation)
                    records["ann_id"] += 1
            records["img_id"] += 1
    # Print summary
    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")
    if save_path is None:
        return VIS
    # Save output. Note: os.makedirs("") raises FileNotFoundError, so only
    # create the directory when save_path actually has a directory component.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(VIS, f)
    return VIS
def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """
    Convert YouTubeVIS predictions to COCO format with video_id preservation.

    Args:
        youtubevis_pred_path: Path to YouTubeVIS prediction JSON
        converted_dataset_path: Path to converted COCO dataset JSON
        output_path: Path to save COCO format predictions
    """
    # Load YouTubeVIS predictions
    with open(youtubevis_pred_path) as f:
        predictions = json.load(f)
    # Load the converted dataset so predictions can be mapped to image ids
    with open(converted_dataset_path) as f:
        dataset = json.load(f)
    # (video_id, frame_index) -> image_id
    frame_to_image_id = {
        (img["video_id"], img["frame_index"]): img["id"]
        for img in dataset["images"]
    }
    results = []
    next_track_id = 1  # each prediction becomes its own track
    for pred in tqdm(predictions):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        score = pred["score"]
        # Segmentations/areas are optional; pad with None to align with bboxes
        segmentations = pred.get("segmentations", [])
        areas = pred.get("areas", [])
        if not segmentations:
            segmentations = [None] * len(bboxes)
        if not areas:
            areas = [None] * len(bboxes)
        track_id = next_track_id
        next_track_id += 1
        for frame_idx, (bbox, segmentation, pred_area) in enumerate(
            zip(bboxes, segmentations, areas)
        ):
            # Frames where the object is absent are encoded as None or all-zero boxes
            if bbox is None or all(coord == 0 for coord in bbox):
                continue
            image_id = frame_to_image_id.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
                )
            x, y, w, h = bbox
            # Prefer the predicted area when valid, otherwise fall back to box area
            if pred_area is not None and pred_area > 0:
                area = pred_area
            else:
                area = w * h
            entry = {
                "image_id": int(image_id),
                "video_id": video_id,  # kept so video-level evaluators can group frames
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }
            if segmentation is not None:
                entry["segmentation"] = segmentation
            results.append(entry)
    # Save output
    with open(output_path, "w") as f:
        json.dump(results, f)
    print(f"Converted {len(results)} predictions to COCO format with video_id")

658
sam3/eval/demo_eval.py Normal file
View File

@@ -0,0 +1,658 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
This means that the model's predictions are thresholded and evaluated as "hard" predictions.
"""
import logging
from typing import Optional
import numpy as np
import pycocotools.mask as maskUtils
from pycocotools.cocoeval import COCOeval
from sam3.eval.coco_eval import CocoEvaluator
from sam3.train.masks_ops import compute_F_measure
from sam3.train.utils.distributed import is_main_process
from scipy.optimize import linear_sum_assignment
class DemoEval(COCOeval):
    """
    This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
    This means that the model's predictions are thresholded and evaluated as "hard" predictions.
    """

    def __init__(
        self,
        coco_gt=None,
        coco_dt=None,
        iouType="bbox",
        threshold=0.5,
        compute_JnF=False,
    ):
        """
        Args:
            coco_gt (COCO): ground truth COCO API
            coco_dt (COCO): detections COCO API
            iouType (str): type of IoU to evaluate ("bbox" or "segm")
            threshold (float): score threshold; predictions below it are dropped
            compute_JnF (bool): if True, also compute region/boundary (J&F)
                measures on matched prediction/GT pairs
        """
        super().__init__(coco_gt, coco_dt, iouType)
        self.threshold = threshold
        # Class-agnostic evaluation: all categories are pooled together.
        self.params.useCats = False
        self.params.areaRng = [[0**2, 1e5**2]]
        self.params.areaRngLbl = ["all"]
        # Effectively unbounded detections per image.
        self.params.maxDets = [100000]
        self.compute_JnF = compute_JnF

    def computeIoU(self, imgId, catId):
        # Same as the original COCOeval.computeIoU, but without sorting
        # detections by score (matching is done via assignment downstream).
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []
        if p.iouType == "segm":
            g = [g["segmentation"] for g in gt]
            d = [d["segmentation"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bbox"] for g in gt]
            d = [d["bbox"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")
        # compute iou between each dt and gt region
        iscrowd = [int(o["iscrowd"]) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious

    def evaluateImg(self, imgId, catId, aRng, maxDet):
        """
        perform evaluation for single category and image
        :return: dict (single image results)
        """
        p = self.params
        assert not p.useCats, "This evaluator does not support per-category evaluation."
        assert catId == -1
        # Keep only non-ignored GTs and above-threshold detections.
        all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
        keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
        gt = [g for g in all_gts if not g["ignore"]]
        all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
        dt = [d for d in all_dts if d["score"] >= self.threshold]
        if len(gt) == 0 and len(dt) == 0:
            # This is a "true negative" case, where there are no GTs and no predictions
            # The box-level metrics are ill-defined, so we don't add them to this dict
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 1,
                "IL_FP": 0,
                "IL_FN": 0,
                "IL_perfect_neg": np.ones((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }
        if len(gt) > 0 and len(dt) == 0:
            # This is a "false negative" case, where there are GTs but no predictions
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 0,
                "IL_FP": 0,
                "IL_FN": 1,
                "TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
                "local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "IL_perfect_pos": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }
        # Load pre-computed ious
        ious = self.ious[(imgId, catId)]
        # compute matching (one-to-one via Hungarian assignment, maximizing IoU)
        if len(ious) == 0:
            ious = np.zeros((len(dt), len(gt)))
        else:
            ious = ious[keep_dt, :][:, keep_gt]
            assert ious.shape == (len(dt), len(gt))
        matched_dt, matched_gt = linear_sum_assignment(-ious)
        match_scores = ious[matched_dt, matched_gt]
        if self.compute_JnF and len(match_scores) > 0:
            # J = mean IoU over matched pairs; F = mean boundary F-measure.
            j_score = match_scores.mean()
            f_measure = 0
            for dt_id, gt_id in zip(matched_dt, matched_gt):
                f_measure += compute_F_measure(
                    gt_boundary_rle=gt[gt_id]["boundary"],
                    gt_dilated_boundary_rle=gt[gt_id]["dilated_boundary"],
                    dt_boundary_rle=dt[dt_id]["boundary"],
                    dt_dilated_boundary_rle=dt[dt_id]["dilated_boundary"],
                )
            f_measure /= len(match_scores) + 1e-9
            JnF = (j_score + f_measure) * 0.5
        else:
            # Sentinel value meaning "not computed".
            j_score = f_measure = JnF = -1
        # Count TP/FP/FN at every IoU threshold.
        TPs, FPs, FNs = [], [], []
        IL_perfect = []
        for thresh in p.iouThrs:
            TP = (match_scores >= thresh).sum()
            FP = len(dt) - TP
            FN = len(gt) - TP
            assert (
                FP >= 0 and FN >= 0
            ), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
            TPs.append(TP)
            FPs.append(FP)
            FNs.append(FN)
            if FP == FN and FP == 0:
                IL_perfect.append(1)
            else:
                IL_perfect.append(0)
        TPs = np.array(TPs, dtype=np.int64)
        FPs = np.array(FPs, dtype=np.int64)
        FNs = np.array(FNs, dtype=np.int64)
        IL_perfect = np.array(IL_perfect, dtype=np.int64)
        # compute precision recall and F1 (small epsilons avoid div-by-zero)
        precision = TPs / (TPs + FPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        result = {
            "image_id": imgId,
            "TPs": TPs,
            "FPs": FPs,
            "FNs": FNs,
            "local_F1s": F1,
            "IL_TP": (len(gt) > 0) and (len(dt) > 0),
            "IL_FP": (len(gt) == 0) and (len(dt) > 0),
            "IL_TN": (len(gt) == 0) and (len(dt) == 0),
            "IL_FN": (len(gt) > 0) and (len(dt) == 0),
            ("IL_perfect_pos" if len(gt) > 0 else "IL_perfect_neg"): IL_perfect,
            "F": f_measure,
            "J": j_score,
            "J&F": JnF,
            "num_dt": len(dt),
        }
        if len(gt) > 0 and len(dt) > 0:
            result["local_positive_F1s"] = F1
        return result

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        """
        if not self.evalImgs:
            print("Please run evaluate() first")
        # allows input customized parameters
        if p is None:
            p = self.params
        setImgIds = set(p.imgIds)
        # TPs, FPs, FNs — one entry per IoU threshold
        TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        # FPs counted only on images with at least one GT ("positive-micro")
        pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)
        # Image level metrics
        IL_TPs = 0
        IL_FPs = 0
        IL_TNs = 0
        IL_FNs = 0
        IL_perfects_neg = np.zeros((len(p.iouThrs),), dtype=np.int64)
        IL_perfects_pos = np.zeros((len(p.iouThrs),), dtype=np.int64)
        # JnF metric
        total_J = 0
        total_F = 0
        total_JnF = 0
        valid_img_count = 0
        total_pos_count = 0
        total_neg_count = 0
        valid_J_count = 0
        valid_F1_count = 0
        valid_F1_count_w0dt = 0
        for res in self.evalImgs:
            if res["image_id"] not in setImgIds:
                continue
            IL_TPs += res["IL_TP"]
            IL_FPs += res["IL_FP"]
            IL_TNs += res["IL_TN"]
            IL_FNs += res["IL_FN"]
            if "IL_perfect_neg" in res:
                IL_perfects_neg += res["IL_perfect_neg"]
                total_neg_count += 1
            else:
                assert "IL_perfect_pos" in res
                IL_perfects_pos += res["IL_perfect_pos"]
                total_pos_count += 1
            if "TPs" not in res:
                # True-negative images carry no box-level counts.
                continue
            TPs += res["TPs"]
            FPs += res["FPs"]
            FNs += res["FNs"]
            valid_img_count += 1
            if "local_positive_F1s" in res:
                local_F1s += res["local_positive_F1s"]
                pmFPs += res["FPs"]
                valid_F1_count_w0dt += 1
                if res["num_dt"] > 0:
                    valid_F1_count += 1
            if "J" in res and res["J"] > -1e-9:
                total_J += res["J"]
                total_F += res["F"]
                total_JnF += res["J&F"]
                valid_J_count += 1
        # compute precision recall and F1
        precision = TPs / (TPs + FPs + 1e-4)
        positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        positive_micro_F1 = (
            2
            * positive_micro_precision
            * recall
            / (positive_micro_precision + recall + 1e-4)
        )
        # Image-level (binary "any object present?") metrics.
        IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
        IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
        IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
        IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
        IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
            (
                float(IL_TPs + IL_FPs)
                * float(IL_TPs + IL_FNs)
                * float(IL_TNs + IL_FPs)
                * float(IL_TNs + IL_FNs)
            )
            ** 0.5
            + 1e-6
        )
        IL_perfect_pos = IL_perfects_pos / (total_pos_count + 1e-9)
        IL_perfect_neg = IL_perfects_neg / (total_neg_count + 1e-9)
        total_J = total_J / (valid_J_count + 1e-9)
        total_F = total_F / (valid_J_count + 1e-9)
        total_JnF = total_JnF / (valid_J_count + 1e-9)
        self.eval = {
            "params": p,
            "TPs": TPs,
            "FPs": FPs,
            "positive_micro_FPs": pmFPs,
            "FNs": FNs,
            "precision": precision,
            "positive_micro_precision": positive_micro_precision,
            "recall": recall,
            "F1": F1,
            "positive_micro_F1": positive_micro_F1,
            "positive_macro_F1": local_F1s / valid_F1_count,
            "positive_w0dt_macro_F1": local_F1s / valid_F1_count_w0dt,
            "IL_recall": IL_rec,
            "IL_precision": IL_prec,
            "IL_F1": IL_F1,
            "IL_FPR": IL_FPR,
            "IL_MCC": IL_MCC,
            "IL_perfect_pos": IL_perfect_pos,
            "IL_perfect_neg": IL_perfect_neg,
            "J": total_J,
            "F": total_F,
            "J&F": total_JnF,
        }
        # CGF1 = (positive macro F1) gated by image-level MCC.
        self.eval["CGF1"] = self.eval["positive_macro_F1"] * self.eval["IL_MCC"]
        self.eval["CGF1_w0dt"] = (
            self.eval["positive_w0dt_macro_F1"] * self.eval["IL_MCC"]
        )
        self.eval["CGF1_micro"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]

    def summarize(self):
        """
        Compute and display summary metrics for evaluation results.
        Note this function can *only* be applied on the default parameter setting
        """
        if not self.eval:
            raise Exception("Please run accumulate() first")

        def _summarize(iouThr=None, metric=""):
            # Print/return the metric averaged over IoU thresholds, or at one
            # specific threshold when iouThr is given.
            p = self.params
            iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
            titleStr = "Average " + metric
            iouStr = (
                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
                if iouThr is None
                else "{:0.2f}".format(iouThr)
            )
            s = self.eval[metric]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, iouStr, mean_s))
            return mean_s

        def _summarize_single(metric=""):
            # Print/return a scalar (non-IoU-indexed) metric.
            titleStr = "Average " + metric
            iStr = " {:<35} = {:0.3f}"
            s = self.eval[metric]
            print(iStr.format(titleStr, s))
            return s

        def _summarizeDets():
            # note: the index of these metrics are also used in video Demo F1 evaluation
            # when adding new metrics, please update the index in video Demo F1 evaluation
            # in "evaluate" method of the "VideoDemoF1Evaluator" class
            stats = np.zeros((len(DEMO_METRICS),))
            stats[0] = _summarize(metric="CGF1")
            stats[1] = _summarize(metric="precision")
            stats[2] = _summarize(metric="recall")
            stats[3] = _summarize(metric="F1")
            stats[4] = _summarize(metric="positive_macro_F1")
            stats[5] = _summarize_single(metric="IL_precision")
            stats[6] = _summarize_single(metric="IL_recall")
            stats[7] = _summarize_single(metric="IL_F1")
            stats[8] = _summarize_single(metric="IL_FPR")
            stats[9] = _summarize_single(metric="IL_MCC")
            stats[10] = _summarize(metric="IL_perfect_pos")
            stats[11] = _summarize(metric="IL_perfect_neg")
            stats[12] = _summarize(iouThr=0.5, metric="CGF1")
            stats[13] = _summarize(iouThr=0.5, metric="precision")
            stats[14] = _summarize(iouThr=0.5, metric="recall")
            stats[15] = _summarize(iouThr=0.5, metric="F1")
            stats[16] = _summarize(iouThr=0.5, metric="positive_macro_F1")
            stats[17] = _summarize(iouThr=0.5, metric="IL_perfect_pos")
            stats[18] = _summarize(iouThr=0.5, metric="IL_perfect_neg")
            stats[19] = _summarize(iouThr=0.75, metric="CGF1")
            stats[20] = _summarize(iouThr=0.75, metric="precision")
            stats[21] = _summarize(iouThr=0.75, metric="recall")
            stats[22] = _summarize(iouThr=0.75, metric="F1")
            stats[23] = _summarize(iouThr=0.75, metric="positive_macro_F1")
            stats[24] = _summarize(iouThr=0.75, metric="IL_perfect_pos")
            stats[25] = _summarize(iouThr=0.75, metric="IL_perfect_neg")
            stats[26] = _summarize_single(metric="J")
            stats[27] = _summarize_single(metric="F")
            stats[28] = _summarize_single(metric="J&F")
            stats[29] = _summarize(metric="CGF1_micro")
            stats[30] = _summarize(metric="positive_micro_precision")
            stats[31] = _summarize(metric="positive_micro_F1")
            stats[32] = _summarize(iouThr=0.5, metric="CGF1_micro")
            stats[33] = _summarize(iouThr=0.5, metric="positive_micro_precision")
            stats[34] = _summarize(iouThr=0.5, metric="positive_micro_F1")
            stats[35] = _summarize(iouThr=0.75, metric="CGF1_micro")
            stats[36] = _summarize(iouThr=0.75, metric="positive_micro_precision")
            stats[37] = _summarize(iouThr=0.75, metric="positive_micro_F1")
            stats[38] = _summarize(metric="CGF1_w0dt")
            stats[39] = _summarize(metric="positive_w0dt_macro_F1")
            stats[40] = _summarize(iouThr=0.5, metric="CGF1_w0dt")
            stats[41] = _summarize(iouThr=0.5, metric="positive_w0dt_macro_F1")
            stats[42] = _summarize(iouThr=0.75, metric="CGF1_w0dt")
            stats[43] = _summarize(iouThr=0.75, metric="positive_w0dt_macro_F1")
            return stats

        summarize = _summarizeDets
        self.stats = summarize()
# Human-readable names for the entries of DemoEval.stats, in index order.
# Must stay in sync with the stats indices assigned in
# DemoEval.summarize()._summarizeDets().
DEMO_METRICS = [
    "CGF1",
    "Precision",
    "Recall",
    "F1",
    "Macro_F1",
    "IL_Precision",
    "IL_Recall",
    "IL_F1",
    "IL_FPR",
    "IL_MCC",
    "IL_perfect_pos",
    "IL_perfect_neg",
    "CGF1@0.5",
    "Precision@0.5",
    "Recall@0.5",
    "F1@0.5",
    "Macro_F1@0.5",
    "IL_perfect_pos@0.5",
    "IL_perfect_neg@0.5",
    "CGF1@0.75",
    "Precision@0.75",
    "Recall@0.75",
    "F1@0.75",
    "Macro_F1@0.75",
    "IL_perfect_pos@0.75",
    "IL_perfect_neg@0.75",
    "J",
    "F",
    "J&F",
    "CGF1_micro",
    "positive_micro_Precision",
    "positive_micro_F1",
    "CGF1_micro@0.5",
    "positive_micro_Precision@0.5",
    "positive_micro_F1@0.5",
    "CGF1_micro@0.75",
    "positive_micro_Precision@0.75",
    "positive_micro_F1@0.75",
    "CGF1_w0dt",
    "positive_w0dt_macro_F1",
    "CGF1_w0dt@0.5",
    "positive_w0dt_macro_F1@0.5",
    "CGF1_w0dt@0.75",
    "positive_w0dt_macro_F1@0.75",
]
class DemoEvaluator(CocoEvaluator):
    """CocoEvaluator subclass that runs DemoEval (thresholded "hard" predictions)
    instead of standard COCO AP evaluation."""

    def __init__(
        self,
        coco_gt,
        iou_types,
        dump_dir: Optional[str],
        postprocessor,
        threshold=0.5,
        average_by_rarity=False,
        gather_pred_via_filesys=False,
        exhaustive_only=False,
        all_exhaustive_only=True,
        compute_JnF=False,
        metrics_dump_dir: Optional[str] = None,
    ):
        """
        Args:
            coco_gt: Ground-truth COCO API (or several, for oracle evaluation).
            iou_types: IoU types to evaluate, e.g. ["bbox", "segm"].
            dump_dir: Optional directory to dump predictions.
            postprocessor: Module converting model outputs to COCO format.
            threshold: Score threshold for keeping predictions (see DemoEval).
            average_by_rarity: Currently unused here (rarity-bucket averaging is
                commented out below).
            gather_pred_via_filesys: Gather predictions via a shared filesystem
                instead of torch collectives.
            exhaustive_only / all_exhaustive_only: Filtering flags forwarded to
                CocoEvaluator.
            compute_JnF: Also compute J&F metrics (see DemoEval).
            metrics_dump_dir: Optional directory to dump computed metrics.
        """
        self.iou_types = iou_types
        self.threshold = threshold
        super().__init__(
            coco_gt=coco_gt,
            iou_types=iou_types,
            useCats=False,
            dump_dir=dump_dir,
            postprocessor=postprocessor,
            # average_by_rarity=average_by_rarity,
            gather_pred_via_filesys=gather_pred_via_filesys,
            exhaustive_only=exhaustive_only,
            all_exhaustive_only=all_exhaustive_only,
            metrics_dump_dir=metrics_dump_dir,
        )
        self.use_self_evaluate = True
        self.compute_JnF = compute_JnF

    def _lazy_init(self):
        # Defer heavy initialization until first use; re-assert self-evaluate
        # mode after the parent finishes its own lazy init.
        if self.initialized:
            return
        super()._lazy_init()
        self.use_self_evaluate = True
        self.reset()

    def select_best_scoring(self, scorings):
        # This function is used for "oracle" type evaluation.
        # It accepts the evaluation results with respect to several ground truths, and picks the best
        if len(scorings) == 1:
            return scorings[0]
        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"
        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"
        selected_imgs = []
        for img_id in range(scorings[0].shape[-1]):
            best = scorings[0][:, :, img_id]
            for scoring in scorings[1:]:
                current = scoring[:, :, img_id]
                if "local_F1s" in best[0, 0] and "local_F1s" in current[0, 0]:
                    # we were able to compute a F1 score for this particular image in both evaluations
                    # best["local_F1s"] contains the results at various IoU thresholds. We simply take the average for comparision
                    best_score = best[0, 0]["local_F1s"].mean()
                    current_score = current[0, 0]["local_F1s"].mean()
                    if current_score > best_score:
                        best = current
                else:
                    # If we're here, it means that in that in some evaluation we were not able to get a valid local F1
                    # This happens when both the predictions and targets are empty. In that case, we can assume it's a perfect prediction
                    if "local_F1s" not in current[0, 0]:
                        best = current
            selected_imgs.append(best)
        result = np.stack(selected_imgs, axis=-1)
        assert result.shape == scorings[0].shape
        return result

    def summarize(self):
        """Accumulate and print metrics; returns a flat metric dict on the
        main process (empty dict elsewhere)."""
        self._lazy_init()
        logging.info("Demo evaluator: Summarizing")
        if not is_main_process():
            return {}
        outs = {}
        # Multiple GTs means oracle evaluation; prefix metric names accordingly.
        prefix = "oracle_" if len(self.coco_evals) > 1 else ""
        # if self.rarity_buckets is None:
        self.accumulate(self.eval_img_ids)
        for iou_type, coco_eval in self.coco_evals[0].items():
            print("Demo metric, IoU type={}".format(iou_type))
            coco_eval.summarize()
        if "bbox" in self.coco_evals[0]:
            for i, value in enumerate(self.coco_evals[0]["bbox"].stats):
                outs[f"coco_eval_bbox_{prefix}{DEMO_METRICS[i]}"] = value
        if "segm" in self.coco_evals[0]:
            for i, value in enumerate(self.coco_evals[0]["segm"].stats):
                outs[f"coco_eval_masks_{prefix}{DEMO_METRICS[i]}"] = value
        # else:
        #     total_stats = {}
        #     for bucket, img_list in self.rarity_buckets.items():
        #         self.accumulate(imgIds=img_list)
        #         bucket_name = RARITY_BUCKETS[bucket]
        #         for iou_type, coco_eval in self.coco_evals[0].items():
        #             print(
        #                 "Demo metric, IoU type={}, Rarity bucket={}".format(
        #                     iou_type, bucket_name
        #                 )
        #             )
        #             coco_eval.summarize()
        #         if "bbox" in self.coco_evals[0]:
        #             if "bbox" not in total_stats:
        #                 total_stats["bbox"] = np.zeros_like(
        #                     self.coco_evals[0]["bbox"].stats
        #                 )
        #             total_stats["bbox"] += self.coco_evals[0]["bbox"].stats
        #             for i, value in enumerate(self.coco_evals[0]["bbox"].stats):
        #                 outs[
        #                     f"coco_eval_bbox_{bucket_name}_{prefix}{DEMO_METRICS[i]}"
        #                 ] = value
        #         if "segm" in self.coco_evals[0]:
        #             if "segm" not in total_stats:
        #                 total_stats["segm"] = np.zeros_like(
        #                     self.coco_evals[0]["segm"].stats
        #                 )
        #             total_stats["segm"] += self.coco_evals[0]["segm"].stats
        #             for i, value in enumerate(self.coco_evals[0]["segm"].stats):
        #                 outs[
        #                     f"coco_eval_masks_{bucket_name}_{prefix}{DEMO_METRICS[i]}"
        #                 ] = value
        #     if "bbox" in total_stats:
        #         total_stats["bbox"] /= len(self.rarity_buckets)
        #         for i, value in enumerate(total_stats["bbox"]):
        #             outs[f"coco_eval_bbox_{prefix}{DEMO_METRICS[i]}"] = value
        #     if "segm" in total_stats:
        #         total_stats["segm"] /= len(self.rarity_buckets)
        #         for i, value in enumerate(total_stats["segm"]):
        #             outs[f"coco_eval_masks_{prefix}{DEMO_METRICS[i]}"] = value
        return outs

    def accumulate(self, imgIds=None):
        """Run DemoEval.accumulate, optionally restricted to a set of image ids
        (main process only)."""
        self._lazy_init()
        logging.info(
            f"demo evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
        )
        if not is_main_process():
            return
        if imgIds is not None:
            for coco_eval in self.coco_evals[0].values():
                coco_eval.params.imgIds = list(imgIds)
        for coco_eval in self.coco_evals[0].values():
            coco_eval.accumulate()

    def reset(self):
        """Re-create per-GT, per-IoU-type DemoEval instances and clear state."""
        self.coco_evals = [{} for _ in range(len(self.coco_gts))]
        for i, coco_gt in enumerate(self.coco_gts):
            for iou_type in self.iou_types:
                self.coco_evals[i][iou_type] = DemoEval(
                    coco_gt=coco_gt,
                    iouType=iou_type,
                    threshold=self.threshold,
                    compute_JnF=self.compute_JnF,
                )
                self.coco_evals[i][iou_type].useCats = False
        self.img_ids = []
        self.eval_imgs = {k: [] for k in self.iou_types}
        if self.dump is not None:
            self.dump = []

View File

@@ -0,0 +1 @@
# flake8: noqa

View File

@@ -0,0 +1,114 @@
# flake8: noqa
"""run_youtube_vis.py
Run example:
run_youtube_vis.py --USE_PARALLEL False --METRICS HOTA --TRACKERS_TO_EVAL STEm_Seg
Command Line Arguments: Defaults, # Comments
Eval arguments:
'USE_PARALLEL': False,
'NUM_PARALLEL_CORES': 8,
'BREAK_ON_ERROR': True, # Raises exception and exits with error
'RETURN_ON_ERROR': False, # if not BREAK_ON_ERROR, then returns from function on error
'LOG_ON_ERROR': os.path.join(code_path, 'error_log.txt'), # if not None, save any errors into a log file.
'PRINT_RESULTS': True,
'PRINT_ONLY_COMBINED': False,
'PRINT_CONFIG': True,
'TIME_PROGRESS': True,
'DISPLAY_LESS_PROGRESS': True,
'OUTPUT_SUMMARY': True,
'OUTPUT_EMPTY_CLASSES': True, # If False, summary files are not output for classes with no detections
'OUTPUT_DETAILED': True,
'PLOT_CURVES': True,
Dataset arguments:
'GT_FOLDER': os.path.join(code_path, 'data/gt/youtube_vis/youtube_vis_training'), # Location of GT data
'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/youtube_vis/youtube_vis_training'),
# Trackers location
'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder)
'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes)
'SPLIT_TO_EVAL': 'training', # Valid: 'training', 'val'
'PRINT_CONFIG': True, # Whether to print current config
'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
'TRACKER_SUB_FOLDER': 'data', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
Metric arguments:
'METRICS': ['TrackMAP', 'HOTA', 'CLEAR', 'Identity']
"""
import argparse
import os
import sys
from multiprocessing import freeze_support
from . import trackeval
def run_ytvis_eval(args=None, gt_json=None, dt_json=None):
    """Run YouTube-VIS HOTA evaluation via the bundled TrackEval code.

    Args:
        args: optional list of command-line-style arguments. When None,
            argparse falls back to sys.argv (standard argparse behavior).
        gt_json: optional ground-truth annotations as an already-loaded JSON
            object; when given, no GT file needs to be read from disk.
        dt_json: optional tracker results as an already-loaded JSON object.

    Returns:
        The (output_res, output_msg) tuple produced by trackeval.Evaluator.

    Raises:
        Exception: if a boolean command-line value is not "True"/"False",
            or if no metric named in METRICS is available.
    """
    # Command line interface:
    default_eval_config = trackeval.Evaluator.get_default_eval_config()
    # print only combined since TrackMAP is undefined for per sequence breakdowns
    default_eval_config["PRINT_ONLY_COMBINED"] = True
    default_dataset_config = trackeval.datasets.YouTubeVIS.get_default_dataset_config()
    default_metrics_config = {"METRICS": ["HOTA"]}
    config = {
        **default_eval_config,
        **default_dataset_config,
        **default_metrics_config,
    }  # Merge default configs
    parser = argparse.ArgumentParser()
    for setting, default in config.items():
        # List-valued (or unset) settings accept multiple values.
        if isinstance(default, list) or default is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args(args).__dict__
    for setting, value in args.items():
        if value is None:
            continue
        default = config[setting]
        # Coerce the string(s) from argparse back to the default's type.
        # Note: bool must be checked before int, since bool is a subclass of int.
        if isinstance(default, bool):
            if value == "True":
                parsed = True
            elif value == "False":
                parsed = False
            else:
                raise Exception(
                    "Command line parameter " + setting + " must be True or False"
                )
        elif isinstance(default, int):
            parsed = int(value)
        else:
            parsed = value
        config[setting] = parsed
    # Split the merged config back into the three consumer-specific configs.
    eval_config = {k: v for k, v in config.items() if k in default_eval_config}
    dataset_config = {
        k: v for k, v in config.items() if k in default_dataset_config
    }
    metrics_config = {
        k: v for k, v in config.items() if k in default_metrics_config
    }
    # Run code
    evaluator = trackeval.Evaluator(eval_config)
    # allow directly specifying the GT JSON data and Tracker (result)
    # JSON data as Python objects, without reading from files.
    dataset_config["GT_JSON_OBJECT"] = gt_json
    dataset_config["TRACKER_JSON_OBJECT"] = dt_json
    dataset_list = [trackeval.datasets.YouTubeVIS(dataset_config)]
    # for metric in [trackeval.metrics.TrackMAP, trackeval.metrics.HOTA, trackeval.metrics.CLEAR,
    #                trackeval.metrics.Identity]:
    metrics_list = [
        metric()
        for metric in [trackeval.metrics.HOTA]
        if metric.get_name() in metrics_config["METRICS"]
    ]
    if not metrics_list:
        raise Exception("No metrics selected for evaluation")
    output_res, output_msg = evaluator.evaluate(dataset_list, metrics_list)
    return output_res, output_msg
if __name__ == "__main__":
    # `sys` is already imported at module scope; the previous local
    # `import sys` here was redundant.
    freeze_support()
    run_ytvis_eval(sys.argv[1:])

View File

@@ -0,0 +1,4 @@
# flake8: noqa
from . import datasets, metrics, utils
from .eval import Evaluator

View File

@@ -0,0 +1,68 @@
# flake8: noqa
import inspect
from functools import wraps
from time import perf_counter
# Global switches: timing is off by default and enabled by the Evaluator
# when config["TIME_PROGRESS"] is set (and USE_PARALLEL is off).
DO_TIMING = False
DISPLAY_LESS_PROGRESS = False

# Accumulated wall-clock time (seconds) per function/method name.
timer_dict = {}
# Running count of timed free-function calls, used to number printed lines.
counter = 0


def time(f):
    """Decorator that optionally times *f*, printing and accumulating results.

    When DO_TIMING is False the wrapped function is called straight through.
    When True, each call is timed with perf_counter, the duration is
    accumulated per name in ``timer_dict``, and a per-call line is printed
    (methods indented, free functions numbered). A summary of all
    accumulated times is printed once ``Evaluator.evaluate`` returns.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if not DO_TIMING:
            # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is
            # true, run functions normally without timing.
            return f(*args, **kw)
        # Run function with timing
        ts = perf_counter()
        result = f(*args, **kw)
        te = perf_counter()
        tt = te - ts
        # Get function name. Guard against zero-argument functions, for which
        # arg_names is empty (indexing arg_names[0] would raise IndexError).
        arg_names = inspect.getfullargspec(f)[0]
        is_method = bool(arg_names) and arg_names[0] == "self"
        if is_method and DISPLAY_LESS_PROGRESS:
            return result
        if is_method:
            method_name = type(args[0]).__name__ + "." + f.__name__
        else:
            method_name = f.__name__
        # Record accumulative time in each function for analysis
        if method_name in timer_dict:
            timer_dict[method_name] += tt
        else:
            timer_dict[method_name] = tt
        # If code is finished, display timing summary
        if method_name == "Evaluator.evaluate":
            print("")
            print("Timing analysis:")
            for key, value in timer_dict.items():
                print("%-70s %2.4f sec" % (key, value))
        else:
            # Get function argument values for printing special arguments of interest
            arg_titles = ["tracker", "seq", "cls"]
            arg_vals = []
            for i, a in enumerate(arg_names):
                if a in arg_titles:
                    arg_vals.append(args[i])
            arg_text = "(" + ", ".join(arg_vals) + ")"
            # Display methods and functions with different indentation.
            if is_method:
                print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
            elif arg_names and arg_names[0] == "test":
                pass
            else:
                global counter
                counter += 1
                print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))
        return result

    return wrap

View File

@@ -0,0 +1,4 @@
# flake8: noqa
from .tao_ow import TAO_OW
from .youtube_vis import YouTubeVIS

View File

@@ -0,0 +1,379 @@
# flake8: noqa
import csv
import io
import os
import traceback
import zipfile
from abc import ABC, abstractmethod
from copy import deepcopy
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseDataset(ABC):
    """Abstract base class for all TrackEval datasets.

    Subclasses implement loading of ground-truth and tracker files for one
    benchmark; this base class provides the shared helpers for text-file
    parsing, IoU/similarity computation and id-uniqueness checking.
    """

    @abstractmethod
    def __init__(self):
        self.tracker_list = None
        self.seq_list = None
        self.class_list = None
        self.output_fol = None
        self.output_sub_fol = None
        self.should_classes_combine = True
        self.use_super_categories = False

    # Functions to implement:

    @staticmethod
    @abstractmethod
    def get_default_dataset_config(): ...

    @abstractmethod
    def _load_raw_file(self, tracker, seq, is_gt): ...

    @_timing.time
    @abstractmethod
    def get_preprocessed_seq_data(self, raw_data, cls): ...

    @abstractmethod
    def _calculate_similarities(self, gt_dets_t, tracker_dets_t): ...

    # Helper functions for all datasets:

    @classmethod
    def get_class_name(cls):
        return cls.__name__

    def get_name(self):
        return self.get_class_name()

    def get_output_fol(self, tracker):
        # Per-tracker output directory: OUTPUT_FOLDER/tracker/OUTPUT_SUB_FOLDER.
        return os.path.join(self.output_fol, tracker, self.output_sub_fol)

    def get_display_name(self, tracker):
        """Can be overwritten if the trackers name (in files) is different to how it should be displayed.
        By default this method just returns the trackers name as is.
        """
        return tracker

    def get_eval_info(self):
        """Return info about the dataset needed for the Evaluator"""
        return self.tracker_list, self.seq_list, self.class_list

    @_timing.time
    def get_raw_seq_data(self, tracker, seq):
        """Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
        Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
        A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for
        the evaluation of each class.
        This returns a dict which contains the fields:
        [num_timesteps]: integer
        [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
        [similarity_scores]: list (for each timestep) of 2D NDArrays.
        [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).
        gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.
        Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
        independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
        masks vs 2D boxes vs 3D boxes).
        We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
        we don't wish to calculate this twice.
        We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
        calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
        """
        # Load raw data.
        raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
        raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
        raw_data = {**raw_tracker_data, **raw_gt_data}  # Merges dictionaries
        # Calculate similarities for each timestep.
        similarity_scores = []
        for gt_dets_t, tracker_dets_t in zip(
            raw_data["gt_dets"], raw_data["tracker_dets"]
        ):
            ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
            similarity_scores.append(ious)
        raw_data["similarity_scores"] = similarity_scores
        return raw_data

    @staticmethod
    def _load_simple_text_file(
        file,
        time_col=0,
        id_col=None,
        remove_negative_ids=False,
        valid_filter=None,
        crowd_ignore_filter=None,
        convert_filter=None,
        is_zipped=False,
        zip_file=None,
        force_delimiters=None,
    ):
        """Function that loads data which is in a commonly used text file format.
        Assumes each det is given by one row of a text file.
        There is no limit to the number or meaning of each column,
        however one column needs to give the timestep of each det (time_col) which is default col 0.
        The file dialect (deliminator, num cols, etc) is determined automatically.
        This function automatically separates dets by timestep,
        and is much faster than alternatives such as np.loadtext or pandas.
        If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
        These are not excluded from ignore data.
        valid_filter can be used to only include certain classes.
        It is a dict with ints as keys, and lists as values,
        such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
        If None, all classes are included.
        crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
        convert_filter can be used to convert value read to another format.
        This is used most commonly to convert classes given as string to a class id.
        This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
        Optionally, input files could be a zip of multiple text files for storage efficiency.
        Returns read_data and ignore_data.
        Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
        Note that all data is returned as strings, and must be converted to float/int later if needed.
        Note that timesteps will not be present in the returned dict keys if there are no dets for them
        """
        if remove_negative_ids and id_col is None:
            raise TrackEvalException(
                "remove_negative_ids is True, but id_col is not given."
            )
        if crowd_ignore_filter is None:
            crowd_ignore_filter = {}
        if convert_filter is None:
            convert_filter = {}
        try:
            if is_zipped:  # Either open file directly or within a zip.
                if zip_file is None:
                    raise TrackEvalException(
                        "is_zipped set to True, but no zip_file is given."
                    )
                archive = zipfile.ZipFile(os.path.join(zip_file), "r")
                fp = io.TextIOWrapper(archive.open(file, "r"))
            else:
                fp = open(file)
            read_data = {}
            crowd_ignore_data = {}
            fp.seek(0, os.SEEK_END)
            # check if file is empty
            if fp.tell():
                fp.seek(0)
                dialect = csv.Sniffer().sniff(
                    fp.readline(), delimiters=force_delimiters
                )  # Auto determine structure.
                dialect.skipinitialspace = (
                    True  # Deal with extra spaces between columns
                )
                fp.seek(0)
                reader = csv.reader(fp, dialect)
                for row in reader:
                    try:
                        # Deal with extra trailing spaces at the end of rows
                        if row[-1] == "":
                            row = row[:-1]
                        timestep = str(int(float(row[time_col])))
                        # Read ignore regions separately.
                        is_ignored = False
                        for ignore_key, ignore_value in crowd_ignore_filter.items():
                            if row[ignore_key].lower() in ignore_value:
                                # Convert values in one column (e.g. string to id)
                                for (
                                    convert_key,
                                    convert_value,
                                ) in convert_filter.items():
                                    row[convert_key] = convert_value[
                                        row[convert_key].lower()
                                    ]
                                # Save data separated by timestep.
                                if timestep in crowd_ignore_data.keys():
                                    crowd_ignore_data[timestep].append(row)
                                else:
                                    crowd_ignore_data[timestep] = [row]
                                is_ignored = True
                        if (
                            is_ignored
                        ):  # if det is an ignore region, it cannot be a normal det.
                            continue
                        # Exclude some dets if not valid.
                        # Bug fix: the previous `continue` only advanced the
                        # inner filter loop, so invalid rows were never
                        # actually skipped; use an explicit flag instead.
                        if valid_filter is not None:
                            is_valid = True
                            for key, value in valid_filter.items():
                                if row[key].lower() not in value:
                                    is_valid = False
                                    break
                            if not is_valid:
                                continue
                        if remove_negative_ids:
                            if int(float(row[id_col])) < 0:
                                continue
                        # Convert values in one column (e.g. string to id)
                        for convert_key, convert_value in convert_filter.items():
                            row[convert_key] = convert_value[row[convert_key].lower()]
                        # Save data separated by timestep.
                        if timestep in read_data.keys():
                            read_data[timestep].append(row)
                        else:
                            read_data[timestep] = [row]
                    except Exception:
                        exc_str_init = (
                            "In file %s the following line cannot be read correctly: \n"
                            % os.path.basename(file)
                        )
                        exc_str = " ".join([exc_str_init] + row)
                        raise TrackEvalException(exc_str)
            fp.close()
        except Exception:
            print("Error loading file: %s, printing traceback." % file)
            traceback.print_exc()
            raise TrackEvalException(
                "File %s cannot be read because it is either not present or invalidly formatted"
                % os.path.basename(file)
            )
        return read_data, crowd_ignore_data

    @staticmethod
    def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
        """Calculates the IOU (intersection over union) between two arrays of segmentation masks.
        If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
        arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
        If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly
        used to determine if detections are within crowd ignore region.
        :param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                       else pycocotools rle encoded format)
        :param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                       else pycocotools rle encoded format)
        :param is_encoded: whether the input is in pycocotools rle encoded format
        :param do_ioa: whether to perform IoA computation
        :return: the IoU/IoA scores
        """
        # Only loaded when run to reduce minimum requirements
        from pycocotools import mask as mask_utils

        # use pycocotools for run length encoding of masks
        if not is_encoded:
            masks1 = mask_utils.encode(
                np.array(np.transpose(masks1, (1, 2, 0)), order="F")
            )
            masks2 = mask_utils.encode(
                np.array(np.transpose(masks2, (1, 2, 0)), order="F")
            )
        # use pycocotools for iou computation of rle encoded masks
        ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
        if len(masks1) == 0 or len(masks2) == 0:
            # pycocotools returns [] for empty inputs; normalize the shape.
            ious = np.asarray(ious).reshape(len(masks1), len(masks2))
        assert (ious >= 0 - np.finfo("float").eps).all()
        assert (ious <= 1 + np.finfo("float").eps).all()
        return ious

    @staticmethod
    def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
        """Calculates the IOU (intersection over union) between two arrays of boxes.
        Allows variable box formats ('xywh' and 'x0y0x1y1').
        If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
        used to determine if detections are within crowd ignore region.
        """
        # Use exact equality: the previous substring test (`in "xywh"`)
        # silently accepted invalid formats such as "x" or "xy".
        if box_format == "xywh":
            # layout: (x0, y0, w, h)
            bboxes1 = deepcopy(bboxes1)
            bboxes2 = deepcopy(bboxes2)
            bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
            bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
            bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
            bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
        elif box_format != "x0y0x1y1":
            raise (TrackEvalException("box_format %s is not implemented" % box_format))
        # layout: (x0, y0, x1, y1)
        min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
            min_[..., 3] - max_[..., 1], 0
        )
        area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
            bboxes1[..., 3] - bboxes1[..., 1]
        )
        if do_ioa:
            ioas = np.zeros_like(intersection)
            valid_mask = area1 > 0 + np.finfo("float").eps
            ioas[valid_mask, :] = (
                intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
            )
            return ioas
        else:
            area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
                bboxes2[..., 3] - bboxes2[..., 1]
            )
            union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
            # Degenerate (zero-area) boxes contribute zero IoU.
            intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
            intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
            intersection[union <= 0 + np.finfo("float").eps] = 0
            union[union <= 0 + np.finfo("float").eps] = 1
            ious = intersection / union
            return ious

    @staticmethod
    def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
        """Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
        measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
        The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
        threshold corresponds to a 1m distance threshold for TPs.
        """
        dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
        sim = np.maximum(0, 1 - dist / zero_distance)
        return sim

    @staticmethod
    def _check_unique_ids(data, after_preproc=False):
        """Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
        gt_ids = data["gt_ids"]
        tracker_ids = data["tracker_ids"]
        for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
            if len(tracker_ids_t) > 0:
                unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
                if np.max(counts) != 1:
                    duplicate_ids = unique_ids[counts > 1]
                    exc_str_init = (
                        "Tracker predicts the same ID more than once in a single timestep "
                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
                    )
                    if after_preproc:
                        exc_str_init += (
                            "\n Note that this error occurred after preprocessing (but not before), "
                            "so ids may not be as in file, and something seems wrong with preproc."
                        )
                    # Build the message only after the optional note has been
                    # appended; previously it was appended too late and lost.
                    exc_str = (
                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
                    )
                    raise TrackEvalException(exc_str)
            if len(gt_ids_t) > 0:
                unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
                if np.max(counts) != 1:
                    duplicate_ids = unique_ids[counts > 1]
                    exc_str_init = (
                        "Ground-truth has the same ID more than once in a single timestep "
                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
                    )
                    if after_preproc:
                        exc_str_init += (
                            "\n Note that this error occurred after preprocessing (but not before), "
                            "so ids may not be as in file, and something seems wrong with preproc."
                        )
                    exc_str = (
                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
                    )
                    raise TrackEvalException(exc_str)

View File

@@ -0,0 +1,891 @@
# flake8: noqa
import itertools
import json
import os
from collections import defaultdict
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing, utils
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class TAO_OW(_BaseDataset):
"""Dataset class for TAO tracking"""
@staticmethod
def get_default_dataset_config():
    """Return the default configuration dict for the TAO_OW dataset."""
    code_path = utils.get_code_path()
    # Folder layout defaults mirror the repo's data/ directory structure.
    return {
        "GT_FOLDER": os.path.join(
            code_path, "data/gt/tao/tao_training"
        ),  # Location of GT data
        "TRACKERS_FOLDER": os.path.join(
            code_path, "data/trackers/tao/tao_training"
        ),  # Trackers location
        "OUTPUT_FOLDER": None,  # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "TRACKERS_TO_EVAL": None,  # Filenames of trackers to eval (if None, all in folder)
        "CLASSES_TO_EVAL": None,  # Classes to eval (if None, all classes)
        "SPLIT_TO_EVAL": "training",  # Valid: 'training', 'val'
        "PRINT_CONFIG": True,  # Whether to print current config
        "TRACKER_SUB_FOLDER": "data",  # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",  # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "TRACKER_DISPLAY_NAMES": None,  # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "MAX_DETECTIONS": 300,  # Number of maximal allowed detections per image (0 for unlimited)
        "SUBSET": "all",
    }
def __init__(self, config=None):
    """Initialise dataset, checking that all required files are present

    Loads the single GT json from GT_FOLDER, optionally filters it to a
    subset, builds per-sequence lookup tables, and loads/normalises one
    result json per tracker.
    """
    super().__init__()
    # Fill non-given config values with defaults
    self.config = utils.init_config(
        config, self.get_default_dataset_config(), self.get_name()
    )
    self.gt_fol = self.config["GT_FOLDER"]
    self.tracker_fol = self.config["TRACKERS_FOLDER"]
    self.should_classes_combine = True
    self.use_super_categories = False
    self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
    self.output_fol = self.config["OUTPUT_FOLDER"]
    if self.output_fol is None:
        self.output_fol = self.tracker_fol
    self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
    # GT_FOLDER must contain exactly one TAO-format annotation json.
    gt_dir_files = [
        file for file in os.listdir(self.gt_fol) if file.endswith(".json")
    ]
    if len(gt_dir_files) != 1:
        raise TrackEvalException(
            self.gt_fol + " does not contain exactly one json file."
        )
    with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
        self.gt_data = json.load(f)
    self.subset = self.config["SUBSET"]
    if self.subset != "all":
        # Split GT data into `known`, `unknown` or `distractor`
        self._split_known_unknown_distractor()
        self.gt_data = self._filter_gt_data(self.gt_data)
    # merge categories marked with a merged tag in TAO dataset
    self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])
    # Get sequences to eval and sequence information
    # (video names use '-' instead of '/' so they are filesystem-safe)
    self.seq_list = [
        vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
    ]
    self.seq_name_to_seq_id = {
        vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
    }
    # compute mappings from videos to annotation data
    self.videos_to_gt_tracks, self.videos_to_gt_images = self._compute_vid_mappings(
        self.gt_data["annotations"]
    )
    # compute sequence lengths (number of images per video)
    self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
    for img in self.gt_data["images"]:
        self.seq_lengths[img["video_id"]] += 1
    self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings()
    # Per-sequence positive / negative / not-exhaustively-labeled category ids.
    self.seq_to_classes = {
        vid["id"]: {
            "pos_cat_ids": list(
                {
                    track["category_id"]
                    for track in self.videos_to_gt_tracks[vid["id"]]
                }
            ),
            "neg_cat_ids": vid["neg_category_ids"],
            "not_exhaustively_labeled_cat_ids": vid["not_exhaustive_category_ids"],
        }
        for vid in self.gt_data["videos"]
    }
    # Get classes to eval
    considered_vid_ids = [self.seq_name_to_seq_id[vid] for vid in self.seq_list]
    seen_cats = set(
        [
            cat_id
            for vid_id in considered_vid_ids
            for cat_id in self.seq_to_classes[vid_id]["pos_cat_ids"]
        ]
    )
    # only classes with ground truth are evaluated in TAO
    self.valid_classes = [
        cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
    ]
    # cls_name_to_cls_id_map = {cls['name']: cls['id'] for cls in self.gt_data['categories']}
    # NOTE: this open-world variant always evaluates a single class-agnostic
    # "object" class, regardless of CLASSES_TO_EVAL.
    if self.config["CLASSES_TO_EVAL"]:
        # self.class_list = [cls.lower() if cls.lower() in self.valid_classes else None
        #                    for cls in self.config['CLASSES_TO_EVAL']]
        self.class_list = ["object"]  # class-agnostic
        if not all(self.class_list):
            raise TrackEvalException(
                "Attempted to evaluate an invalid class. Only classes "
                + ", ".join(self.valid_classes)
                + " are valid (classes present in ground truth data)."
            )
    else:
        # self.class_list = [cls for cls in self.valid_classes]
        self.class_list = ["object"]  # class-agnostic
    # self.class_name_to_class_id = {k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list}
    self.class_name_to_class_id = {"object": 1}  # class-agnostic
    # Get trackers to eval
    if self.config["TRACKERS_TO_EVAL"] is None:
        self.tracker_list = os.listdir(self.tracker_fol)
    else:
        self.tracker_list = self.config["TRACKERS_TO_EVAL"]
    if self.config["TRACKER_DISPLAY_NAMES"] is None:
        self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
    elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
        len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
    ):
        self.tracker_to_disp = dict(
            zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
        )
    else:
        raise TrackEvalException(
            "List of tracker files and tracker display names do not match."
        )
    # Load and normalise each tracker's single result json.
    self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
    for tracker in self.tracker_list:
        tr_dir_files = [
            file
            for file in os.listdir(
                os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
            )
            if file.endswith(".json")
        ]
        if len(tr_dir_files) != 1:
            raise TrackEvalException(
                os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
                + " does not contain exactly one json file."
            )
        with open(
            os.path.join(
                self.tracker_fol, tracker, self.tracker_sub_fol, tr_dir_files[0]
            )
        ) as f:
            curr_data = json.load(f)
        # limit detections if MAX_DETECTIONS > 0
        if self.config["MAX_DETECTIONS"]:
            curr_data = self._limit_dets_per_image(curr_data)
        # fill missing video ids
        self._fill_video_ids_inplace(curr_data)
        # make track ids unique over whole evaluation set
        self._make_track_ids_unique(curr_data)
        # merge categories marked with a merged tag in TAO dataset
        self._merge_categories(curr_data)
        # get tracker sequence information
        curr_videos_to_tracker_tracks, curr_videos_to_tracker_images = (
            self._compute_vid_mappings(curr_data)
        )
        self.tracker_data[tracker]["vids_to_tracks"] = curr_videos_to_tracker_tracks
        self.tracker_data[tracker]["vids_to_images"] = curr_videos_to_tracker_images
def get_display_name(self, tracker):
    """Return the configured display name for *tracker*."""
    display_names = self.tracker_to_disp
    return display_names[tracker]
def _load_raw_file(self, tracker, seq, is_gt):
    """Load a file (gt or tracker) in the TAO format
    If is_gt, this returns a dict which contains the fields:
    [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets]: list (for each timestep) of lists of detections.
    [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
                            keys and corresponding segmentations as values) for each track
    [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values
                            as keys and lists (for each track) as values
    if not is_gt, this returns a dict which contains the fields:
    [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
    [tracker_dets]: list (for each timestep) of lists of detections.
    [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
                            keys and corresponding segmentations as values) for each track
    [classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values
                            as keys and lists as values
    [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
    """
    seq_id = self.seq_name_to_seq_id[seq]
    # File location
    if is_gt:
        imgs = self.videos_to_gt_images[seq_id]
    else:
        imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
    # Convert data to required format
    num_timesteps = self.seq_lengths[seq_id]
    img_to_timestep = self.seq_to_images_to_timestep[seq_id]
    data_keys = ["ids", "classes", "dets"]
    if not is_gt:
        data_keys += ["tracker_confidences"]
    # One slot per timestep; slots left as None are filled with empties below.
    raw_data = {key: [None] * num_timesteps for key in data_keys}
    for img in imgs:
        # some tracker data contains images without any ground truth information, these are ignored
        try:
            t = img_to_timestep[img["id"]]
        except KeyError:
            continue
        annotations = img["annotations"]
        raw_data["dets"][t] = np.atleast_2d(
            [ann["bbox"] for ann in annotations]
        ).astype(float)
        raw_data["ids"][t] = np.atleast_1d(
            [ann["track_id"] for ann in annotations]
        ).astype(int)
        raw_data["classes"][t] = np.atleast_1d([1 for _ in annotations]).astype(
            int
        )  # class-agnostic
        if not is_gt:
            raw_data["tracker_confidences"][t] = np.atleast_1d(
                [ann["score"] for ann in annotations]
            ).astype(float)
    # Replace empty timesteps with correctly-shaped empty arrays.
    for t, d in enumerate(raw_data["dets"]):
        if d is None:
            raw_data["dets"][t] = np.empty((0, 4)).astype(float)
            raw_data["ids"][t] = np.empty(0).astype(int)
            raw_data["classes"][t] = np.empty(0).astype(int)
            if not is_gt:
                raw_data["tracker_confidences"][t] = np.empty(0)
    # Rename generic keys to gt_*/tracker_* depending on the data source.
    if is_gt:
        key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
    else:
        key_map = {
            "ids": "tracker_ids",
            "classes": "tracker_classes",
            "dets": "tracker_dets",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)
    # all_classes = [self.class_name_to_class_id[cls] for cls in self.class_list]
    all_classes = [1]  # class-agnostic
    if is_gt:
        classes_to_consider = all_classes
        all_tracks = self.videos_to_gt_tracks[seq_id]
    else:
        # classes_to_consider = self.seq_to_classes[seq_id]['pos_cat_ids'] \
        #     + self.seq_to_classes[seq_id]['neg_cat_ids']
        classes_to_consider = all_classes  # class-agnostic
        all_tracks = self.tracker_data[tracker]["vids_to_tracks"][seq_id]
    # classes_to_tracks = {cls: [track for track in all_tracks if track['category_id'] == cls]
    #                      if cls in classes_to_consider else [] for cls in all_classes}
    classes_to_tracks = {
        cls: [track for track in all_tracks] if cls in classes_to_consider else []
        for cls in all_classes
    }  # class-agnostic
    # mapping from classes to track information
    raw_data["classes_to_tracks"] = {
        cls: [
            {
                det["image_id"]: np.atleast_1d(det["bbox"])
                for det in track["annotations"]
            }
            for track in tracks
        ]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_ids"] = {
        cls: [track["id"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_areas"] = {
        cls: [track["area"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_lengths"] = {
        cls: [len(track["annotations"]) for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    if not is_gt:
        # Track score = mean of the track's per-detection scores.
        raw_data["classes_to_dt_track_scores"] = {
            cls: np.array(
                [
                    np.mean([float(x["score"]) for x in track["annotations"]])
                    for track in tracks
                ]
            )
            for cls, tracks in classes_to_tracks.items()
        }
    # Rename the class-to-track mappings to gt_*/dt_* variants.
    if is_gt:
        key_map = {
            "classes_to_tracks": "classes_to_gt_tracks",
            "classes_to_track_ids": "classes_to_gt_track_ids",
            "classes_to_track_lengths": "classes_to_gt_track_lengths",
            "classes_to_track_areas": "classes_to_gt_track_areas",
        }
    else:
        key_map = {
            "classes_to_tracks": "classes_to_dt_tracks",
            "classes_to_track_ids": "classes_to_dt_track_ids",
            "classes_to_track_lengths": "classes_to_dt_track_lengths",
            "classes_to_track_areas": "classes_to_dt_track_areas",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)
    raw_data["num_timesteps"] = num_timesteps
    raw_data["neg_cat_ids"] = self.seq_to_classes[seq_id]["neg_cat_ids"]
    raw_data["not_exhaustively_labeled_cls"] = self.seq_to_classes[seq_id][
        "not_exhaustively_labeled_cat_ids"
    ]
    raw_data["seq"] = seq
    return raw_data
    @_timing.time
    def get_preprocessed_seq_data(self, raw_data, cls):
        """Preprocess data for a single sequence for a single class ready for evaluation.
        Inputs:
            - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
            - cls is the class to be evaluated.
        Outputs:
            - data is a dict containing all of the information that metrics need to perform evaluation.
              It contains the following fields:
                [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
                [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                [similarity_scores]: list (for each timestep) of 2D NDArrays.
        Notes:
            General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
                1) Extract only detections relevant for the class to be evaluated (including distractor detections).
                2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
                    distractor class, or otherwise marked as to be removed.
                3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
                    other criteria (e.g. are too small).
                4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
            After the above preprocessing steps, this function also calculates the number of gt and tracker detections
            and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
            unique within each timestep.
        TAO:
            In TAO, the 4 preproc steps are as follow:
                1) All classes present in the ground truth data are evaluated separately.
                2) No matched tracker detections are removed.
                3) Unmatched tracker detections are removed if there is not ground truth data and the class does not
                    belong to the categories marked as negative for this sequence. Additionally, unmatched tracker
                    detections for classes which are marked as not exhaustively labeled are removed.
                4) No gt detections are removed.
            Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
            and the tracks from the tracker data are sorted according to the tracker confidence.
        """
        cls_id = self.class_name_to_class_id[cls]
        # flags that drive preproc step 3 (removal of unmatched tracker dets)
        is_not_exhaustively_labeled = cls_id in raw_data["not_exhaustively_labeled_cls"]
        is_neg_category = cls_id in raw_data["neg_cat_ids"]
        data_keys = [
            "gt_ids",
            "tracker_ids",
            "gt_dets",
            "tracker_dets",
            "tracker_confidences",
            "similarity_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tracker_ids = []
        num_gt_dets = 0
        num_tracker_dets = 0
        for t in range(raw_data["num_timesteps"]):
            # Only extract relevant dets for this class for preproc and eval (cls)
            gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
            gt_class_mask = gt_class_mask.astype(bool)
            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            gt_dets = raw_data["gt_dets"][t][gt_class_mask]
            tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
            tracker_class_mask = tracker_class_mask.astype(bool)
            tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
            tracker_dets = raw_data["tracker_dets"][t][tracker_class_mask]
            tracker_confidences = raw_data["tracker_confidences"][t][tracker_class_mask]
            # restrict the similarity matrix to the rows/cols of this class
            similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
                :, tracker_class_mask
            ]
            # Match tracker and gt dets (with hungarian algorithm).
            unmatched_indices = np.arange(tracker_ids.shape[0])
            if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0:
                matching_scores = similarity_scores.copy()
                # zero out IoUs below 0.5 so the assignment cannot create
                # sub-threshold matches
                matching_scores[matching_scores < 0.5 - np.finfo("float").eps] = 0
                match_rows, match_cols = linear_sum_assignment(-matching_scores)
                # keep only assignments with a strictly positive score,
                # i.e. pairs whose IoU passed the 0.5 threshold above
                actually_matched_mask = (
                    matching_scores[match_rows, match_cols] > 0 + np.finfo("float").eps
                )
                match_cols = match_cols[actually_matched_mask]
                unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0)
            # TAO preproc step 3: drop unmatched tracker dets when there is no
            # GT for this class (unless it is a negative category) or when the
            # class is not exhaustively labeled in this sequence
            if gt_ids.shape[0] == 0 and not is_neg_category:
                to_remove_tracker = unmatched_indices
            elif is_not_exhaustively_labeled:
                to_remove_tracker = unmatched_indices
            else:
                to_remove_tracker = np.array([], dtype=int)
            # remove all unwanted unmatched tracker detections
            data["tracker_ids"][t] = np.delete(tracker_ids, to_remove_tracker, axis=0)
            data["tracker_dets"][t] = np.delete(tracker_dets, to_remove_tracker, axis=0)
            data["tracker_confidences"][t] = np.delete(
                tracker_confidences, to_remove_tracker, axis=0
            )
            similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1)
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets
            data["similarity_scores"][t] = similarity_scores
            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
            num_tracker_dets += len(data["tracker_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])
        # Re-label IDs such that there are no empty IDs
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            # nan entries mark ids that never occur; they are never indexed below
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
        if len(unique_tracker_ids) > 0:
            unique_tracker_ids = np.unique(unique_tracker_ids)
            tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
            tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
            for t in range(raw_data["num_timesteps"]):
                if len(data["tracker_ids"][t]) > 0:
                    data["tracker_ids"][t] = tracker_id_map[
                        data["tracker_ids"][t]
                    ].astype(int)
        # Record overview statistics.
        data["num_tracker_dets"] = num_tracker_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tracker_ids"] = len(unique_tracker_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]
        # get track representations
        data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
        data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
        data["gt_track_lengths"] = raw_data["classes_to_gt_track_lengths"][cls_id]
        data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
        data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
        data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
        data["dt_track_lengths"] = raw_data["classes_to_dt_track_lengths"][cls_id]
        data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
        data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
        data["not_exhaustively_labeled"] = is_not_exhaustively_labeled
        data["iou_type"] = "bbox"
        # sort tracker data tracks by tracker confidence scores
        if data["dt_tracks"]:
            idx = np.argsort(
                [-score for score in data["dt_track_scores"]], kind="mergesort"
            )
            data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
            data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
            data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
            data["dt_track_lengths"] = [data["dt_track_lengths"][i] for i in idx]
            data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
        # Ensure that ids are unique per timestep.
        self._check_unique_ids(data)
        return data
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
similarity_scores = self._calculate_box_ious(gt_dets_t, tracker_dets_t)
return similarity_scores
def _merge_categories(self, annotations):
"""
Merges categories with a merged tag. Adapted from https://github.com/TAO-Dataset
:param annotations: the annotations in which the classes should be merged
:return: None
"""
merge_map = {}
for category in self.gt_data["categories"]:
if "merged" in category:
for to_merge in category["merged"]:
merge_map[to_merge["id"]] = category["id"]
for ann in annotations:
ann["category_id"] = merge_map.get(ann["category_id"], ann["category_id"])
def _compute_vid_mappings(self, annotations):
"""
Computes mappings from Videos to corresponding tracks and images.
:param annotations: the annotations for which the mapping should be generated
:return: the video-to-track-mapping, the video-to-image-mapping
"""
vids_to_tracks = {}
vids_to_imgs = {}
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
# compute an mapping from image IDs to images
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
for ann in annotations:
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
vid = ann["video_id"]
if ann["video_id"] not in vids_to_tracks.keys():
vids_to_tracks[ann["video_id"]] = list()
if ann["video_id"] not in vids_to_imgs.keys():
vids_to_imgs[ann["video_id"]] = list()
# Fill in vids_to_tracks
tid = ann["track_id"]
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
try:
index1 = exist_tids.index(tid)
except ValueError:
index1 = -1
if tid not in exist_tids:
curr_track = {
"id": tid,
"category_id": ann["category_id"],
"video_id": vid,
"annotations": [ann],
}
vids_to_tracks[vid].append(curr_track)
else:
vids_to_tracks[vid][index1]["annotations"].append(ann)
# Fill in vids_to_imgs
img_id = ann["image_id"]
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
try:
index2 = exist_img_ids.index(img_id)
except ValueError:
index2 = -1
if index2 == -1:
curr_img = {"id": img_id, "annotations": [ann]}
vids_to_imgs[vid].append(curr_img)
else:
vids_to_imgs[vid][index2]["annotations"].append(ann)
# sort annotations by frame index and compute track area
for vid, tracks in vids_to_tracks.items():
for track in tracks:
track["annotations"] = sorted(
track["annotations"],
key=lambda x: images[x["image_id"]]["frame_index"],
)
# Computer average area
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
track["annotations"]
)
# Ensure all videos are present
for vid_id in vid_ids:
if vid_id not in vids_to_tracks.keys():
vids_to_tracks[vid_id] = []
if vid_id not in vids_to_imgs.keys():
vids_to_imgs[vid_id] = []
return vids_to_tracks, vids_to_imgs
def _compute_image_to_timestep_mappings(self):
"""
Computes a mapping from images to the corresponding timestep in the sequence.
:return: the image-to-timestep-mapping
"""
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
for vid in seq_to_imgs_to_timestep:
curr_imgs = [img["id"] for img in self.videos_to_gt_images[vid]]
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_index"])
seq_to_imgs_to_timestep[vid] = {
curr_imgs[i]: i for i in range(len(curr_imgs))
}
return seq_to_imgs_to_timestep
def _limit_dets_per_image(self, annotations):
"""
Limits the number of detections for each image to config['MAX_DETECTIONS']. Adapted from
https://github.com/TAO-Dataset/
:param annotations: the annotations in which the detections should be limited
:return: the annotations with limited detections
"""
max_dets = self.config["MAX_DETECTIONS"]
img_ann = defaultdict(list)
for ann in annotations:
img_ann[ann["image_id"]].append(ann)
for img_id, _anns in img_ann.items():
if len(_anns) <= max_dets:
continue
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
img_ann[img_id] = _anns[:max_dets]
return [ann for anns in img_ann.values() for ann in anns]
def _fill_video_ids_inplace(self, annotations):
"""
Fills in missing video IDs inplace. Adapted from https://github.com/TAO-Dataset/
:param annotations: the annotations for which the videos IDs should be filled inplace
:return: None
"""
missing_video_id = [x for x in annotations if "video_id" not in x]
if missing_video_id:
image_id_to_video_id = {
x["id"]: x["video_id"] for x in self.gt_data["images"]
}
for x in missing_video_id:
x["video_id"] = image_id_to_video_id[x["image_id"]]
@staticmethod
def _make_track_ids_unique(annotations):
"""
Makes the track IDs unqiue over the whole annotation set. Adapted from https://github.com/TAO-Dataset/
:param annotations: the annotation set
:return: the number of updated IDs
"""
track_id_videos = {}
track_ids_to_update = set()
max_track_id = 0
for ann in annotations:
t = ann["track_id"]
if t not in track_id_videos:
track_id_videos[t] = ann["video_id"]
if ann["video_id"] != track_id_videos[t]:
# Track id is assigned to multiple videos
track_ids_to_update.add(t)
max_track_id = max(max_track_id, t)
if track_ids_to_update:
print("true")
next_id = itertools.count(max_track_id + 1)
new_track_ids = defaultdict(lambda: next(next_id))
for ann in annotations:
t = ann["track_id"]
v = ann["video_id"]
if t in track_ids_to_update:
ann["track_id"] = new_track_ids[t, v]
return len(track_ids_to_update)
def _split_known_unknown_distractor(self):
all_ids = set(
[i for i in range(1, 2000)]
) # 2000 is larger than the max category id in TAO-OW.
# `knowns` includes 78 TAO_category_ids that corresponds to 78 COCO classes.
# (The other 2 COCO classes do not have corresponding classes in TAO).
self.knowns = {
4,
13,
1038,
544,
1057,
34,
35,
36,
41,
45,
58,
60,
579,
1091,
1097,
1099,
78,
79,
81,
91,
1115,
1117,
95,
1122,
99,
1132,
621,
1135,
625,
118,
1144,
126,
642,
1155,
133,
1162,
139,
154,
174,
185,
699,
1215,
714,
717,
1229,
211,
729,
221,
229,
747,
235,
237,
779,
276,
805,
299,
829,
852,
347,
371,
382,
896,
392,
926,
937,
428,
429,
961,
452,
979,
980,
982,
475,
480,
993,
1001,
502,
1018,
}
# `distractors` is defined as in the paper "Opening up Open-World Tracking"
self.distractors = {
20,
63,
108,
180,
188,
204,
212,
247,
303,
403,
407,
415,
490,
504,
507,
513,
529,
567,
569,
588,
672,
691,
702,
708,
711,
720,
736,
737,
798,
813,
815,
827,
831,
851,
877,
883,
912,
971,
976,
1130,
1133,
1134,
1169,
1184,
1220,
}
self.unknowns = all_ids.difference(self.knowns.union(self.distractors))
def _filter_gt_data(self, raw_gt_data):
"""
Filter out irrelevant data in the raw_gt_data
Args:
raw_gt_data: directly loaded from json.
Returns:
filtered gt_data
"""
valid_cat_ids = list()
if self.subset == "known":
valid_cat_ids = self.knowns
elif self.subset == "distractor":
valid_cat_ids = self.distractors
elif self.subset == "unknown":
valid_cat_ids = self.unknowns
# elif self.subset == "test_only_unknowns":
# valid_cat_ids = test_only_unknowns
else:
raise Exception("The parameter `SUBSET` is incorrect")
filtered = dict()
filtered["videos"] = raw_gt_data["videos"]
# filtered["videos"] = list()
unwanted_vid = set()
# for video in raw_gt_data["videos"]:
# datasrc = video["name"].split('/')[1]
# if datasrc in data_srcs:
# filtered["videos"].append(video)
# else:
# unwanted_vid.add(video["id"])
filtered["annotations"] = list()
for ann in raw_gt_data["annotations"]:
if (ann["video_id"] not in unwanted_vid) and (
ann["category_id"] in valid_cat_ids
):
filtered["annotations"].append(ann)
filtered["tracks"] = list()
for track in raw_gt_data["tracks"]:
if (track["video_id"] not in unwanted_vid) and (
track["category_id"] in valid_cat_ids
):
filtered["tracks"].append(track)
filtered["images"] = list()
for image in raw_gt_data["images"]:
if image["video_id"] not in unwanted_vid:
filtered["images"].append(image)
filtered["categories"] = list()
for cat in raw_gt_data["categories"]:
if cat["id"] in valid_cat_ids:
filtered["categories"].append(cat)
filtered["info"] = raw_gt_data["info"]
filtered["licenses"] = raw_gt_data["licenses"]
return filtered

View File

@@ -0,0 +1,524 @@
# flake8: noqa
# note: this file has been modified from its original version in TrackEval in
# https://github.com/JonathonLuiten/TrackEval/blob/master/trackeval/datasets/youtube_vis.py
# to support the following:
# 1) bbox evaluation (via `IOU_TYPE`)
# 2) passing GT and prediction data as Python objects (via `GT_JSON_OBJECT` and `TRACKER_JSON_OBJECT`)
# 3) specifying a custom dataset name (via `DATASET_NAME`)
import json
import os
import numpy as np
from .. import _timing, utils
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class YouTubeVIS(_BaseDataset):
"""Dataset class for YouTubeVIS tracking"""
@staticmethod
def get_default_dataset_config():
"""Default class config values"""
code_path = utils.get_code_path()
default_config = {
"GT_FOLDER": os.path.join(
code_path, "data/gt/youtube_vis/"
), # Location of GT data
"TRACKERS_FOLDER": os.path.join(code_path, "data/trackers/youtube_vis/"),
# Trackers location
"OUTPUT_FOLDER": None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
"TRACKERS_TO_EVAL": None, # Filenames of trackers to eval (if None, all in folder)
"CLASSES_TO_EVAL": None, # Classes to eval (if None, all classes)
"SPLIT_TO_EVAL": "train_sub_split", # Valid: 'train', 'val', 'train_sub_split'
"PRINT_CONFIG": True, # Whether to print current config
"OUTPUT_SUB_FOLDER": "", # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
"TRACKER_SUB_FOLDER": "data", # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
"TRACKER_DISPLAY_NAMES": None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
# Added for video phrase AP evaluation -- allow directly specifying the GT JSON data and Tracker (result)
# JSON data as Python objects, without reading from files.
"GT_JSON_OBJECT": None,
"TRACKER_JSON_OBJECT": None,
"IOU_TYPE": "segm",
"DATASET_NAME": "video",
}
return default_config
def __init__(self, config=None):
"""Initialise dataset, checking that all required files are present"""
super().__init__()
# Fill non-given config values with defaults
self.config = utils.init_config(config, self.get_default_dataset_config())
self.gt_fol = (
self.config["GT_FOLDER"] + "youtube_vis_" + self.config["SPLIT_TO_EVAL"]
)
self.tracker_fol = (
self.config["TRACKERS_FOLDER"]
+ "youtube_vis_"
+ self.config["SPLIT_TO_EVAL"]
)
self.use_super_categories = False
self.should_classes_combine = True
assert self.config["IOU_TYPE"] in ["segm", "bbox"]
self.iou_type = self.config["IOU_TYPE"]
print("=" * 100)
print(f"Evaluate annotation type *{self.iou_type}*")
self.dataset_name = self.config["DATASET_NAME"]
self.output_fol = self.config["OUTPUT_FOLDER"]
if self.output_fol is None:
self.output_fol = self.tracker_fol
self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
if self.config["GT_JSON_OBJECT"] is not None:
# allow directly specifying the GT JSON data without reading from files
gt_json = self.config["GT_JSON_OBJECT"]
assert isinstance(gt_json, dict)
assert "videos" in gt_json
assert "categories" in gt_json
assert "annotations" in gt_json
self.gt_data = gt_json
else:
if not os.path.exists(self.gt_fol):
print("GT folder not found: " + self.gt_fol)
raise TrackEvalException(
"GT folder not found: " + os.path.basename(self.gt_fol)
)
gt_dir_files = [
file for file in os.listdir(self.gt_fol) if file.endswith(".json")
]
if len(gt_dir_files) != 1:
raise TrackEvalException(
self.gt_fol + " does not contain exactly one json file."
)
with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
self.gt_data = json.load(f)
# Get classes to eval
self.valid_classes = [cls["name"] for cls in self.gt_data["categories"]]
cls_name_to_cls_id_map = {
cls["name"]: cls["id"] for cls in self.gt_data["categories"]
}
if self.config["CLASSES_TO_EVAL"]:
self.class_list = [
cls.lower() if cls.lower() in self.valid_classes else None
for cls in self.config["CLASSES_TO_EVAL"]
]
if not all(self.class_list):
raise TrackEvalException(
"Attempted to evaluate an invalid class. Only classes "
+ ", ".join(self.valid_classes)
+ " are valid."
)
else:
self.class_list = [cls["name"] for cls in self.gt_data["categories"]]
self.class_name_to_class_id = {
k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list
}
# Get sequences to eval and check gt files exist
self.seq_list = [
vid["file_names"][0].split("/")[0] for vid in self.gt_data["videos"]
]
self.seq_name_to_seq_id = {
vid["file_names"][0].split("/")[0]: vid["id"]
for vid in self.gt_data["videos"]
}
self.seq_lengths = {
vid["id"]: len(vid["file_names"]) for vid in self.gt_data["videos"]
}
# encode masks and compute track areas
self._prepare_gt_annotations()
# Get trackers to eval
if self.config["TRACKER_JSON_OBJECT"] is not None:
# allow directly specifying the tracker JSON data without reading from files
tracker_json = self.config["TRACKER_JSON_OBJECT"]
assert isinstance(tracker_json, list)
self.tracker_list = ["tracker"]
elif self.config["TRACKERS_TO_EVAL"] is None:
self.tracker_list = os.listdir(self.tracker_fol)
else:
self.tracker_list = self.config["TRACKERS_TO_EVAL"]
if self.config["TRACKER_DISPLAY_NAMES"] is None:
self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
):
self.tracker_to_disp = dict(
zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
)
else:
raise TrackEvalException(
"List of tracker files and tracker display names do not match."
)
# counter for globally unique track IDs
self.global_tid_counter = 0
self.tracker_data = dict()
if self.config["TRACKER_JSON_OBJECT"] is not None:
# allow directly specifying the tracker JSON data without reading from files
tracker = self.tracker_list[0]
self.tracker_data[tracker] = tracker_json
else:
for tracker in self.tracker_list:
tracker_dir_path = os.path.join(
self.tracker_fol, tracker, self.tracker_sub_fol
)
tr_dir_files = [
file
for file in os.listdir(tracker_dir_path)
if file.endswith(".json")
]
if len(tr_dir_files) != 1:
raise TrackEvalException(
tracker_dir_path + " does not contain exactly one json file."
)
with open(os.path.join(tracker_dir_path, tr_dir_files[0])) as f:
curr_data = json.load(f)
self.tracker_data[tracker] = curr_data
def get_display_name(self, tracker):
return self.tracker_to_disp[tracker]
def _load_raw_file(self, tracker, seq, is_gt):
"""Load a file (gt or tracker) in the YouTubeVIS format
If is_gt, this returns a dict which contains the fields:
[gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
[gt_dets]: list (for each timestep) of lists of detections.
[classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
keys and corresponding segmentations as values) for each track
[classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_iscrowd]: dictionary with class values
as keys and lists (for each track) as values
if not is_gt, this returns a dict which contains the fields:
[tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
[tracker_dets]: list (for each timestep) of lists of detections.
[classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
keys and corresponding segmentations as values) for each track
[classes_to_dt_track_ids, classes_to_dt_track_areas]: dictionary with class values as keys and lists as values
[classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
"""
# select sequence tracks
seq_id = self.seq_name_to_seq_id[seq]
if is_gt:
tracks = [
ann for ann in self.gt_data["annotations"] if ann["video_id"] == seq_id
]
else:
tracks = self._get_tracker_seq_tracks(tracker, seq_id)
# Convert data to required format
num_timesteps = self.seq_lengths[seq_id]
data_keys = ["ids", "classes", "dets"]
if not is_gt:
data_keys += ["tracker_confidences"]
raw_data = {key: [None] * num_timesteps for key in data_keys}
result_key = "segmentations" if self.iou_type == "segm" else "bboxes"
for t in range(num_timesteps):
raw_data["dets"][t] = [
track[result_key][t] for track in tracks if track[result_key][t]
]
raw_data["ids"][t] = np.atleast_1d(
[track["id"] for track in tracks if track[result_key][t]]
).astype(int)
raw_data["classes"][t] = np.atleast_1d(
[track["category_id"] for track in tracks if track[result_key][t]]
).astype(int)
if not is_gt:
raw_data["tracker_confidences"][t] = np.atleast_1d(
[track["score"] for track in tracks if track[result_key][t]]
).astype(float)
if is_gt:
key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
else:
key_map = {
"ids": "tracker_ids",
"classes": "tracker_classes",
"dets": "tracker_dets",
}
for k, v in key_map.items():
raw_data[v] = raw_data.pop(k)
all_cls_ids = {self.class_name_to_class_id[cls] for cls in self.class_list}
classes_to_tracks = {
cls: [track for track in tracks if track["category_id"] == cls]
for cls in all_cls_ids
}
# mapping from classes to track representations and track information
raw_data["classes_to_tracks"] = {
cls: [
{i: track[result_key][i] for i in range(len(track[result_key]))}
for track in tracks
]
for cls, tracks in classes_to_tracks.items()
}
raw_data["classes_to_track_ids"] = {
cls: [track["id"] for track in tracks]
for cls, tracks in classes_to_tracks.items()
}
raw_data["classes_to_track_areas"] = {
cls: [track["area"] for track in tracks]
for cls, tracks in classes_to_tracks.items()
}
if is_gt:
raw_data["classes_to_gt_track_iscrowd"] = {
cls: [track["iscrowd"] for track in tracks]
for cls, tracks in classes_to_tracks.items()
}
else:
raw_data["classes_to_dt_track_scores"] = {
cls: np.array([track["score"] for track in tracks])
for cls, tracks in classes_to_tracks.items()
}
if is_gt:
key_map = {
"classes_to_tracks": "classes_to_gt_tracks",
"classes_to_track_ids": "classes_to_gt_track_ids",
"classes_to_track_areas": "classes_to_gt_track_areas",
}
else:
key_map = {
"classes_to_tracks": "classes_to_dt_tracks",
"classes_to_track_ids": "classes_to_dt_track_ids",
"classes_to_track_areas": "classes_to_dt_track_areas",
}
for k, v in key_map.items():
raw_data[v] = raw_data.pop(k)
raw_data["num_timesteps"] = num_timesteps
raw_data["seq"] = seq
return raw_data
@_timing.time
def get_preprocessed_seq_data(self, raw_data, cls):
"""Preprocess data for a single sequence for a single class ready for evaluation.
Inputs:
- raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
- cls is the class to be evaluated.
Outputs:
- data is a dict containing all of the information that metrics need to perform evaluation.
It contains the following fields:
[num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
[gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
[gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
[similarity_scores]: list (for each timestep) of 2D NDArrays.
Notes:
General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
1) Extract only detections relevant for the class to be evaluated (including distractor detections).
2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
distractor class, or otherwise marked as to be removed.
3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
other criteria (e.g. are too small).
4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
After the above preprocessing steps, this function also calculates the number of gt and tracker detections
and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
unique within each timestep.
YouTubeVIS:
In YouTubeVIS, the 4 preproc steps are as follow:
1) There are 40 classes which are evaluated separately.
2) No matched tracker dets are removed.
3) No unmatched tracker dets are removed.
4) No gt dets are removed.
Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
and the tracks from the tracker data are sorted according to the tracker confidence.
"""
cls_id = self.class_name_to_class_id[cls]
data_keys = [
"gt_ids",
"tracker_ids",
"gt_dets",
"tracker_dets",
"similarity_scores",
]
data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
unique_gt_ids = []
unique_tracker_ids = []
num_gt_dets = 0
num_tracker_dets = 0
for t in range(raw_data["num_timesteps"]):
# Only extract relevant dets for this class for eval (cls)
gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
gt_class_mask = gt_class_mask.astype(bool)
gt_ids = raw_data["gt_ids"][t][gt_class_mask]
gt_dets = [
raw_data["gt_dets"][t][ind]
for ind in range(len(gt_class_mask))
if gt_class_mask[ind]
]
tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
tracker_class_mask = tracker_class_mask.astype(bool)
tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
tracker_dets = [
raw_data["tracker_dets"][t][ind]
for ind in range(len(tracker_class_mask))
if tracker_class_mask[ind]
]
similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
:, tracker_class_mask
]
data["tracker_ids"][t] = tracker_ids
data["tracker_dets"][t] = tracker_dets
data["gt_ids"][t] = gt_ids
data["gt_dets"][t] = gt_dets
data["similarity_scores"][t] = similarity_scores
unique_gt_ids += list(np.unique(data["gt_ids"][t]))
unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
num_tracker_dets += len(data["tracker_ids"][t])
num_gt_dets += len(data["gt_ids"][t])
# Re-label IDs such that there are no empty IDs
if len(unique_gt_ids) > 0:
unique_gt_ids = np.unique(unique_gt_ids)
gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
for t in range(raw_data["num_timesteps"]):
if len(data["gt_ids"][t]) > 0:
data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
if len(unique_tracker_ids) > 0:
unique_tracker_ids = np.unique(unique_tracker_ids)
tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
for t in range(raw_data["num_timesteps"]):
if len(data["tracker_ids"][t]) > 0:
data["tracker_ids"][t] = tracker_id_map[
data["tracker_ids"][t]
].astype(int)
# Ensure that ids are unique per timestep.
self._check_unique_ids(data)
# Record overview statistics.
data["num_tracker_dets"] = num_tracker_dets
data["num_gt_dets"] = num_gt_dets
data["num_tracker_ids"] = len(unique_tracker_ids)
data["num_gt_ids"] = len(unique_gt_ids)
data["num_timesteps"] = raw_data["num_timesteps"]
data["seq"] = raw_data["seq"]
# get track representations
data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
data["gt_track_iscrowd"] = raw_data["classes_to_gt_track_iscrowd"][cls_id]
data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
data["iou_type"] = "mask"
# sort tracker data tracks by tracker confidence scores
if data["dt_tracks"]:
idx = np.argsort(
[-score for score in data["dt_track_scores"]], kind="mergesort"
)
data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
return data
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
if self.iou_type == "segm":
similarity_scores = self._calculate_mask_ious(
gt_dets_t, tracker_dets_t, is_encoded=True, do_ioa=False
)
else:
gt_dets_t = np.array(gt_dets_t, dtype=np.float32).reshape(-1, 4)
tracker_dets_t = np.array(tracker_dets_t, dtype=np.float32).reshape(-1, 4)
similarity_scores = self._calculate_box_ious(
gt_dets_t, tracker_dets_t, box_format="xywh", do_ioa=False
)
return similarity_scores
def _prepare_gt_annotations(self):
"""
Prepares GT data by rle encoding segmentations and computing the average track area.
:return: None
"""
if self.iou_type == "segm":
# only loaded when needed to reduce minimum requirements
from pycocotools import mask as mask_utils
for track in self.gt_data["annotations"]:
h = track["height"]
w = track["width"]
for i, seg in enumerate(track["segmentations"]):
if seg is not None and isinstance(seg["counts"], list):
track["segmentations"][i] = mask_utils.frPyObjects(seg, h, w)
areas = [a for a in track["areas"] if a]
if len(areas) == 0:
track["area"] = 0
else:
track["area"] = np.array(areas).mean()
else:
for track in self.gt_data["annotations"]:
# For bbox eval, compute areas from bboxes if not already available
areas = [a for a in track.get("areas", []) if a]
if not areas:
areas = []
for bbox in track.get("bboxes", []):
if bbox is not None:
areas.append(bbox[2] * bbox[3])
track["area"] = np.array(areas).mean() if areas else 0
def _get_tracker_seq_tracks(self, tracker, seq_id):
"""
Prepares tracker data for a given sequence. Extracts all annotations for given sequence ID, computes
average track area and assigns a track ID.
:param tracker: the given tracker
:param seq_id: the sequence ID
:return: the extracted tracks
"""
# only loaded when needed to reduce minimum requirements
from pycocotools import mask as mask_utils
tracks = [
ann for ann in self.tracker_data[tracker] if ann["video_id"] == seq_id
]
for track in tracks:
if "areas" not in track:
if self.iou_type == "segm":
for seg in track["segmentations"]:
if seg:
track["areas"].append(mask_utils.area(seg))
else:
track["areas"].append(None)
else:
for bbox in track["bboxes"]:
if bbox:
track["areas"].append(bbox[2] * bbox[3])
else:
track["areas"].append(None)
areas = [a for a in track["areas"] if a]
if len(areas) == 0:
track["area"] = 0
else:
track["area"] = np.array(areas).mean()
track["id"] = self.global_tid_counter
self.global_tid_counter += 1
return tracks
    def get_name(self):
        """Return this dataset's name (used to key evaluation results)."""
        return self.dataset_name

View File

@@ -0,0 +1,395 @@
# flake8: noqa
import os
import time
import traceback
from functools import partial
from multiprocessing.pool import Pool
import numpy as np
from . import _timing, utils
from .metrics import Count
from .utils import TrackEvalException
try:
import tqdm
TQDM_IMPORTED = True
except ImportError as _:
TQDM_IMPORTED = False
class Evaluator:
    """Evaluator class for evaluating different metrics for different datasets"""
    @staticmethod
    def get_default_eval_config():
        """Returns the default config values for evaluation"""
        code_path = utils.get_code_path()
        default_config = {
            "USE_PARALLEL": False,
            "NUM_PARALLEL_CORES": 8,
            "BREAK_ON_ERROR": True,  # Raises exception and exits with error
            "RETURN_ON_ERROR": False,  # if not BREAK_ON_ERROR, then returns from function on error
            "LOG_ON_ERROR": os.path.join(
                code_path, "error_log.txt"
            ),  # if not None, save any errors into a log file.
            "PRINT_RESULTS": True,
            "PRINT_ONLY_COMBINED": False,
            "PRINT_CONFIG": True,
            "TIME_PROGRESS": True,
            "DISPLAY_LESS_PROGRESS": True,
            "OUTPUT_SUMMARY": True,
            "OUTPUT_EMPTY_CLASSES": True,  # If False, summary files are not output for classes with no detections
            "OUTPUT_DETAILED": True,
            "PLOT_CURVES": True,
        }
        return default_config
    def __init__(self, config=None):
        """Initialise the evaluator with a config file"""
        self.config = utils.init_config(config, self.get_default_eval_config(), "Eval")
        # Only run timing analysis if not run in parallel.
        if self.config["TIME_PROGRESS"] and not self.config["USE_PARALLEL"]:
            _timing.DO_TIMING = True
            if self.config["DISPLAY_LESS_PROGRESS"]:
                _timing.DISPLAY_LESS_PROGRESS = True
    def _combine_results(
        self,
        res,
        metrics_list,
        metric_names,
        dataset,
        res_field="COMBINED_SEQ",
        target_tag=None,
    ):
        """Aggregate per-sequence results into ``res[res_field]``.

        Combines over sequences for every class, then (when the dataset
        requests it) over classes and super-categories. If ``target_tag``
        is given, only sequences whose GT annotations all carry that tag
        contribute to the combination.

        :param res: nested results dict indexed res[seq][class][metric_name]
        :param metrics_list: metric objects used for combination
        :param metric_names: names parallel to ``metrics_list``
        :param dataset: dataset providing eval info and GT data
        :param res_field: key under which combined results are stored
        :param target_tag: optional tag restricting which sequences combine
        :return: (updated ``res``, list of combined-class keys added)
        """
        assert res_field.startswith("COMBINED_SEQ")
        # collecting combined cls keys (cls averaged, det averaged, super classes)
        tracker_list, seq_list, class_list = dataset.get_eval_info()
        combined_cls_keys = []
        res[res_field] = {}
        # narrow the target for evaluation
        if target_tag is not None:
            target_video_ids = [
                annot["video_id"]
                for annot in dataset.gt_data["annotations"]
                if target_tag in annot["tags"]
            ]
            # Map video id -> sequence name (first path component of frame 0).
            vid2name = {
                video["id"]: video["file_names"][0].split("/")[0]
                for video in dataset.gt_data["videos"]
            }
            target_video_ids = set(target_video_ids)
            target_video = [vid2name[video_id] for video_id in target_video_ids]
            if len(target_video) == 0:
                raise TrackEvalException(
                    "No sequences found with the tag %s" % target_tag
                )
            target_annotations = [
                annot
                for annot in dataset.gt_data["annotations"]
                if annot["video_id"] in target_video_ids
            ]
            assert all(target_tag in annot["tags"] for annot in target_annotations), (
                f"Not all annotations in the target sequences have the target tag {target_tag}. "
                "We currently only support a target tag at the sequence level, not at the annotation level."
            )
        else:
            target_video = seq_list
        # combine sequences for each class
        for c_cls in class_list:
            res[res_field][c_cls] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                curr_res = {
                    seq_key: seq_value[c_cls][metric_name]
                    for seq_key, seq_value in res.items()
                    if not seq_key.startswith("COMBINED_SEQ")
                    and seq_key in target_video
                }
                res[res_field][c_cls][metric_name] = metric.combine_sequences(curr_res)
        # combine classes
        if dataset.should_classes_combine:
            combined_cls_keys += [
                "cls_comb_cls_av",
                "cls_comb_det_av",
                "all",
            ]
            res[res_field]["cls_comb_cls_av"] = {}
            res[res_field]["cls_comb_det_av"] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                cls_res = {
                    cls_key: cls_value[metric_name]
                    for cls_key, cls_value in res[res_field].items()
                    if cls_key not in combined_cls_keys
                }
                res[res_field]["cls_comb_cls_av"][metric_name] = (
                    metric.combine_classes_class_averaged(cls_res)
                )
                res[res_field]["cls_comb_det_av"][metric_name] = (
                    metric.combine_classes_det_averaged(cls_res)
                )
        # combine classes to super classes
        if dataset.use_super_categories:
            for cat, sub_cats in dataset.super_categories.items():
                combined_cls_keys.append(cat)
                res[res_field][cat] = {}
                for metric, metric_name in zip(metrics_list, metric_names):
                    cat_res = {
                        cls_key: cls_value[metric_name]
                        for cls_key, cls_value in res[res_field].items()
                        if cls_key in sub_cats
                    }
                    res[res_field][cat][metric_name] = (
                        metric.combine_classes_det_averaged(cat_res)
                    )
        return res, combined_cls_keys
    def _summarize_results(
        self,
        res,
        tracker,
        metrics_list,
        metric_names,
        dataset,
        res_field,
        combined_cls_keys,
    ):
        """Print and write out results for one tracker.

        Depending on the config, prints per-class tables, writes summary
        and detailed files, and renders plots, for every class key present
        under ``res[res_field]`` (real classes plus combined-class keys).
        """
        config = self.config
        output_fol = dataset.get_output_fol(tracker)
        tracker_display_name = dataset.get_display_name(tracker)
        for c_cls in res[
            res_field
        ].keys():  # class_list + combined classes if calculated
            summaries = []
            details = []
            num_dets = res[res_field][c_cls]["Count"]["Dets"]
            if config["OUTPUT_EMPTY_CLASSES"] or num_dets > 0:
                for metric, metric_name in zip(metrics_list, metric_names):
                    # for combined classes there is no per sequence evaluation
                    if c_cls in combined_cls_keys:
                        table_res = {res_field: res[res_field][c_cls][metric_name]}
                    else:
                        table_res = {
                            seq_key: seq_value[c_cls][metric_name]
                            for seq_key, seq_value in res.items()
                        }
                    if config["PRINT_RESULTS"] and config["PRINT_ONLY_COMBINED"]:
                        # Suppress per-class tables when class-combined
                        # results exist and this is not a combined key.
                        dont_print = (
                            dataset.should_classes_combine
                            and c_cls not in combined_cls_keys
                        )
                        if not dont_print:
                            metric.print_table(
                                {res_field: table_res[res_field]},
                                tracker_display_name,
                                c_cls,
                                res_field,
                                res_field,
                            )
                    elif config["PRINT_RESULTS"]:
                        metric.print_table(
                            table_res, tracker_display_name, c_cls, res_field, res_field
                        )
                    if config["OUTPUT_SUMMARY"]:
                        summaries.append(metric.summary_results(table_res))
                    if config["OUTPUT_DETAILED"]:
                        details.append(metric.detailed_results(table_res))
                    if config["PLOT_CURVES"]:
                        metric.plot_single_tracker_results(
                            table_res,
                            tracker_display_name,
                            c_cls,
                            output_fol,
                        )
                if config["OUTPUT_SUMMARY"]:
                    utils.write_summary_results(summaries, c_cls, output_fol)
                if config["OUTPUT_DETAILED"]:
                    utils.write_detailed_results(details, c_cls, output_fol)
    @_timing.time
    def evaluate(self, dataset_list, metrics_list, show_progressbar=False):
        """Evaluate a set of metrics on a set of datasets"""
        config = self.config
        metrics_list = metrics_list + [Count()]  # Count metrics are always run
        metric_names = utils.validate_metrics_list(metrics_list)
        dataset_names = [dataset.get_name() for dataset in dataset_list]
        output_res = {}
        output_msg = {}
        for dataset, dataset_name in zip(dataset_list, dataset_names):
            # Get dataset info about what to evaluate
            output_res[dataset_name] = {}
            output_msg[dataset_name] = {}
            tracker_list, seq_list, class_list = dataset.get_eval_info()
            print(
                "\nEvaluating %i tracker(s) on %i sequence(s) for %i class(es) on %s dataset using the following "
                "metrics: %s\n"
                % (
                    len(tracker_list),
                    len(seq_list),
                    len(class_list),
                    dataset_name,
                    ", ".join(metric_names),
                )
            )
            # Evaluate each tracker
            for tracker in tracker_list:
                # if not config['BREAK_ON_ERROR'] then go to next tracker without breaking
                try:
                    # Evaluate each sequence in parallel or in series.
                    # returns a nested dict (res), indexed like: res[seq][class][metric_name][sub_metric field]
                    # e.g. res[seq_0001][pedestrian][hota][DetA]
                    print("\nEvaluating %s\n" % tracker)
                    time_start = time.time()
                    if config["USE_PARALLEL"]:
                        if show_progressbar and TQDM_IMPORTED:
                            seq_list_sorted = sorted(seq_list)
                            with Pool(config["NUM_PARALLEL_CORES"]) as pool, tqdm.tqdm(
                                total=len(seq_list)
                            ) as pbar:
                                _eval_sequence = partial(
                                    eval_sequence,
                                    dataset=dataset,
                                    tracker=tracker,
                                    class_list=class_list,
                                    metrics_list=metrics_list,
                                    metric_names=metric_names,
                                )
                                results = []
                                # imap preserves input order, so results align
                                # with seq_list_sorted for the zip below.
                                for r in pool.imap(
                                    _eval_sequence, seq_list_sorted, chunksize=20
                                ):
                                    results.append(r)
                                    pbar.update()
                                res = dict(zip(seq_list_sorted, results))
                        else:
                            with Pool(config["NUM_PARALLEL_CORES"]) as pool:
                                _eval_sequence = partial(
                                    eval_sequence,
                                    dataset=dataset,
                                    tracker=tracker,
                                    class_list=class_list,
                                    metrics_list=metrics_list,
                                    metric_names=metric_names,
                                )
                                results = pool.map(_eval_sequence, seq_list)
                                res = dict(zip(seq_list, results))
                    else:
                        res = {}
                        if show_progressbar and TQDM_IMPORTED:
                            seq_list_sorted = sorted(seq_list)
                            for curr_seq in tqdm.tqdm(seq_list_sorted):
                                res[curr_seq] = eval_sequence(
                                    curr_seq,
                                    dataset,
                                    tracker,
                                    class_list,
                                    metrics_list,
                                    metric_names,
                                )
                        else:
                            for curr_seq in sorted(seq_list):
                                res[curr_seq] = eval_sequence(
                                    curr_seq,
                                    dataset,
                                    tracker,
                                    class_list,
                                    metrics_list,
                                    metric_names,
                                )
                    # Combine results over all sequences and then over all classes
                    res, combined_cls_keys = self._combine_results(
                        res, metrics_list, metric_names, dataset, "COMBINED_SEQ"
                    )
                    if np.all(
                        ["tags" in annot for annot in dataset.gt_data["annotations"]]
                    ):
                        # Combine results over the challenging sequences and then over all classes
                        # currently only support "tracking_challenging_pair"
                        res, _ = self._combine_results(
                            res,
                            metrics_list,
                            metric_names,
                            dataset,
                            "COMBINED_SEQ_CHALLENGING",
                            "tracking_challenging_pair",
                        )
                    # Print and output results in various formats
                    if config["TIME_PROGRESS"]:
                        print(
                            "\nAll sequences for %s finished in %.2f seconds"
                            % (tracker, time.time() - time_start)
                        )
                    self._summarize_results(
                        res,
                        tracker,
                        metrics_list,
                        metric_names,
                        dataset,
                        "COMBINED_SEQ",
                        combined_cls_keys,
                    )
                    if "COMBINED_SEQ_CHALLENGING" in res:
                        self._summarize_results(
                            res,
                            tracker,
                            metrics_list,
                            metric_names,
                            dataset,
                            "COMBINED_SEQ_CHALLENGING",
                            combined_cls_keys,
                        )
                    # Output for returning from function
                    output_res[dataset_name][tracker] = res
                    output_msg[dataset_name][tracker] = "Success"
                except Exception as err:
                    output_res[dataset_name][tracker] = None
                    # NOTE(review): exact type check — subclasses of
                    # TrackEvalException would be reported as "Unknown error".
                    # Confirm whether isinstance() was intended.
                    if type(err) == TrackEvalException:
                        output_msg[dataset_name][tracker] = str(err)
                    else:
                        output_msg[dataset_name][tracker] = "Unknown error occurred."
                    print("Tracker %s was unable to be evaluated." % tracker)
                    print(err)
                    traceback.print_exc()
                    if config["LOG_ON_ERROR"] is not None:
                        with open(config["LOG_ON_ERROR"], "a") as f:
                            print(dataset_name, file=f)
                            print(tracker, file=f)
                            print(traceback.format_exc(), file=f)
                            print("\n\n\n", file=f)
                    if config["BREAK_ON_ERROR"]:
                        raise err
                    elif config["RETURN_ON_ERROR"]:
                        return output_res, output_msg
        return output_res, output_msg
@_timing.time
def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
    """Evaluate every requested metric on a single sequence.

    Returns a nested dict indexed results[class][metric_name].
    """
    raw_data = dataset.get_raw_seq_data(tracker, seq)
    results = {}
    for cls in class_list:
        # Preprocess once per class, then score with every metric.
        cls_data = dataset.get_preprocessed_seq_data(raw_data, cls)
        results[cls] = {
            met_name: metric.eval_sequence(cls_data)
            for metric, met_name in zip(metrics_list, metric_names)
        }
    return results

View File

@@ -0,0 +1,4 @@
# flake8: noqa
from .count import Count
from .hota import HOTA

View File

@@ -0,0 +1,145 @@
# flake8: noqa
from abc import ABC, abstractmethod
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseMetric(ABC):
    """Abstract base for all metric classes.

    Subclasses declare which fields they produce (integer/float, scalar/array)
    and implement per-sequence evaluation plus the three combination modes.
    The helpers below render those fields as tables, summaries and detailed
    per-sequence breakdowns.
    """
    @abstractmethod
    def __init__(self):
        # Field bookkeeping filled in by subclasses:
        self.plottable = False  # whether plot_single_tracker_results is valid
        self.integer_fields = []  # scalar int outputs
        self.float_fields = []  # scalar float outputs
        self.float_array_fields = []  # per-threshold float arrays
        self.array_labels = []  # labels (e.g. alpha thresholds) for array fields
        self.integer_array_fields = []  # per-threshold int arrays
        self.fields = []  # all produced field names
        self.summary_fields = []  # subset shown in summary tables
        self.registered = False
    #####################################################################
    # Abstract functions for subclasses to implement
    @_timing.time
    @abstractmethod
    def eval_sequence(self, data): ...
    @abstractmethod
    def combine_sequences(self, all_res): ...
    @abstractmethod
    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): ...
    @abstractmethod
    def combine_classes_det_averaged(self, all_res): ...
    # NOTE(review): parameter order here is (output_folder, cls) but callers
    # in the evaluator pass (cls, output_folder); harmless because this base
    # implementation ignores both — confirm before adding a plottable metric
    # that relies on the base signature.
    def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
        """Plot results of metrics, only valid for metrics with self.plottable"""
        if self.plottable:
            raise NotImplementedError(
                "plot_results is not implemented for metric %s" % self.get_name()
            )
        else:
            pass
    #####################################################################
    # Helper functions which are useful for all metrics:
    @classmethod
    def get_name(cls):
        """Return the metric's display name (its class name)."""
        return cls.__name__
    @staticmethod
    def _combine_sum(all_res, field):
        """Combine sequence results via sum"""
        return sum([all_res[k][field] for k in all_res.keys()])
    @staticmethod
    def _combine_weighted_av(all_res, field, comb_res, weight_field):
        """Combine sequence results via weighted average"""
        # comb_res must already contain the summed weight_field.
        return sum(
            [all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]
        ) / np.maximum(1.0, comb_res[weight_field])
    # NOTE(review): "output_lable" is a typo for "output_label"; kept as-is
    # since renaming would break keyword callers.
    def print_table(
        self, table_res, tracker, cls, res_field="COMBINED_SEQ", output_lable="COMBINED"
    ):
        """Prints table of results for all sequences"""
        print("")
        metric_name = self.get_name()
        self._row_print(
            [metric_name + ": " + tracker + "-" + cls] + self.summary_fields
        )
        for seq, results in sorted(table_res.items()):
            if seq.startswith("COMBINED_SEQ"):
                continue
            summary_res = self._summary_row(results)
            self._row_print([seq] + summary_res)
        # Combined row printed last, under the given label.
        summary_res = self._summary_row(table_res[res_field])
        self._row_print([output_lable] + summary_res)
    def _summary_row(self, results_):
        """Format one result dict into summary-table cell strings.

        Array fields are averaged over thresholds; floats are shown as
        percentages.
        """
        vals = []
        for h in self.summary_fields:
            if h in self.float_array_fields:
                vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
            elif h in self.float_fields:
                vals.append("{0:1.5g}".format(100 * float(results_[h])))
            elif h in self.integer_fields:
                vals.append("{0:d}".format(int(results_[h])))
            else:
                raise NotImplementedError(
                    "Summary function not implemented for this field type."
                )
        return vals
    @staticmethod
    def _row_print(*argv):
        """Prints results in evenly spaced rows, with more space in first column"""
        if len(argv) == 1:
            argv = argv[0]
        to_print = "%-35s" % argv[0]
        for v in argv[1:]:
            to_print += "%-10s" % str(v)
        print(to_print)
    def summary_results(self, table_res):
        """Returns a simple summary of final results for a tracker"""
        return dict(
            zip(self.summary_fields, self._summary_row(table_res["COMBINED_SEQ"]))
        )
    def detailed_results(self, table_res):
        """Returns detailed final results for a tracker"""
        # Get detailed field information
        detailed_fields = self.float_fields + self.integer_fields
        for h in self.float_array_fields + self.integer_array_fields:
            # One column per threshold, plus an area-under-curve column.
            for alpha in [int(100 * x) for x in self.array_labels]:
                detailed_fields.append(h + "___" + str(alpha))
            detailed_fields.append(h + "___AUC")
        # Get detailed results
        detailed_results = {}
        for seq, res in table_res.items():
            detailed_row = self._detailed_row(res)
            if len(detailed_row) != len(detailed_fields):
                raise TrackEvalException(
                    "Field names and data have different sizes (%i and %i)"
                    % (len(detailed_row), len(detailed_fields))
                )
            detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
        return detailed_results
    def _detailed_row(self, res):
        """Flatten one result dict into the row layout of detailed_results."""
        detailed_row = []
        for h in self.float_fields + self.integer_fields:
            detailed_row.append(res[h])
        for h in self.float_array_fields + self.integer_array_fields:
            for i, alpha in enumerate([int(100 * x) for x in self.array_labels]):
                detailed_row.append(res[h][i])
            # AUC column: mean over thresholds.
            detailed_row.append(np.mean(res[h]))
        return detailed_row

View File

@@ -0,0 +1,48 @@
# flake8: noqa
from .. import _timing
from ._base_metric import _BaseMetric
class Count(_BaseMetric):
    """Class which simply counts the number of tracker and gt detections and ids."""

    def __init__(self, config=None):
        super().__init__()
        # All Count outputs are plain integer totals.
        self.integer_fields = ["Dets", "GT_Dets", "IDs", "GT_IDs"]
        self.fields = self.integer_fields
        self.summary_fields = self.fields

    @_timing.time
    def eval_sequence(self, data):
        """Returns counts for one sequence"""
        return {
            "Dets": data["num_tracker_dets"],
            "GT_Dets": data["num_gt_dets"],
            "IDs": data["num_tracker_ids"],
            "GT_IDs": data["num_gt_ids"],
            "Frames": data["num_timesteps"],
        }

    def combine_sequences(self, all_res):
        """Combines metrics across all sequences (counts simply add up)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=None):
        """Combines metrics across all classes (counts are summed, not averaged)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

    def combine_classes_det_averaged(self, all_res):
        """Combines metrics across all classes (counts are summed)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

View File

@@ -0,0 +1,291 @@
# flake8: noqa
import os
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing
from ._base_metric import _BaseMetric
class HOTA(_BaseMetric):
    """Class which implements the HOTA metrics.
    See: https://link.springer.com/article/10.1007/s11263-020-01375-2
    """
    def __init__(self, config=None):
        super().__init__()
        self.plottable = True
        # Localization (IoU) thresholds alpha: 0.05, 0.10, ..., 0.95.
        self.array_labels = np.arange(0.05, 0.99, 0.05)
        # Raw per-threshold counts.
        self.integer_array_fields = ["HOTA_TP", "HOTA_FN", "HOTA_FP"]
        # Derived per-threshold scores.
        self.float_array_fields = [
            "HOTA",
            "DetA",
            "AssA",
            "DetRe",
            "DetPr",
            "AssRe",
            "AssPr",
            "LocA",
            "OWTA",
        ]
        # Scalar scores reported at the lowest threshold (alpha = 0.05).
        self.float_fields = ["HOTA(0)", "LocA(0)", "HOTALocA(0)"]
        self.fields = (
            self.float_array_fields + self.integer_array_fields + self.float_fields
        )
        self.summary_fields = self.float_array_fields + self.float_fields
    @_timing.time
    def eval_sequence(self, data):
        """Calculates the HOTA metrics for one sequence"""
        # Initialise results
        res = {}
        for field in self.float_array_fields + self.integer_array_fields:
            res[field] = np.zeros((len(self.array_labels)), dtype=float)
        for field in self.float_fields:
            res[field] = 0
        # Return result quickly if tracker or gt sequence is empty
        if data["num_tracker_dets"] == 0:
            res["HOTA_FN"] = data["num_gt_dets"] * np.ones(
                (len(self.array_labels)), dtype=float
            )
            res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
            res["LocA(0)"] = 1.0
            return res
        if data["num_gt_dets"] == 0:
            res["HOTA_FP"] = data["num_tracker_dets"] * np.ones(
                (len(self.array_labels)), dtype=float
            )
            res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
            res["LocA(0)"] = 1.0
            return res
        # Variables counting global association
        potential_matches_count = np.zeros(
            (data["num_gt_ids"], data["num_tracker_ids"])
        )
        gt_id_count = np.zeros((data["num_gt_ids"], 1))
        tracker_id_count = np.zeros((1, data["num_tracker_ids"]))
        # First loop through each timestep and accumulate global track information.
        for t, (gt_ids_t, tracker_ids_t) in enumerate(
            zip(data["gt_ids"], data["tracker_ids"])
        ):
            # Count the potential matches between ids in each timestep
            # These are normalised, weighted by the match similarity.
            similarity = data["similarity_scores"][t]
            sim_iou_denom = (
                similarity.sum(0)[np.newaxis, :]
                + similarity.sum(1)[:, np.newaxis]
                - similarity
            )
            sim_iou = np.zeros_like(similarity)
            sim_iou_mask = sim_iou_denom > 0 + np.finfo("float").eps
            sim_iou[sim_iou_mask] = (
                similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]
            )
            potential_matches_count[
                gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
            ] += sim_iou
            # Calculate the total number of dets for each gt_id and tracker_id.
            gt_id_count[gt_ids_t] += 1
            tracker_id_count[0, tracker_ids_t] += 1
        # Calculate overall jaccard alignment score (before unique matching) between IDs
        global_alignment_score = potential_matches_count / (
            gt_id_count + tracker_id_count - potential_matches_count
        )
        # Per-threshold (gt_id, tracker_id) match counts, filled below.
        matches_counts = [
            np.zeros_like(potential_matches_count) for _ in self.array_labels
        ]
        # Calculate scores for each timestep
        for t, (gt_ids_t, tracker_ids_t) in enumerate(
            zip(data["gt_ids"], data["tracker_ids"])
        ):
            # Deal with the case that there are no gt_det/tracker_det in a timestep.
            if len(gt_ids_t) == 0:
                for a, alpha in enumerate(self.array_labels):
                    res["HOTA_FP"][a] += len(tracker_ids_t)
                continue
            if len(tracker_ids_t) == 0:
                for a, alpha in enumerate(self.array_labels):
                    res["HOTA_FN"][a] += len(gt_ids_t)
                continue
            # Get matching scores between pairs of dets for optimizing HOTA
            # (detection similarity weighted by global track alignment).
            similarity = data["similarity_scores"][t]
            score_mat = (
                global_alignment_score[
                    gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
                ]
                * similarity
            )
            # Hungarian algorithm to find best matches
            match_rows, match_cols = linear_sum_assignment(-score_mat)
            # Calculate and accumulate basic statistics
            for a, alpha in enumerate(self.array_labels):
                # Only matches whose raw similarity clears alpha count as TP.
                actually_matched_mask = (
                    similarity[match_rows, match_cols] >= alpha - np.finfo("float").eps
                )
                alpha_match_rows = match_rows[actually_matched_mask]
                alpha_match_cols = match_cols[actually_matched_mask]
                num_matches = len(alpha_match_rows)
                res["HOTA_TP"][a] += num_matches
                res["HOTA_FN"][a] += len(gt_ids_t) - num_matches
                res["HOTA_FP"][a] += len(tracker_ids_t) - num_matches
                if num_matches > 0:
                    res["LocA"][a] += sum(
                        similarity[alpha_match_rows, alpha_match_cols]
                    )
                    matches_counts[a][
                        gt_ids_t[alpha_match_rows], tracker_ids_t[alpha_match_cols]
                    ] += 1
        # Calculate association scores (AssA, AssRe, AssPr) for the alpha value.
        # First calculate scores per gt_id/tracker_id combo and then average over the number of detections.
        for a, alpha in enumerate(self.array_labels):
            matches_count = matches_counts[a]
            ass_a = matches_count / np.maximum(
                1, gt_id_count + tracker_id_count - matches_count
            )
            res["AssA"][a] = np.sum(matches_count * ass_a) / np.maximum(
                1, res["HOTA_TP"][a]
            )
            ass_re = matches_count / np.maximum(1, gt_id_count)
            res["AssRe"][a] = np.sum(matches_count * ass_re) / np.maximum(
                1, res["HOTA_TP"][a]
            )
            ass_pr = matches_count / np.maximum(1, tracker_id_count)
            res["AssPr"][a] = np.sum(matches_count * ass_pr) / np.maximum(
                1, res["HOTA_TP"][a]
            )
        # Calculate final scores
        # LocA is the mean similarity over TPs (guarded against zero TPs).
        res["LocA"] = np.maximum(1e-10, res["LocA"]) / np.maximum(1e-10, res["HOTA_TP"])
        res = self._compute_final_fields(res)
        return res
    def combine_sequences(self, all_res):
        """Combines metrics across all sequences"""
        res = {}
        for field in self.integer_array_fields:
            res[field] = self._combine_sum(all_res, field)
        # Association scores are TP-weighted averages across sequences.
        for field in ["AssRe", "AssPr", "AssA"]:
            res[field] = self._combine_weighted_av(
                all_res, field, res, weight_field="HOTA_TP"
            )
        loca_weighted_sum = sum(
            [all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
        )
        res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
            1e-10, res["HOTA_TP"]
        )
        res = self._compute_final_fields(res)
        return res
    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
        """Combines metrics across all classes by averaging over the class values.
        If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
        """
        res = {}
        for field in self.integer_array_fields:
            if ignore_empty_classes:
                res[field] = self._combine_sum(
                    {
                        k: v
                        for k, v in all_res.items()
                        if (
                            v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
                            > 0 + np.finfo("float").eps
                        ).any()
                    },
                    field,
                )
            else:
                res[field] = self._combine_sum(
                    {k: v for k, v in all_res.items()}, field
                )
        for field in self.float_fields + self.float_array_fields:
            if ignore_empty_classes:
                res[field] = np.mean(
                    [
                        v[field]
                        for v in all_res.values()
                        if (
                            v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
                            > 0 + np.finfo("float").eps
                        ).any()
                    ],
                    axis=0,
                )
            else:
                res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
        return res
    def combine_classes_det_averaged(self, all_res):
        """Combines metrics across all classes by averaging over the detection values"""
        res = {}
        for field in self.integer_array_fields:
            res[field] = self._combine_sum(all_res, field)
        for field in ["AssRe", "AssPr", "AssA"]:
            res[field] = self._combine_weighted_av(
                all_res, field, res, weight_field="HOTA_TP"
            )
        loca_weighted_sum = sum(
            [all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
        )
        res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
            1e-10, res["HOTA_TP"]
        )
        res = self._compute_final_fields(res)
        return res
    @staticmethod
    def _compute_final_fields(res):
        """Calculate sub-metric ('field') values which only depend on other sub-metric values.
        This function is used both for both per-sequence calculation, and in combining values across sequences.
        """
        res["DetRe"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FN"])
        res["DetPr"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FP"])
        res["DetA"] = res["HOTA_TP"] / np.maximum(
            1, res["HOTA_TP"] + res["HOTA_FN"] + res["HOTA_FP"]
        )
        res["HOTA"] = np.sqrt(res["DetA"] * res["AssA"])
        res["OWTA"] = np.sqrt(res["DetRe"] * res["AssA"])
        # "(0)" fields report the first (lowest-alpha) threshold.
        res["HOTA(0)"] = res["HOTA"][0]
        res["LocA(0)"] = res["LocA"][0]
        res["HOTALocA(0)"] = res["HOTA(0)"] * res["LocA(0)"]
        return res
    def plot_single_tracker_results(self, table_res, tracker, cls, output_folder):
        """Create plot of results"""
        # Only loaded when run to reduce minimum requirements
        from matplotlib import pyplot as plt
        res = table_res["COMBINED_SEQ"]
        styles_to_plot = ["r", "b", "g", "b--", "b:", "g--", "g:", "m"]
        for name, style in zip(self.float_array_fields, styles_to_plot):
            plt.plot(self.array_labels, res[name], style)
        plt.xlabel("alpha")
        plt.ylabel("score")
        plt.title(tracker + " - " + cls)
        plt.axis([0, 1, 0, 1])
        legend = []
        for name in self.float_array_fields:
            legend += [name + " (" + str(np.round(np.mean(res[name]), 2)) + ")"]
        plt.legend(legend, loc="lower left")
        out_file = os.path.join(output_folder, cls + "_plot.pdf")
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        plt.savefig(out_file)
        plt.savefig(out_file.replace(".pdf", ".png"))
        plt.clf()

View File

@@ -0,0 +1,195 @@
# flake8: noqa
import argparse
import csv
import os
from collections import OrderedDict
def init_config(config, default_config, name=None):
    """Initialise non-given config values with defaults"""
    if config is None:
        config = default_config
    else:
        # Fill in any missing keys from the defaults (user-given values win).
        for key, default_value in default_config.items():
            config.setdefault(key, default_value)
    if name and config["PRINT_CONFIG"]:
        print("\n%s Config:" % name)
        for key, value in config.items():
            print("%-20s : %-30s" % (key, value))
    return config
def update_config(config):
    """
    Parse the arguments of a script and updates the config values for a given value if specified in the arguments.
    :param config: the config to update
    :return: the updated config
    :raises Exception: if a boolean option is given a value other than "True"/"False"
    """
    parser = argparse.ArgumentParser()
    for setting, value in config.items():
        # List-valued (and unset) options may take several values on the CLI.
        if isinstance(value, list) or value is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args().__dict__
    for setting, raw in args.items():
        if raw is None:
            # Option not given on the command line; keep the config value.
            continue
        current = config[setting]
        # Coerce the CLI string to the type of the existing config value.
        # bool is checked before int since isinstance(True, int) is True.
        if isinstance(current, bool):
            if raw == "True":
                parsed = True
            elif raw == "False":
                parsed = False
            else:
                # Fixed missing space in the original error message.
                raise Exception(
                    "Command line parameter " + setting + " must be True or False"
                )
        elif isinstance(current, int):
            parsed = int(raw)
        else:
            parsed = raw
        config[setting] = parsed
    return config
def get_code_path():
    """Get base path where code is"""
    # Parent directory of the package that contains this module.
    here = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(here, os.pardir))
def validate_metrics_list(metrics_list):
    """Get names of metric class and ensures they are unique, further checks that the fields within each metric class
    do not have overlapping names.
    """
    metric_names = [metric.get_name() for metric in metrics_list]
    # Duplicate metric classes would collide in the results dicts.
    if len(set(metric_names)) != len(metric_names):
        raise TrackEvalException(
            "Code being run with multiple metrics of the same name"
        )
    all_fields = [field for metric in metrics_list for field in metric.fields]
    # Field names must be globally unique across metric families.
    if len(set(all_fields)) != len(all_fields):
        raise TrackEvalException(
            "Code being run with multiple metrics with fields of the same name"
        )
    return metric_names
def write_summary_results(summaries, cls, output_folder):
    """Write summary results to file"""
    # Merge all metric summaries into one mapping (later metrics may
    # overwrite earlier values for the same field, as before).
    merged = {}
    for summary in summaries:
        merged.update(summary)
    # In order to remain consistent upon new fields being added, any of the
    # following fields that are present are output first, in this order. Any
    # further fields follow in the order each metric family inserted them.
    default_order = [
        "HOTA",
        "DetA",
        "AssA",
        "DetRe",
        "DetPr",
        "AssRe",
        "AssPr",
        "LocA",
        "OWTA",
        "HOTA(0)",
        "LocA(0)",
        "HOTALocA(0)",
        "MOTA",
        "MOTP",
        "MODA",
        "CLR_Re",
        "CLR_Pr",
        "MTR",
        "PTR",
        "MLR",
        "CLR_TP",
        "CLR_FN",
        "CLR_FP",
        "IDSW",
        "MT",
        "PT",
        "ML",
        "Frag",
        "sMOTA",
        "IDF1",
        "IDR",
        "IDP",
        "IDTP",
        "IDFN",
        "IDFP",
        "Dets",
        "GT_Dets",
        "IDs",
        "GT_IDs",
    ]
    ordered = {field: merged.pop(field) for field in default_order if field in merged}
    # Remaining (non-default) fields keep their insertion order.
    ordered.update(merged)
    out_file = os.path.join(output_folder, cls + "_summary.txt")
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    with open(out_file, "w", newline="") as f:
        writer = csv.writer(f, delimiter=" ")
        writer.writerow(list(ordered.keys()))
        writer.writerow(list(ordered.values()))
def write_detailed_results(details, cls, output_folder):
    """Write detailed results to file"""
    sequences = details[0].keys()
    # Header row: "seq" followed by every metric family's detailed fields.
    fields = ["seq"]
    for detail in details:
        fields.extend(detail["COMBINED_SEQ"].keys())
    out_file = os.path.join(output_folder, cls + "_detailed.csv")
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    with open(out_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        for seq in sorted(sequences):
            if seq == "COMBINED_SEQ":
                continue
            row = [seq]
            for detail in details:
                row.extend(detail[seq].values())
            writer.writerow(row)
        # Combined-over-sequences row goes last.
        combined_row = ["COMBINED"]
        for detail in details:
            combined_row.extend(detail["COMBINED_SEQ"].values())
        writer.writerow(combined_row)
def load_detail(file):
    """Loads detailed data for a tracker."""
    data = {}
    with open(file) as f:
        keys = []
        for line_num, raw_line in enumerate(f):
            cells = raw_line.replace("\r", "").replace("\n", "").split(",")
            if line_num == 0:
                # First row is the header; first column is the sequence name.
                keys = cells[1:]
                continue
            seq = cells[0]
            values = cells[1:]
            if seq == "COMBINED":
                seq = "COMBINED_SEQ"
            # Skip malformed / blank rows.
            if seq != "" and len(values) == len(keys):
                data[seq] = {key: float(value) for key, value in zip(keys, values)}
    return data
class TrackEvalException(Exception):
    """Custom exception raised for expected, recoverable evaluation errors."""

648
sam3/eval/postprocessors.py Normal file
View File

@@ -0,0 +1,648 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""Postprocessors class to transform MDETR output according to the downstream task"""
import dataclasses
import logging
from collections import defaultdict
from typing import Dict, List, Optional
import numpy as np
import torch
from sam3.model import box_ops
from sam3.model.data_misc import BatchedInferenceMetadata, interpolate
from sam3.train.masks_ops import rle_encode, robust_rle_encode
from torch import nn
class PostProcessNullOp(nn.Module):
    """A pass-through postprocessor: returns the model's outputs unchanged.

    Useful as a drop-in when no postprocessing is desired.
    """

    def __init__(self, **kwargs):
        # Bug fix: the original called `super(PostProcessNullOp).__init__()`,
        # which builds an *unbound* super object and never runs
        # `nn.Module.__init__`, leaving the module uninitialized
        # (e.g. `_parameters`/`_buffers` missing, breaking state_dict()).
        super().__init__()

    def forward(self, input):
        """No-op forward; this module performs no computation."""
        pass

    def process_results(self, **kwargs):
        """Return the raw `find_stages` outputs untouched."""
        return kwargs["find_stages"]
class PostProcessImage(nn.Module):
    """This module converts the model's output into the format expected by the coco api"""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        to_cpu: bool = True,
        use_original_ids: bool = False,
        use_original_sizes_box: bool = False,
        use_original_sizes_mask: bool = False,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        use_presence: bool = True,
        detection_threshold: float = -1.0,
    ) -> None:
        """
        Args:
            max_dets_per_img: cap on detections kept per image in `process_results`
                (values <= 0 disable pruning).
            iou_type: "bbox" or "segm"; masks are only postprocessed for "segm".
            to_cpu: move output tensors to CPU after postprocessing.
            use_original_ids: use original image/category ids instead of COCO ids.
            use_original_sizes_box: scale boxes to the original image size
                (otherwise boxes stay normalized via a size of ones).
            use_original_sizes_mask: same as above but for masks.
            convert_mask_to_rle: encode output masks as RLE dicts.
            always_interpolate_masks_on_gpu: move masks to the GPU before
                interpolation (falls back to CPU on failure).
            use_presence: multiply query probabilities by the decoder presence score.
            detection_threshold: keep only predictions above this score
                (<= 0 disables filtering).
        """
        super().__init__()
        self.max_dets_per_img = max_dets_per_img
        self.iou_type = iou_type
        self.to_cpu = to_cpu
        self.convert_mask_to_rle = convert_mask_to_rle
        self.always_interpolate_masks_on_gpu = always_interpolate_masks_on_gpu
        self.use_presence = use_presence
        self.detection_threshold = detection_threshold
        self.use_original_ids = use_original_ids
        self.use_original_sizes_box = use_original_sizes_box
        self.use_original_sizes_mask = use_original_sizes_mask

    @torch.no_grad()
    def forward(
        self,
        outputs,
        target_sizes_boxes,
        target_sizes_masks,
        forced_labels=None,
        consistent=False,
        ret_tensordict: bool = False,  # This is experimental
    ):
        """Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes_boxes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                For evaluation, this must be the original image size (before any data augmentation)
                For visualization, this should be the image size after data augment, but before padding
            target_sizes_masks: same but used to resize masks
            forced_labels: tensor of dimension [batch_size] containing the label to force for each image of the batch
                This is useful when evaluating the model using standard metrics (eg on COCO, LVIS). In that case,
                we query the model with every possible class label, so we when we pass the predictions to the evaluator,
                we want to make sure that the predicted "class" matches the one that was queried.
            consistent: whether all target sizes are equal
            ret_tensordict: Experimental argument. If true, return a tensordict.TensorDict instead of a list of dictionaries for easier manipulation.
        """
        if ret_tensordict:
            assert (
                consistent is True
            ), "We don't support returning TensorDict if the outputs have different shapes"  # NOTE: It's possible but we don't support it.
            assert self.detection_threshold <= 0.0, "TODO: implement?"
            # tensordict is an optional dependency; degrade gracefully if missing.
            try:
                from tensordict import TensorDict
            except ImportError:
                logging.info(
                    "tensordict is not installed. Install by running `pip install tensordict --no-deps`. Falling back by setting `ret_tensordict=False`"
                )
                ret_tensordict = False
        out_bbox = outputs["pred_boxes"] if "pred_boxes" in outputs else None
        out_logits = outputs["pred_logits"]
        # Masks are only postprocessed when evaluating segmentation IoU.
        pred_masks = outputs["pred_masks"] if self.iou_type == "segm" else None
        out_probs = out_logits.sigmoid()
        if self.use_presence:
            # Modulate per-query probabilities by the image-level presence score.
            presence_score = outputs["presence_logit_dec"].sigmoid().unsqueeze(1)
            out_probs = out_probs * presence_score
        assert target_sizes_boxes.shape[1] == 2
        assert target_sizes_masks.shape[1] == 2
        batch_size = target_sizes_boxes.shape[0]
        boxes, scores, labels, keep = self._process_boxes_and_labels(
            target_sizes_boxes, forced_labels, out_bbox, out_probs
        )
        assert boxes is None or len(boxes) == batch_size
        out_masks = self._process_masks(
            target_sizes_masks, pred_masks, consistent=consistent, keep=keep
        )
        del pred_masks
        if boxes is None:
            # Mask-only outputs: pad boxes/scores/labels with None placeholders.
            assert out_masks is not None
            assert not ret_tensordict, "We don't support returning TensorDict if the output does not contain boxes"
            B = len(out_masks)
            boxes = [None] * B
            scores = [None] * B
            labels = [None] * B
        results = {
            "scores": scores,
            "labels": labels,
            "boxes": boxes,
        }
        if out_masks is not None:
            if self.convert_mask_to_rle:
                results.update(masks_rle=out_masks)
            else:
                results.update(masks=out_masks)
        if ret_tensordict:
            results = TensorDict(results).auto_batch_size_()
            if self.to_cpu:
                results = results.cpu()
        else:
            # Convert a dictionary of lists/tensors to a list of dictionaries
            results = [
                dict(zip(results.keys(), res_tuple))
                for res_tuple in zip(*results.values())
            ]
        return results

    def _process_masks(self, target_sizes, pred_masks, consistent=True, keep=None):
        """Interpolate predicted mask logits to `target_sizes` and binarize at 0.5.

        When `consistent` is True all images must share the same target size and
        interpolation is done in a single batched call; otherwise each image is
        handled individually (optionally filtered by `keep`). Returns None if
        there are no masks, a boolean tensor in the consistent case, or a list
        of per-image mask tensors / RLE lists otherwise.
        """
        if pred_masks is None:
            return None
        if self.always_interpolate_masks_on_gpu:
            gpu_device = target_sizes.device
            assert gpu_device.type == "cuda"
            pred_masks = pred_masks.to(device=gpu_device)
        if consistent:
            assert keep is None, "TODO: implement?"
            # All masks should have the same shape, expected when processing a batch of size 1
            target_size = target_sizes.unique(dim=0)
            assert target_size.size(0) == 1, "Expecting all target sizes to be equal"
            out_masks = (
                interpolate(
                    pred_masks,
                    target_size.squeeze().tolist(),
                    mode="bilinear",
                    align_corners=False,
                ).sigmoid()
                > 0.5
            )
            if self.convert_mask_to_rle:
                raise RuntimeError("TODO: implement?")
            if self.to_cpu:
                out_masks = out_masks.cpu()
        else:
            # Placeholder list; every element is reassigned below, so the
            # shared-empty-list initialization is safe.
            out_masks = [[]] * len(pred_masks)
            assert keep is None or len(keep) == len(pred_masks)
            for i, mask in enumerate(pred_masks):
                h, w = target_sizes[i]
                if keep is not None:
                    mask = mask[keep[i]]
                # Uses the GPU version first, moves masks to CPU if it fails
                # (e.g. on OOM for very large masks).
                try:
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                except Exception as e:
                    logging.info("Issue found, reverting to CPU mode!")
                    mask_device = mask.device
                    mask = mask.cpu()
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                    interpolated = interpolated.to(mask_device)
                if self.convert_mask_to_rle:
                    out_masks[i] = robust_rle_encode(interpolated.squeeze(1))
                else:
                    out_masks[i] = interpolated
                if self.to_cpu:
                    out_masks[i] = out_masks[i].cpu()
        return out_masks

    def _process_boxes_and_labels(
        self, target_sizes, forced_labels, out_bbox, out_probs
    ):
        """Convert cxcywh boxes to absolute xyxy and derive scores/labels.

        Returns (boxes, scores, labels, keep); all None when there are no
        boxes. `keep` is a per-image boolean mask when the detection
        threshold is active, else None.
        """
        if out_bbox is None:
            return None, None, None, None
        assert len(out_probs) == len(target_sizes)
        if self.to_cpu:
            out_probs = out_probs.cpu()
        scores, labels = out_probs.max(-1)
        if forced_labels is None:
            # Open-vocabulary setting: all predictions share a dummy label of 1.
            labels = torch.ones_like(labels)
        else:
            labels = forced_labels[:, None].expand_as(labels)
        # convert to [x0, y0, x1, y1] format
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]
        if self.to_cpu:
            boxes = boxes.cpu()
        keep = None
        if self.detection_threshold > 0:
            # Filter out the boxes with scores below the detection threshold
            keep = scores > self.detection_threshold
            assert len(keep) == len(boxes) == len(scores) == len(labels)
            boxes = [b[k.to(b.device)] for b, k in zip(boxes, keep)]
            scores = [s[k.to(s.device)] for s, k in zip(scores, keep)]
            labels = [l[k.to(l.device)] for l, k in zip(labels, keep)]
        return boxes, scores, labels, keep

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """Postprocess all stages and merge them into a per-image-id dict.

        Results from different stages for the same image id are concatenated,
        then optionally pruned to the top `max_dets_per_img` by score.
        """
        if find_stages.loss_stages is not None:
            # Only keep the metadata of the stages that were actually run.
            find_metadatas = [find_metadatas[i] for i in find_stages.loss_stages]
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            # A size of ones keeps boxes/masks in normalized coordinates.
            img_size_for_boxes = (
                meta.original_size
                if self.use_original_sizes_box
                else torch.ones_like(meta.original_size)
            )
            img_size_for_masks = (
                meta.original_size
                if self.use_original_sizes_mask
                else torch.ones_like(meta.original_size)
            )
            detection_results = self(
                outputs,
                img_size_for_boxes,
                img_size_for_masks,
                forced_labels=(
                    meta.original_category_id if self.use_original_ids else None
                ),
            )
            ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(detection_results) == len(ids)
            for img_id, result in zip(ids, detection_results):
                if img_id.item() not in results:
                    results[img_id.item()] = result
                else:
                    # Same image seen in several stages: concatenate field-wise.
                    assert set(results[img_id.item()].keys()) == set(result.keys())
                    for k in result.keys():
                        if isinstance(result[k], torch.Tensor):
                            results[img_id.item()][k] = torch.cat(
                                [results[img_id.item()][k], result[k]], dim=0
                            )
                        elif isinstance(result[k], list):
                            results[img_id.item()][k] += result[k]
                        else:
                            raise NotImplementedError(
                                f"Unexpected type {type(result[k])} in result."
                            )
        # Prune the results to the max number of detections per image.
        for img_id, result in results.items():
            if (
                self.max_dets_per_img > 0
                and len(result["scores"]) > self.max_dets_per_img
            ):
                _, topk_indexes = torch.topk(
                    result["scores"], self.max_dets_per_img, dim=0
                )
                if self.to_cpu:
                    topk_indexes = topk_indexes.cpu()
                for k in result.keys():
                    if isinstance(results[img_id][k], list):
                        results[img_id][k] = [
                            results[img_id][k][i] for i in topk_indexes.tolist()
                        ]
                    else:
                        results[img_id][k] = results[img_id][k].to(topk_indexes.device)[
                            topk_indexes
                        ]
        return results
class PostProcessAPIVideo(PostProcessImage):
    """This module converts the video model's output into the format expected by the YT-VIS api"""

    def __init__(
        self,
        *args,
        to_cpu: bool = True,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        prob_thresh: float = 0.5,
        use_presence: bool = False,
        **kwargs,
    ):
        super().__init__(
            *args,
            # Here we always set `convert_mask_to_rle=False` in the base `PostProcessAPI` class
            # (so that its `_process_masks` won't return a list of RLEs). If we want to return
            # RLEs for video masklets, we handle it in this `PostProcessAPIVideo` class instead.
            convert_mask_to_rle=False,
            # Here we always set `to_cpu=False` in the base `PostProcessAPI` class (so that
            # the interpolated masks won't be automatically moved back to CPU). We will handle
            # it in this `PostProcessAPIVideo` class instead.
            # NOTE(review): contrary to the comment above, `to_cpu` is NOT forwarded to the
            # base class here, so the base keeps its default `to_cpu=True` — confirm whether
            # `to_cpu=False` should be passed explicitly.
            always_interpolate_masks_on_gpu=always_interpolate_masks_on_gpu,
            use_presence=use_presence,
            **kwargs,
        )
        # Expected keys in the output dict to postprocess
        self.EXPECTED_KEYS = [
            "pred_logits",
            "pred_boxes",
            "pred_masks",
        ]
        # Whether to post-process video masklets (under packed representation) into RLE format
        self.convert_mask_to_rle_for_video = convert_mask_to_rle
        # Whether to move the final per-video tensors to CPU.
        self.to_cpu_for_video = to_cpu
        # Per-frame probability threshold for keeping a tracked object's prediction.
        self.prob_thresh = prob_thresh

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """
        Tracking Postprocessor for SAM 3 video model.
        This function takes in the output of the SAM 3 video model and processes it to extract all the tracklet predictions.
        Args:
            find_stages: A list of tensors representing the output of the SAM 3 video model.
            find_metadatas: A list of BatchedInferenceMetadata objects containing metadata about each frame.
            **kwargs: Additional keyword arguments.
        Returns:
            A dictionary of predictions with video_id as key.
        """
        # Import tensordict here to avoid global dependency.
        try:
            from tensordict import TensorDict
        except ImportError as e:
            logging.error(
                "tensordict is not installed, please install by running `pip install tensordict --no-deps`"
            )
            raise e
        # Notes and assumptions:
        # 1- This postprocessor assumes results only for a single video.
        # 2- There are N stage outputs corresponding to N video frames
        # 3- Each stage outputs contains PxQ preds, where P is number of prompts and Q is number of object queries. The output should also contain the tracking object ids corresponding to each object query.
        # 4- The tracking object id has a default value of -1, indicating that the object query is not tracking any object in the frame, and hence its predictions can be ignored for a given frame.
        # 5- Some objects may be tracked in a subset of frames only. So, we first extract the predictions in a packed representation (for efficient postprocessing -- specially memory)
        #    and then we convert the packed representation into a padded one, where we zero pad boxes/masks for objects that are not tracked in some frames.
        # 6- We refer to objects by an object id, which is a tuple (prompt_idx, obj_id)
        assert len(find_stages) > 0, "There is nothing to postprocess?"
        PROMPT_AXIS, OBJ_QUERY_AXIS = (0, 1)
        NO_OBJ_ID = -1
        # Maps object ID -> [indices in packed tensor]
        tracked_objects_packed_idx = defaultdict(list)
        # Maps object ID -> [indices in padded tensor (abs frame index)]
        tracked_objects_frame_idx = defaultdict(list)
        total_num_preds = 0
        # This will hold the packed representation of predictions.
        vid_preds_packed: List[TensorDict] = []
        vid_masklets_rle_packed: List[Optional[Dict]] = []
        video_id = -1  # We assume single video postprocessing, this ID should be unique in the datapoint.
        for frame_idx, (frame_outs, meta) in enumerate(
            zip(find_stages, find_metadatas)
        ):
            # only store keys we need to extract the results
            frame_outs_td = TensorDict(
                {k: frame_outs[k] for k in self.EXPECTED_KEYS}
            ).auto_batch_size_()  # Shape is [P,Q,...]
            meta_td = TensorDict(
                dataclasses.asdict(meta)
            ).auto_batch_size_()  # Shape is [P,...]
            unique_vid_id = meta.original_image_id.unique()
            assert unique_vid_id.size(0) == 1
            if video_id == -1:
                video_id = unique_vid_id.item()
            else:
                assert (
                    video_id == unique_vid_id.item()
                ), "We can only postprocess one video per datapoint"
            # keeping track of which objects appear in the current frame
            obj_ids_per_frame = frame_outs["pred_object_ids"]
            assert obj_ids_per_frame.size(-1) == frame_outs["pred_logits"].size(-2)
            if self.prob_thresh is not None:
                # only keep the predictions on this frame with probability above the threshold
                # (remove those predictions during the keep-alive period of a tracking query,
                # where its "pred_object_ids" is still the tracked object ID rather than -1)
                pred_probs = frame_outs["pred_logits"].sigmoid().squeeze(-1)
                obj_ids_per_frame = torch.where(
                    pred_probs >= self.prob_thresh, obj_ids_per_frame, NO_OBJ_ID
                )
            tracked_obj_ids_idx = torch.where(obj_ids_per_frame != NO_OBJ_ID)
            # Object id is a tuple of (prompt_idx, obj_id). This is because the model can assign same obj_id for two different prompts.
            tracked_obj_ids = [
                (p_id.item(), obj_ids_per_frame[p_id, q_id].item())
                for p_id, q_id in zip(
                    tracked_obj_ids_idx[PROMPT_AXIS],
                    tracked_obj_ids_idx[OBJ_QUERY_AXIS],
                )
            ]
            if len(tracked_obj_ids) == 0:
                continue
            # For each object, we keep track of the packed and padded (frame index) indices
            for oid in tracked_obj_ids:
                tracked_objects_packed_idx[oid].append(total_num_preds)
                tracked_objects_frame_idx[oid].append(frame_idx)
                total_num_preds += 1
            # Since we have P*Q masks per frame, mask interpolation is the GPU memory bottleneck or time bottleneck in case of cpu processing.
            # Instead, we first extract results only for tracked objects, reducing the number of masks to K = sum_i(tracked_objs_per_ith_prompt), hopefully <<< P*Q
            tracked_objs_outs_td = frame_outs_td[
                tracked_obj_ids_idx
            ]  # [P,Q,...] --> [K,...]
            meta_td = meta_td[tracked_obj_ids_idx[PROMPT_AXIS].cpu()]
            if self.always_interpolate_masks_on_gpu:
                gpu_device = meta_td["original_size"].device
                assert gpu_device.type == "cuda"
                tracked_objs_outs_td = tracked_objs_outs_td.to(device=gpu_device)
            # NOTE(review): the base `forward` takes separate `target_sizes_boxes` and
            # `target_sizes_masks` positional arguments, but only one size tensor is passed
            # here; also `self.use_original_sizes` is not defined by this class or its base
            # (which defines `use_original_sizes_box`/`use_original_sizes_mask`) — confirm
            # this code path is exercised and correct.
            frame_results_td = self(
                tracked_objs_outs_td.unsqueeze(1),
                (
                    meta_td["original_size"]
                    if self.use_original_sizes
                    else torch.ones_like(meta_td["original_size"])
                ),
                forced_labels=(
                    meta_td["original_category_id"] if self.use_original_ids else None
                ),
                consistent=True,
                ret_tensordict=True,
            ).squeeze(1)
            del tracked_objs_outs_td
            # Optionally, remove "masks" from output tensor dict and directly encode them
            # to RLE format under packed representations
            if self.convert_mask_to_rle_for_video:
                interpolated_binary_masks = frame_results_td.pop("masks")
                rle_list = rle_encode(interpolated_binary_masks, return_areas=True)
                vid_masklets_rle_packed.extend(rle_list)
            # Optionally, move output TensorDict to CPU (do this after RLE encoding step above)
            if self.to_cpu_for_video:
                frame_results_td = frame_results_td.cpu()
            vid_preds_packed.append(frame_results_td)
        if len(vid_preds_packed) == 0:
            logging.debug(f"Video {video_id} has no predictions")
            return {video_id: []}
        vid_preds_packed = torch.cat(vid_preds_packed, dim=0)
        ############### Construct a padded representation of the predictions ###############
        num_preds = len(tracked_objects_packed_idx)
        num_frames = len(find_stages)
        # We zero pad any missing prediction
        # NOTE: here, we also have padded tensors for "scores" and "labels", but we overwrite them later.
        padded_frames_results = TensorDict(
            {
                k: torch.zeros(
                    num_preds, num_frames, *v.shape[1:], device=v.device, dtype=v.dtype
                )
                for k, v in vid_preds_packed.items()
            },
            batch_size=[
                num_preds,
                num_frames,
            ],
        )
        padded_frames_results["scores"][...] = -1e8  # a very low score for empty object
        # Track scores and labels of each pred tracklet, only for frames where the model was able to track that object
        tracklet_scores = []
        tracklet_labels = []
        # Optionally, fill the list of RLEs for masklets
        # note: only frames with actual predicted masks (in packed format) will be
        # filled with RLEs; the rest will remain None in results["masks_rle"]
        if self.convert_mask_to_rle_for_video:
            vid_masklets_rle_padded = [[None] * num_frames for _ in range(num_preds)]
        for o_idx, oid in enumerate(tracked_objects_packed_idx):
            oid2packed_idx = tracked_objects_packed_idx[oid]
            oid2padded_idx = tracked_objects_frame_idx[oid]
            obj_packed_results = vid_preds_packed[oid2packed_idx]
            padded_frames_results[o_idx][oid2padded_idx] = obj_packed_results
            if self.convert_mask_to_rle_for_video:
                for packed_idx, padded_idx in zip(oid2packed_idx, oid2padded_idx):
                    vid_masklets_rle_padded[o_idx][padded_idx] = (
                        vid_masklets_rle_packed[packed_idx]
                    )
            # NOTE: We need a single confidence score per tracklet for the mAP metric.
            # We use the average confidence score across time. (How does this impact AP?)
            tracklet_scores.append(obj_packed_results["scores"].mean())
            # We also need to have a unique category Id per tracklet.
            # This is not a problem for phrase AP, however, for mAP we do majority voting across time.
            tracklet_labels.append(obj_packed_results["labels"].mode()[0])
        results = padded_frames_results.to_dict()
        results["scores"] = torch.stack(tracklet_scores, dim=0)
        results["labels"] = torch.stack(tracklet_labels, dim=0)
        if self.convert_mask_to_rle_for_video:
            results["masks_rle"] = vid_masklets_rle_padded
        # we keep the frame-level scores since it's needed by some evaluation scripts
        results["per_frame_scores"] = padded_frames_results["scores"]
        return {video_id: results}
class PostProcessTracking(PostProcessImage):
    """This module converts the model's output into the format expected by the coco api"""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        force_single_mask: bool = False,
        **kwargs,
    ) -> None:
        """
        Args:
            max_dets_per_img: forwarded to `PostProcessImage`.
            iou_type: "bbox" or "segm"; forwarded to `PostProcessImage`.
            force_single_mask: keep only the highest-scoring mask per item
                before postprocessing.
        """
        super().__init__(max_dets_per_img=max_dets_per_img, iou_type=iou_type, **kwargs)
        self.force_single_mask = force_single_mask

    def process_results(
        self, find_stages, find_metadatas: BatchedInferenceMetadata, **kwargs
    ):
        """Postprocess per-frame tracking outputs.

        Returns a dict keyed by (media_id, object_id, frame_index) tuples,
        one entry per prediction.
        """
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            if self.force_single_mask:
                # For each item, select only the mask of the top-scoring query.
                scores, labels = outputs["pred_logits"].max(-1)
                m = []
                for i in range(len(outputs["pred_masks"])):
                    score, idx = scores[i].max(0)
                    m.append(outputs["pred_masks"][i][idx])
                outputs["pred_masks"] = torch.stack(m, 0).unsqueeze(1)
            # NOTE(review): the base `forward` takes separate `target_sizes_boxes`
            # and `target_sizes_masks` positional arguments, but only one size
            # tensor is passed here — confirm this call signature is correct.
            detection_results = self(outputs, meta.original_size, consistent=False)
            assert len(detection_results) == len(meta.coco_image_id)
            results.update(
                {
                    (media_id.item(), object_id.item(), frame_index.item()): result
                    for media_id, object_id, frame_index, result in zip(
                        meta.original_image_id,
                        meta.object_id,
                        meta.frame_index,
                        detection_results,
                    )
                }
            )
        return results
class PostProcessCounting(nn.Module):
    """This module converts the model's output to be evaluated for counting tasks"""

    def __init__(
        self,
        use_original_ids: bool = False,
        threshold: float = 0.5,
        use_presence: bool = False,
    ) -> None:
        """
        Args:
            use_original_ids: whether to use the original image ids or the coco ids
            threshold: threshold for counting (values above this are counted)
        """
        super().__init__()
        self.use_original_ids = use_original_ids
        self.threshold = threshold
        self.use_presence = use_presence

    def forward(self, outputs, target_sizes):
        """Compute per-image object counts from the model outputs.

        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
        """
        # Per-query probabilities from the raw logits.
        probs = torch.sigmoid(outputs["pred_logits"]).squeeze(-1)  # [B, N]
        if self.use_presence:
            presence = outputs["presence_logit_dec"].sigmoid()
            if presence.ndim == 1:
                presence = presence.unsqueeze(1)  # [B, 1]
            probs = probs * presence  # [B, N]
        # The count is the number of queries whose probability clears the threshold.
        per_image_counts = (probs > self.threshold).float().sum(dim=1)
        assert len(per_image_counts) == len(target_sizes)
        return [{"count": c.item()} for c in per_image_counts]

    @torch.no_grad()
    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """Run counting over all stages and key results by image id."""
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            per_image = self(outputs, meta.original_size)
            image_ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(per_image) == len(image_ids)
            for img_id, count_entry in zip(image_ids, per_image):
                results[img_id.item()] = count_entry
        return results

View File

@@ -0,0 +1,155 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import argparse
import json
import os
from collections import defaultdict
from iopath.common.file_io import g_pathmgr
from sam3.eval.saco_veval_evaluators import (
VideoCGF1Evaluator,
VideoPhraseApEvaluator,
VideoPhraseHotaEvaluator,
VideoTetaEvaluator,
YTVISPredFileEvaluator,
)
class VEvalEvaluator:
    """Runs the full suite of SA-Co video evaluators against one GT annotation
    file and writes the merged metrics to `eval_res_file` as JSON."""

    def __init__(self, gt_annot_file: str, eval_res_file: str):
        self.gt_annot_file = gt_annot_file
        self.eval_res_file = eval_res_file
        self.evaluators = [
            # mAP
            YTVISPredFileEvaluator(gt_annot_file),
            # Phrase AP
            VideoPhraseApEvaluator(gt_annot_file),
            # TETA
            VideoTetaEvaluator(gt_annot_file, use_mask=True, is_exhaustive=True),
            # HOTA
            VideoPhraseHotaEvaluator(gt_annot_file),
            # cgF1
            VideoCGF1Evaluator(gt_annot_file),
        ]

    def run_eval(self, pred_file: str):
        """Evaluate `pred_file` with every evaluator and merge all results."""
        merged_dataset_metrics = {}
        merged_np_metrics = defaultdict(dict)
        for ev in self.evaluators:
            dataset_metrics, np_metrics = ev.evaluate(pred_file)
            merged_dataset_metrics.update(dataset_metrics)
            # Merge per (video_id, category_id) metrics across evaluators.
            for pair_key, pair_metrics in np_metrics.items():
                merged_np_metrics[pair_key].update(pair_metrics)
        if not merged_dataset_metrics:
            # Keep the dict non-empty so downstream consumers always find a key.
            merged_dataset_metrics = {"": 0.0}
        video_np_rows = []
        for (video_id, category_id), pair_metrics in merged_np_metrics.items():
            video_np_rows.append(
                {"video_id": video_id, "category_id": category_id, **pair_metrics}
            )
        eval_metrics = {
            "dataset_results": merged_dataset_metrics,
            "video_np_results": video_np_rows,
        }
        with g_pathmgr.open(self.eval_res_file, "w") as f:
            json.dump(eval_metrics, f)
        return eval_metrics
def run_main_all(dataset_name, args):
    """Evaluate one named dataset; GT/pred/result paths are derived from the
    directories given in `args` using the dataset name as the file stem."""
    gt_annot_file = os.path.join(args.gt_annot_dir, f"{dataset_name}.json")
    pred_file = os.path.join(args.pred_dir, f"{dataset_name}_preds.json")
    eval_res_file = os.path.join(args.eval_res_dir, f"{dataset_name}_eval_res.json")
    print(f"=== Running evaluation for Pred {pred_file} vs GT {gt_annot_file} ===")
    evaluator = VEvalEvaluator(gt_annot_file=gt_annot_file, eval_res_file=eval_res_file)
    evaluator.run_eval(pred_file=pred_file)
    print(f"=== Results saved to {eval_res_file} ===")
def main_all(args):
    """Run evaluation sequentially over every SA-Co VEval dataset split."""
    saco_veval_dataset_names = [
        "saco_veval_sav_test",
        "saco_veval_sav_val",
        "saco_veval_yt1b_test",
        "saco_veval_yt1b_val",
        "saco_veval_smartglasses_test",
        "saco_veval_smartglasses_val",
    ]
    # multiprocessing may not really work as inner evaluator also using multiprocessing
    # so we just for loop
    for name in saco_veval_dataset_names:
        print(f"=== Running evaluation for dataset {name} ===")
        run_main_all(dataset_name=name, args=args)
def main_one(args):
    """Evaluate a single dataset from explicit file paths in `args`."""
    print(
        f"=== Running evaluation for Pred {args.pred_file} vs GT {args.gt_annot_file} ==="
    )
    evaluator = VEvalEvaluator(
        gt_annot_file=args.gt_annot_file, eval_res_file=args.eval_res_file
    )
    evaluator.run_eval(pred_file=args.pred_file)
    print(f"=== Results saved to {args.eval_res_file} ===")
def main():
    """CLI entry point: parse arguments and dispatch to a sub-command."""
    parser = argparse.ArgumentParser(description="Run video grounding evaluators")
    subparsers = parser.add_subparsers(dest="command", required=True)
    # `all`: evaluate every dataset, with paths derived from directories.
    all_parser = subparsers.add_parser("all", help="Run evaluation for all datasets")
    for flag, help_text in (
        ("--gt_annot_dir", "Directory that contains the ground truth annotation files"),
        ("--pred_dir", "Directory that contains the prediction files"),
        ("--eval_res_dir", "Directory that contains the eval results files"),
    ):
        all_parser.add_argument(flag, type=str, help=help_text)
    all_parser.set_defaults(func=main_all)
    # `one`: evaluate a single dataset from explicit file paths.
    one_parser = subparsers.add_parser("one", help="Run evaluation for one dataset")
    for flag, help_text in (
        ("--gt_annot_file", "Path to the ground truth annotation file"),
        ("--pred_file", "Path to the prediction file"),
        ("--eval_res_file", "Path to the eval results file"),
    ):
        one_parser.add_argument(flag, type=str, help=help_text)
    one_parser.set_defaults(func=main_one)
    # Parse and dispatch to the selected sub-command handler.
    args = parser.parse_args()
    args.func(args)
# Script entry point: parse CLI arguments and dispatch to the chosen sub-command.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,838 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import json
import os
import tempfile
from collections import defaultdict
from typing import Dict, Optional, Sequence, Tuple
import numpy as np
import pycocotools.mask
from sam3.eval.cgf1_eval import CGF1_METRICS
from sam3.eval.conversion_util import (
convert_ytbvis_to_cocovid_gt,
convert_ytbvis_to_cocovid_pred,
)
from sam3.eval.hota_eval_toolkit.run_ytvis_eval import run_ytvis_eval
from sam3.eval.teta_eval_toolkit import config, Evaluator, metrics
from sam3.eval.teta_eval_toolkit.datasets import COCO, TAO
from sam3.eval.ytvis_coco_wrapper import YTVIS
from sam3.eval.ytvis_eval import VideoDemoF1Eval, YTVISeval
from sam3.train.nms_helper import process_frame_level_nms, process_track_level_nms
def _get_metric_index(metric_name: str, iou_threshold: Optional[float] = None) -> int:
    """Return the position of a metric in CGF1_METRICS.

    Args:
        metric_name: Name of the metric (e.g., "cgF1", "precision", "recall")
        iou_threshold: IoU threshold (None for average over 0.5:0.95, or a
            specific value like 0.5, 0.75)
    Returns:
        Index of the metric in CGF1_METRICS
    Raises:
        ValueError: If no metric matches both the name and the threshold.
    """
    for position, candidate in enumerate(CGF1_METRICS):
        name_matches = candidate.name == metric_name
        iou_matches = candidate.iou_threshold == iou_threshold
        if name_matches and iou_matches:
            return position
    raise ValueError(
        f"Metric '{metric_name}' with IoU threshold {iou_threshold} not found in CGF1_METRICS"
    )
class BasePredFileEvaluator:
    """Common base for evaluators that score a prediction file against GT."""
class YTVISPredFileEvaluator(BasePredFileEvaluator):
    """Evaluate class mAP for YT-VIS prediction files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format GT annotation file.
            dataset_name: prefix used in the result metric keys.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Evaluate `pred_file` and return (dataset-level metrics, per-video-NP metrics).

        The second element is currently always empty for this evaluator.
        """
        # use our internal video evaluation toolkit for YT-VIS pred file
        # (i.e. the same one we're using for video phrase AP)
        results = {}
        use_cats = True  # YT-VIS mAP evaluation uses categories
        ytvisGT = YTVIS(self.gt_ann_file, ignore_gt_cats=not use_cats)
        # the original YT-VIS GT annotations have uncompressed RLEs ("counts" is an integer list)
        # rather than compressed RLEs ("counts" is a string), so we first convert them here.
        if "segm" in self.iou_types:
            for ann in ytvisGT.dataset["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        with open(pred_file) as f:
            dt = json.load(f)
        # Our prediction file saves "video_id" and absolute (unnormalized) boxes.
        # Note that we should use the official (original) YT-VIS annotations (i.e. the one
        # saved via "scripts/datasets/training/ytvis_split.py", instead of the one saved
        # via "scripts/api_db_to_ytvis_json.py") in this evaluator, which contain absolute
        # boxes coordinates in its GT annotations.
        for d in dt:
            d["image_id"] = d["video_id"]
        ytvisDT = ytvisGT.loadRes(dt)
        for iou_type in self.iou_types:
            ytvisEval = YTVISeval(ytvisGT, ytvisDT, iou_type)
            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            ytvisEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            ytvisEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            ytvisEval.params.useCats = use_cats
            ytvisEval.evaluate()
            ytvisEval.accumulate()
            ytvisEval.summarize()
            result_key = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_mAP_50_95"
            # stats[0] is AP averaged over IoU=0.5:0.95.
            results[result_key] = ytvisEval.stats[0]
        # video-NP level results not supported for `YTVISPredFileEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
class VideoPhraseApEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase AP with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format GT annotation file.
            dataset_name: prefix used in the result metric keys.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Evaluate `pred_file` and return (dataset-level metrics, per-video-NP metrics).

        The second element is currently always empty for this evaluator.
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        # Convert uncompressed GT RLEs ("counts" as int list) to compressed form.
        if "segm" in self.iou_types:
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        for d in dt:
            d["image_id"] = d["video_id"]
        results = {}
        use_cats = False  # Phrase AP evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)
        for iou_type in self.iou_types:
            phraseApEval = YTVISeval(ytvisGT, ytvisDT, iou_type)
            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            phraseApEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            phraseApEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            phraseApEval.params.useCats = use_cats
            phraseApEval.evaluate()
            phraseApEval.accumulate()
            phraseApEval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_phrase_ap"
            # fetch Phrase AP results from the corresponding indices in `phraseApEval.stats`
            # (see `_summarizeDets` in https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py)
            results[result_prefix + "_50_95"] = phraseApEval.stats[0]  # IoU=0.5:0.95
            results[result_prefix + "_50"] = phraseApEval.stats[1]  # IoU=0.5
            results[result_prefix + "_75"] = phraseApEval.stats[2]  # IoU=0.75
        # video-NP level results not supported for `VideoPhraseApEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
class VideoCGF1Evaluator(BasePredFileEvaluator):
    """Evaluate Video Demo F1 with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format ground-truth JSON file.
            dataset_name: prefix used for all reported metric keys.
            prob_thresh: score threshold passed to `VideoDemoF1Eval`.
            iou_types: subset of ["bbox", "segm"] to evaluate (defaults to both).
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run demo-F1 / CG-F1 evaluation on `pred_file`.

        Returns:
            A `(results, video_np_level_results)` tuple. `results` maps metric
            names to floats; `video_np_level_results` maps each original
            `(video_id, category_id)` pair to its per-video metrics.
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # IL_MCC and CG-F1 can only be computed if we have "video_np_pairs" keys in the GT JSON
        compute_ilmcc_and_cgf1 = "video_np_pairs" in gt
        if not compute_ilmcc_and_cgf1:
            print(
                f"Warning: IL_MCC and CG-F1 are not computed for {pred_file=} as it does not have 'video_np_pairs' keys in the GT JSON"
            )
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(
            gt, dt, add_negative_np_pairs=compute_ilmcc_and_cgf1
        )
        if "segm" in self.iou_types:
            # Mask IoU requires compressed (string-counts) RLEs.
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        # The COCO-style evaluator indexes detections by "image_id".
        for d in dt:
            d["image_id"] = d["video_id"]
        results = {}
        use_cats = False  # Demo F1 evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)
        video_np_level_results = {}
        for iou_type in self.iou_types:
            demoF1Eval = VideoDemoF1Eval(ytvisGT, ytvisDT, iou_type, self.prob_thresh)
            demoF1Eval.params.useCats = use_cats
            demoF1Eval.params.areaRng = [[0**2, 1e5**2]]
            demoF1Eval.params.areaRngLbl = ["all"]
            demoF1Eval.params.maxDets = [100000]
            demoF1Eval.evaluate()
            demoF1Eval.accumulate()
            demoF1Eval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_demo"
            stats = demoF1Eval.stats
            if compute_ilmcc_and_cgf1:
                # Average IoU threshold (0.5:0.95)
                cgf1_micro_avg_idx = _get_metric_index("cgF1", None)
                positive_micro_f1_avg_idx = _get_metric_index("positive_micro_F1", None)
                ilmcc_avg_idx = _get_metric_index("IL_MCC", None)
                results[result_prefix + "_cgf1_micro_50_95"] = stats[cgf1_micro_avg_idx]
                results[result_prefix + "_ilmcc_50_95"] = stats[ilmcc_avg_idx]
                results[result_prefix + "_positive_micro_f1_50_95"] = stats[
                    positive_micro_f1_avg_idx
                ]
                # IoU = 0.5
                cgf1_micro_50_idx = _get_metric_index("cgF1", 0.5)
                positive_micro_f1_50_idx = _get_metric_index("positive_micro_F1", 0.5)
                results[result_prefix + "_cgf1_micro_50"] = stats[cgf1_micro_50_idx]
                # IL_MCC at a fixed IoU threshold is derived as the ratio
                # cgF1 / positive_micro_F1.
                results[result_prefix + "_ilmcc_50"] = float(
                    np.array(stats[cgf1_micro_50_idx])
                    / np.array(stats[positive_micro_f1_50_idx])
                )
                results[result_prefix + "_positive_micro_f1_50"] = stats[
                    positive_micro_f1_50_idx
                ]
                # IoU = 0.75
                cgf1_micro_75_idx = _get_metric_index("cgF1", 0.75)
                positive_micro_f1_75_idx = _get_metric_index("positive_micro_F1", 0.75)
                results[result_prefix + "_cgf1_micro_75"] = stats[cgf1_micro_75_idx]
                results[result_prefix + "_ilmcc_75"] = float(
                    np.array(stats[cgf1_micro_75_idx])
                    / np.array(stats[positive_micro_f1_75_idx])
                )
                results[result_prefix + "_positive_micro_f1_75"] = stats[
                    positive_micro_f1_75_idx
                ]
            self.extract_video_np_level_results(demoF1Eval, video_np_level_results)
        return results, video_np_level_results

    def extract_video_np_level_results(self, demoF1Eval, video_np_level_results):
        """Aggregate statistics for video-level metrics.

        Populates `video_np_level_results[(orig_video_id, orig_category_id)]`
        with per-video TP/FP/FN/F1 at IoU 0.5, IoU 0.75 and averaged over
        the 0.5:0.95 threshold range.
        """
        num_iou_thrs = len(demoF1Eval.params.iouThrs)
        iou_50_index = int(np.where(demoF1Eval.params.iouThrs == 0.5)[0])
        iou_75_index = int(np.where(demoF1Eval.params.iouThrs == 0.75)[0])
        result_prefix = "mask" if demoF1Eval.params.iouType == "segm" else "bbox"
        # One eval entry per remapped video (each corresponds to one video-NP pair).
        assert len(demoF1Eval.evalImgs) == len(demoF1Eval.cocoGt.dataset["images"])
        for i, video in enumerate(demoF1Eval.cocoGt.dataset["images"]):
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            eval_img_dict = demoF1Eval.evalImgs[i]
            # Missing counts default to all-zero (e.g. videos with no matches).
            TPs = eval_img_dict.get("TPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FPs = eval_img_dict.get("FPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FNs = eval_img_dict.get("FNs", np.zeros(num_iou_thrs, dtype=np.int64))
            assert len(TPs) == len(FPs) == len(FNs) == num_iou_thrs
            # F1 = 2*TP / (2*TP + FP + FN), and we set F1 to 1.0 if denominator is 0
            denominator = 2 * TPs + FPs + FNs
            F1s = np.where(denominator > 0, 2 * TPs / np.maximum(denominator, 1), 1.0)
            local_results = {
                f"{result_prefix}_TP_50_95": float(TPs.mean()),
                f"{result_prefix}_FP_50_95": float(FPs.mean()),
                f"{result_prefix}_FN_50_95": float(FNs.mean()),
                f"{result_prefix}_F1_50_95": float(F1s.mean()),
                f"{result_prefix}_TP_50": float(TPs[iou_50_index]),
                f"{result_prefix}_FP_50": float(FPs[iou_50_index]),
                f"{result_prefix}_FN_50": float(FNs[iou_50_index]),
                f"{result_prefix}_F1_50": float(F1s[iou_50_index]),
                f"{result_prefix}_TP_75": float(TPs[iou_75_index]),
                f"{result_prefix}_FP_75": float(FPs[iou_75_index]),
                f"{result_prefix}_FN_75": float(FNs[iou_75_index]),
                f"{result_prefix}_F1_75": float(F1s[iou_75_index]),
            }
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
class VideoTetaEvaluator(BasePredFileEvaluator):
    """Evaluate TETA metric using YouTubeVIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        tracker_name: str = "Sam3",
        nms_threshold: float = 0.5,
        nms_strategy: str = "none",  # "track", "frame", or "none"
        prob_thresh: float = 0.5,
        is_exhaustive: bool = False,
        use_mask: bool = False,
        num_parallel_cores: int = 8,
    ):
        """
        Args:
            gt_ann_file: path to the YouTubeVIS-format ground-truth JSON file.
            dataset_name: prefix used for all reported metric keys.
            tracker_name: folder name under which converted predictions are stored.
            nms_threshold: IoU threshold for the optional NMS pass.
            nms_strategy: "track" (track-level NMS), "frame" (frame-level NMS),
                or "none" (skip NMS).
            prob_thresh: predictions scoring below this threshold are dropped.
            is_exhaustive: if True, evaluate with the COCO (exhaustive) TETA
                dataset wrapper; otherwise with the TAO wrapper.
            use_mask: if True, evaluate with masks instead of boxes.
            num_parallel_cores: worker count passed to the TETA evaluator.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.tracker_name = tracker_name
        self.nms_threshold = nms_threshold
        self.nms_strategy = nms_strategy.lower()  # Convert to lowercase for consistency
        self.prob_thresh = prob_thresh
        self.metric_prefix = "TETA"
        self.is_exhaustive = is_exhaustive
        self.use_mask = use_mask
        self.num_parallel_cores = num_parallel_cores
        # Verify NMS strategy is valid
        valid_strategies = ["track", "frame", "none"]
        print("current nms_strategy:", self.nms_strategy)
        if self.nms_strategy not in valid_strategies:
            raise ValueError(
                f"Invalid NMS strategy: {self.nms_strategy}. Must be one of {valid_strategies}"
            )
        print(f"Initialized VideoTetaEvaluator with NMS strategy: {self.nms_strategy}")
        print(f"Probability threshold set to: {self.prob_thresh}")
        print(f"Dataset exhaustivity set to: {self.is_exhaustive}")
        print(f"Tracker name set to: {self.tracker_name}")
        print(f"Dataset name set to: {self.dataset_name}")
        print(f"Use mask set to: {self.use_mask}")

    def process_predictions(self, pred_file: str, tmp_dir: str) -> str:
        """Process predictions with selected NMS strategy.

        Loads `pred_file`, drops predictions below `self.prob_thresh`, applies
        the configured NMS strategy, writes the surviving tracks to
        `tmp_dir/processed_preds.json`, and returns that path.
        """
        with open(pred_file, "r") as f:
            raw_preds = json.load(f)
        print(f"Processing predictions with {self.nms_strategy} NMS strategy")
        # Filter by score threshold
        if self.prob_thresh > 0:
            raw_preds = [d for d in raw_preds if d["score"] >= self.prob_thresh]
            print(
                f"Filtered to {len(raw_preds)} predictions with score >= {self.prob_thresh}"
            )
        # Group predictions by video_id
        video_groups = defaultdict(list)
        for pred in raw_preds:
            video_groups[pred["video_id"]].append(pred)
        # Process based on NMS strategy. The helpers' return values are unused,
        # so they are assumed to filter `video_groups` in place — TODO confirm.
        if self.nms_strategy == "track":
            process_track_level_nms(video_groups, nms_threshold=self.nms_threshold)
        elif self.nms_strategy == "frame":
            process_frame_level_nms(video_groups, nms_threshold=self.nms_threshold)
        elif self.nms_strategy == "none":
            print("Skipping NMS processing as strategy is set to 'none'")
            # No processing needed for "none" strategy
        # Save processed predictions
        processed_preds = [
            track for tracks in video_groups.values() for track in tracks
        ]
        processed_path = os.path.join(tmp_dir, "processed_preds.json")
        with open(processed_path, "w") as f:
            json.dump(processed_preds, f)
        print(f"Saved processed predictions to {processed_path}")
        return processed_path

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Main evaluation method.

        Converts GT and (NMS-processed) predictions to COCO-vid format in a
        temporary directory, runs the TETA evaluator, and returns a
        `(results, video_np_level_results)` tuple; the latter is always empty.
        """
        print(f"Evaluating TETA Metric with {self.nms_strategy.upper()} NMS strategy")
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Process predictions first
            processed_pred_file = self.process_predictions(pred_file, tmp_dir)
            # Convert GT to COCO-vid format
            gt_dir = os.path.join(tmp_dir, "gt")
            os.makedirs(gt_dir, exist_ok=True)
            gt_coco_path = os.path.join(gt_dir, "annotations.json")
            convert_ytbvis_to_cocovid_gt(self.gt_ann_file, gt_coco_path)
            # Convert processed predictions to COCO-vid format
            pred_dir = os.path.join(tmp_dir, "predictions")
            tracker_dir = os.path.join(pred_dir, self.tracker_name)
            os.makedirs(tracker_dir, exist_ok=True)
            pred_coco_path = os.path.join(tracker_dir, "track_results_cocofmt.json")
            convert_ytbvis_to_cocovid_pred(
                youtubevis_pred_path=processed_pred_file,
                converted_dataset_path=gt_coco_path,
                output_path=pred_coco_path,
            )
            # Configure TETA evaluator
            default_eval_config = config.get_default_eval_config()
            default_eval_config["PRINT_ONLY_COMBINED"] = True
            default_eval_config["DISPLAY_LESS_PROGRESS"] = True
            # NOTE(review): the default eval config spells this key
            # "OUTPUT_TEM_RAW_DATA" — confirm which spelling the TETA
            # Evaluator actually reads.
            default_eval_config["OUTPUT_TEMP_RAW_DATA"] = True
            default_eval_config["NUM_PARALLEL_CORES"] = self.num_parallel_cores
            default_dataset_config = config.get_default_dataset_config()
            default_dataset_config["TRACKERS_TO_EVAL"] = [self.tracker_name]
            default_dataset_config["GT_FOLDER"] = gt_dir
            default_dataset_config["OUTPUT_FOLDER"] = pred_dir
            default_dataset_config["TRACKER_SUB_FOLDER"] = tracker_dir
            default_dataset_config["USE_MASK"] = self.use_mask
            evaluator = Evaluator(default_eval_config)
            # `COCO`/`TAO` here are the TETA dataset wrappers.
            if self.is_exhaustive:
                dataset_list = [COCO(default_dataset_config)]
                dataset_parsing_key = "COCO"
            else:
                dataset_list = [TAO(default_dataset_config)]
                dataset_parsing_key = "TAO"
            # Run evaluation
            eval_results, _ = evaluator.evaluate(
                dataset_list, [metrics.TETA(exhaustive=self.is_exhaustive)]
            )
            # Extract and format results from the fixed positions of the
            # TETA result vector.
            results = {
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_teta": float(
                    eval_results[dataset_parsing_key]["TETA"][0]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_a": float(
                    eval_results[dataset_parsing_key]["TETA"][1]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_a": float(
                    eval_results[dataset_parsing_key]["TETA"][2]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_a": float(
                    eval_results[dataset_parsing_key]["TETA"][3]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_re": float(
                    eval_results[dataset_parsing_key]["TETA"][4]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][5]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_re": float(
                    eval_results[dataset_parsing_key]["TETA"][6]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][7]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_re": float(
                    eval_results[dataset_parsing_key]["TETA"][8]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][9]
                ),
            }
        # video-NP level results not supported for `VideoTetaEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
class VideoPhraseHotaEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase HOTA with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
        compute_video_mot_hota: bool = False,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format ground-truth JSON file.
            dataset_name: prefix used for all reported metric keys.
            prob_thresh: predictions with score <= this threshold are dropped.
            iou_types: subset of ["bbox", "segm"] to evaluate (defaults to both).
            compute_video_mot_hota: if True, aggregate predictions/GT across all
                categories and evaluate class-agnostic video-level MOT HOTA.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.metric_prefix = "phrase"
        # the list of metrics to collect from the HOTA evaluation results
        self.metric_to_collect = [
            "HOTA",
            "DetA",
            "AssA",
            "DetRe",
            "DetPr",
            "AssRe",
            "AssPr",
            "LocA",
            "OWTA",
        ]
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)
        # If True, compute video MOT HOTA, aggregating predictions/GT from all categories.
        self.compute_video_mot_hota = compute_video_mot_hota

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run phrase (or class-agnostic MOT) HOTA evaluation on `pred_file`.

        Returns:
            A `(results, video_np_level_results)` tuple. `results` maps metric
            names to floats; `video_np_level_results` maps each original
            `(video_id, category_id)` pair to its per-video HOTA metrics.
        """
        # use the YT-VIS evaluation toolkit in TrackEval
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # keep only predictions with score above the probability threshold
        dt = [d for d in dt if d["score"] > self.prob_thresh]
        for d in dt:
            assert len(d["areas"]) == len(d["bboxes"])
            assert len(d["areas"]) == len(d["segmentations"])
            # remove empty boxes (otherwise they will count as false positives for during
            # per-frame detection accuracy in HOTA evaluation)
            for t in range(len(d["bboxes"])):
                bbox = d["bboxes"][t]
                if d["areas"][t] == 0 or bbox is None or all(x == 0 for x in bbox):
                    d["segmentations"][t] = None
                    d["bboxes"][t] = None
                    d["areas"][t] = None
            # check that box occurence and mask occurence are consistent
            for bbox, mask, area in zip(d["bboxes"], d["segmentations"], d["areas"]):
                assert (area is None) == (bbox is None)
                assert (area is None) == (mask is None)
            # set all scores to 1.0 for HOTA evaluation (just like Demo F1, the exact score
            # value is not used in HOTA metrics; it will be treated as a detection prediction
            # as long as its score is above the threshold)
            d["score"] = 1.0
        # fill missing per-annotation height/width from the video entries
        gt = _fill_in_ann_height_width(gt)
        if not self.compute_video_mot_hota:
            # remap the GT and DT annotations for phrase HOTA evaluation
            gt, dt = self._remap_gt_dt(gt, dt)
        else:
            # Compute video-level MOT HOTA
            # Apply track-level NMS
            video_groups = defaultdict(list)
            for pred in dt:
                video_groups[pred["video_id"]].append(pred)
            process_track_level_nms(video_groups, nms_threshold=0.5)
            dt = [track for tracks in video_groups.values() for track in tracks]
            # Remap GT track ids for class-agnostic HOTA
            gt, dt = remap_gt_dt_class_agnostic(gt, dt)
        # run the HOTA evaluation using TrackEval on the remapped (video_id, category_id) pairs
        out_dict = {}
        video_np_level_results = {}
        for iou_type in self.iou_types:
            output_res, _ = run_ytvis_eval(
                args=[
                    "--METRICS",
                    "HOTA",
                    "--IOU_TYPE",
                    iou_type,
                    "--DATASET_NAME",
                    self.dataset_name,
                    "--USE_PARALLEL",
                    "True",
                    "--NUM_PARALLEL_CORES",
                    "8",
                    "--PLOT_CURVES",
                    "False",
                    "--LOG_ON_ERROR",
                    "None",
                    "--PRINT_ONLY_COMBINED",
                    "True",
                    "--OUTPUT_SUMMARY",
                    "False",
                    "--OUTPUT_DETAILED",
                    "False",
                    "--TIME_PROGRESS",
                    "False",
                    "--PRINT_CONFIG",
                    "False",
                ],
                gt_json=gt,
                dt_json=dt,
            )
            self.extract_video_np_level_results(
                iou_type=iou_type,
                remapped_gt=gt,
                raw_results=output_res[self.dataset_name]["tracker"],
                video_np_level_results=video_np_level_results,
            )

            def _summarize_results(output_res, iou_type, field, suffix):
                # Collect the averaged HOTA sub-metrics for one combined field.
                eval_res = output_res[self.dataset_name]["tracker"][field]
                result_prefix = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_{suffix}"
                for metric_name in self.metric_to_collect:
                    eval_res_hota = eval_res["cls_comb_cls_av"]["HOTA"]
                    result_key = f"{result_prefix}_{self.metric_prefix}_{metric_name}"
                    result_value = float(np.mean(eval_res_hota[metric_name]))
                    out_dict[result_key] = result_value

            _summarize_results(output_res, iou_type, "COMBINED_SEQ", "all")
            if "COMBINED_SEQ_CHALLENGING" in output_res[self.dataset_name]["tracker"]:
                _summarize_results(
                    output_res, iou_type, "COMBINED_SEQ_CHALLENGING", "challenging"
                )
        # per-video results were collected above by extract_video_np_level_results
        return out_dict, video_np_level_results

    def _remap_gt_dt(self, gt, dt):
        """Remap GT/DT for phrase HOTA: one pseudo-video per (video, category)."""
        # For phrase HOTA evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        # We further map all the categories to category_id=1 in HOTA evaluation toolkit
        # for phrase HOTA (similar to "useCat=False" for video phrase AP)
        remapped_category_id = 1
        gt["categories"] = [
            {
                "supercategory": "object",
                "id": remapped_category_id,
                "name": "_REMAPPED_FOR_PHRASE_METRICS_",
            }
        ]
        for ann in gt["annotations"]:
            ann["category_id"] = remapped_category_id
        for d in dt:
            d["category_id"] = remapped_category_id
        # To be compatible with the TrackEval YT-VIS evaluation toolkit, we need to give
        # unique filenames to each remapped video, so we add remapped video_id as prefix.
        for video in gt["videos"]:
            new_video_id = video["id"]
            video["file_names"] = [
                f"remapped_vid_{new_video_id:012d}/{name}"
                for name in video["file_names"]
            ]
        return gt, dt

    def extract_video_np_level_results(
        self, iou_type, remapped_gt, raw_results, video_np_level_results
    ):
        """Aggregate statistics for video-level metrics.

        Populates `video_np_level_results[(orig_video_id, orig_category_id)]`
        with the per-video averages of every metric in `self.metric_to_collect`.
        """
        result_prefix = "mask" if iou_type == "segm" else "bbox"
        for video in remapped_gt["videos"]:
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            # Key format must match the file-name prefix set during remapping.
            video_key = f"remapped_vid_{video['id']:012d}"
            results = raw_results[video_key]["_REMAPPED_FOR_PHRASE_METRICS_"]["HOTA"]
            local_results = {}
            for metric_name in self.metric_to_collect:
                result_key = f"{result_prefix}_{metric_name}"
                local_results[result_key] = float(results[metric_name].mean())
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
class VideoClassBasedHotaEvaluator(VideoPhraseHotaEvaluator):
    """Class-based HOTA evaluation.

    Identical to `VideoPhraseHotaEvaluator` except that the GT/DT are used
    as-is (no (video, category) remapping) and results are reported under
    the "class" metric prefix.
    """

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
    ):
        super().__init__(gt_ann_file, dataset_name, prob_thresh)
        # Report metrics as "..._class_..." instead of "..._phrase_...".
        self.metric_prefix = "class"

    def _remap_gt_dt(self, gt, dt):
        # Class-based evaluation keeps the original video/category structure.
        return (gt, dt)

    def extract_video_np_level_results(self, *args, **kwargs):
        # Per-video (video-NP level) results are not produced for
        # class-based HOTA evaluation.
        pass
def _compress_rle(rle):
    """Convert RLEs from uncompressed (integer list) to compressed (string) format.

    `None` inputs and already-compressed RLEs are returned unchanged.
    """
    if rle is None:
        return None
    if not isinstance(rle["counts"], list):
        # Already in compressed form.
        return rle
    height, width = rle["size"][0], rle["size"][1]
    rle = pycocotools.mask.frPyObjects(rle, height, width)
    # frPyObjects yields bytes counts; store them as a JSON-friendly string.
    rle["counts"] = rle["counts"].decode()
    return rle
def remap_video_category_pairs_to_unique_video_ids(
    gt_json, dt_json, add_negative_np_pairs=False
):
    """
    Remap each pair of (video_id, category_id) to a new unique video_id. This is useful
    for phrase AP and demo F1 evaluation on videos, where we have `useCat=False` and
    rely on separating different NPs (from the same video) into different new video ids,
    so that we don't mix detections from different categories in computeIoU under `useCat=False`.
    This is consistent with how do we phrase AP and demo F1 evaluation on images, where we
    use a remapped unique coco_image_id for each image-NP pair (based in its query["id"] in
    CustomCocoDetectionAPI.load_queries in modulated_detection_api.py)

    Both inputs are modified in place and returned.
    """
    videos_by_id = {video["id"]: video for video in gt_json["videos"]}

    # Every (video_id, category_id) pair seen in either predictions or GT.
    pairs = {(pred["video_id"], pred["category_id"]) for pred in dt_json}
    pairs |= {(ann["video_id"], ann["category_id"]) for ann in gt_json["annotations"]}

    # Deterministically assign new 1-based video ids to the sorted pairs.
    pair_to_new_id = {pair: idx for idx, pair in enumerate(sorted(pairs), start=1)}

    if add_negative_np_pairs:
        # Negative video-NP pairs (no GT, no predictions) also need ids --
        # this is required for IL_MCC and CG-F1 evaluation.
        for vnp in gt_json["video_np_pairs"]:
            key = (vnp["video_id"], vnp["category_id"])
            if key not in pair_to_new_id:
                pair_to_new_id[key] = len(pair_to_new_id) + 1

    # Rewrite the "video_id" field of predictions and GT annotations.
    for pred in dt_json:
        pred["video_id"] = pair_to_new_id[(pred["video_id"], pred["category_id"])]
    for ann in gt_json["annotations"]:
        ann["video_id"] = pair_to_new_id[(ann["video_id"], ann["category_id"])]

    # Duplicate each video entry once per mapped pair, preserving the original
    # ids so sample-level metrics can be traced back to the video-NP pair.
    remapped_videos = []
    for (orig_video_id, orig_category_id), new_video_id in pair_to_new_id.items():
        entry = videos_by_id[orig_video_id].copy()
        entry["id"] = new_video_id
        entry["orig_video_id"] = orig_video_id
        entry["orig_category_id"] = orig_category_id
        remapped_videos.append(entry)
    gt_json["videos"] = remapped_videos

    return gt_json, dt_json
def remap_gt_dt_class_agnostic(gt, dt):
    """
    For class-agnostic HOTA, merge all GT tracks for each video (across NPs),
    ensure unique track_ids, and set all category_id to 1.
    Also, add orig_video_id and orig_category_id for compatibility.

    Both `gt` and `dt` are modified in place and returned.
    """
    # Group GT annotations by video so track ids can be remapped per video.
    gt_anns_by_video = defaultdict(list)
    for ann in gt["annotations"]:
        gt_anns_by_video[ann["video_id"]].append(ann)

    # Record each video's first annotation's *original* category_id before it
    # is overwritten below. (The previous implementation read category_id
    # after the overwrite, so "orig_category_id" was always 1.)
    orig_cat_by_video = {
        video_id: anns[0]["category_id"]
        for video_id, anns in gt_anns_by_video.items()
    }

    # 1. Remap all GT track_ids to be unique across all videos, and make the
    #    annotations class-agnostic (single category id 1).
    next_tid = 1
    for anns in gt_anns_by_video.values():
        # Map old track_ids to new unique ones (scoped to this video).
        old_to_new_tid = {}
        for ann in anns:
            old_tid = ann["id"]
            if old_tid not in old_to_new_tid:
                old_to_new_tid[old_tid] = next_tid
                next_tid += 1
            ann["id"] = old_to_new_tid[old_tid]
            # Set category_id to 1 for class-agnostic
            ann["category_id"] = 1

    # 2. Collapse the category list to a single class-agnostic category.
    gt["categories"] = [
        {
            "supercategory": "object",
            "id": 1,
            "name": "_REMAPPED_FOR_PHRASE_METRICS_",
        }
    ]

    # 3. Add orig_video_id / orig_category_id to each video (for compatibility
    #    with per-video result extraction) and give each video a unique
    #    file-name prefix as required by the TrackEval YT-VIS toolkit.
    for video in gt["videos"]:
        video["orig_video_id"] = video["id"]
        # First annotation's original category_id if the video has GT, else None.
        video["orig_category_id"] = orig_cat_by_video.get(video["id"])
        video["file_names"] = [
            f"remapped_vid_{video['id']:012d}/{name}" for name in video["file_names"]
        ]

    # 4. Make all predictions class-agnostic as well.
    for d in dt:
        d["category_id"] = 1
    return gt, dt
def _fill_in_ann_height_width(gt_json):
"""Fill in missing height/width in GT annotations from its video info."""
video_id_to_video = {v["id"]: v for v in gt_json["videos"]}
for ann in gt_json["annotations"]:
if "height" not in ann or "width" not in ann:
video = video_id_to_video[ann["video_id"]]
if "height" not in ann:
ann["height"] = video["height"]
if "width" not in ann:
ann["width"] = video["width"]
return gt_json

View File

@@ -0,0 +1,5 @@
# fmt: off
# flake8: noqa
from . import config, datasets, metrics, utils
from .eval import Evaluator

View File

@@ -0,0 +1,69 @@
# fmt: off
# flake8: noqa
import inspect
from functools import wraps
from time import perf_counter
# When False, the `time` decorator below is a no-op passthrough.
DO_TIMING = False
# When True, per-call timing output for methods (first arg "self") is suppressed.
DISPLAY_LESS_PROGRESS = False
# Accumulated wall-clock seconds keyed by function / "Class.method" name.
timer_dict = {}
# Running index for the printed numbering of timed free-function calls.
counter = 0
def time(f):
    """Decorator that wraps *f* with wall-clock timing and progress reporting.

    When the module-level ``DO_TIMING`` flag is False, ``f`` is called directly
    with no overhead. Otherwise each call is timed with ``perf_counter`` and
    accumulated into ``timer_dict`` under the function (or ``Class.method``)
    name; a summary table is printed once ``Evaluator.evaluate`` returns.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if DO_TIMING:
            # Run function with timing
            ts = perf_counter()
            result = f(*args, **kw)
            te = perf_counter()
            tt = te - ts
            # Get function name
            # NOTE(review): assumes f declares at least one positional
            # parameter; a zero-arg function would raise IndexError below.
            arg_names = inspect.getfullargspec(f)[0]
            if arg_names[0] == "self" and DISPLAY_LESS_PROGRESS:
                # Quiet mode: skip per-call reporting for methods.
                return result
            elif arg_names[0] == "self":
                method_name = type(args[0]).__name__ + "." + f.__name__
            else:
                method_name = f.__name__
            # Record accumulative time in each function for analysis
            if method_name in timer_dict.keys():
                timer_dict[method_name] += tt
            else:
                timer_dict[method_name] = tt
            # If code is finished, display timing summary
            if method_name == "Evaluator.evaluate":
                print("")
                print("Timing analysis:")
                for key, value in timer_dict.items():
                    print("%-70s %2.4f sec" % (key, value))
            else:
                # Get function argument values for printing special arguments of interest
                arg_titles = ["tracker", "seq", "cls"]
                arg_vals = []
                for i, a in enumerate(arg_names):
                    if a in arg_titles:
                        arg_vals.append(args[i])
                arg_text = "(" + ", ".join(arg_vals) + ")"
                # Display methods and functions with different indentation.
                if arg_names[0] == "self":
                    print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
                elif arg_names[0] == "test":
                    pass
                else:
                    global counter
                    counter += 1
                    print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))
            return result
        else:
            # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is true, run functions normally without timing.
            return f(*args, **kw)

    return wrap

View File

@@ -0,0 +1,153 @@
# fmt: off
# flake8: noqa
"""Config."""
import argparse
import os
def parse_configs():
    """Parse the command line into (eval, dataset, metrics) config dicts."""
    default_eval_config = get_default_eval_config()
    default_eval_config["DISPLAY_LESS_PROGRESS"] = True
    default_dataset_config = get_default_dataset_config()
    default_metrics_config = {"METRICS": ["TETA"]}
    # Merged view of all defaults; command-line values override entries below.
    config = {
        **default_eval_config,
        **default_dataset_config,
        **default_metrics_config,
    }
    parser = argparse.ArgumentParser()
    for setting, default in config.items():
        # List-valued (or unset) options accept one or more values.
        if type(default) is list or default is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    cli_values = vars(parser.parse_args())
    for setting, value in cli_values.items():
        if value is None:
            continue  # option not supplied on the command line
        default = config[setting]
        if type(default) is bool:
            if value == "True":
                parsed = True
            elif value == "False":
                parsed = False
            else:
                raise Exception(
                    f"Command line parameter {setting} must be True/False"
                )
        elif type(default) is int:
            parsed = int(value)
        elif value is None:
            # Unreachable (value is known non-None here); kept for parity
            # with the original control flow.
            parsed = None
        else:
            parsed = value
        config[setting] = parsed
    # Split the merged config back into its three sections.
    eval_config = {k: v for k, v in config.items() if k in default_eval_config}
    dataset_config = {k: v for k, v in config.items() if k in default_dataset_config}
    metrics_config = {k: v for k, v in config.items() if k in default_metrics_config}
    return eval_config, dataset_config, metrics_config
def get_default_eval_config():
    """Returns the default config values for evaluation."""
    code_path = get_code_path()
    default_config = {
        "USE_PARALLEL": True,
        "NUM_PARALLEL_CORES": 8,
        # Raise on evaluation errors instead of continuing.
        "BREAK_ON_ERROR": True,
        "RETURN_ON_ERROR": False,
        "LOG_ON_ERROR": os.path.join(code_path, "error_log.txt"),
        "PRINT_RESULTS": True,
        "PRINT_ONLY_COMBINED": True,
        "PRINT_CONFIG": True,
        "TIME_PROGRESS": True,
        "DISPLAY_LESS_PROGRESS": True,
        "OUTPUT_SUMMARY": True,
        "OUTPUT_EMPTY_CLASSES": True,
        # NOTE(review): callers (e.g. VideoTetaEvaluator) set the key
        # "OUTPUT_TEMP_RAW_DATA", while this default is spelled
        # "OUTPUT_TEM_RAW_DATA" -- confirm which spelling the evaluator reads.
        "OUTPUT_TEM_RAW_DATA": True,
        "OUTPUT_PER_SEQ_RES": True,
    }
    return default_config
def get_default_dataset_config():
    """Returns the default config values for the dataset (GT/tracker paths etc.)."""
    code_path = get_code_path()
    default_config = {
        "GT_FOLDER": os.path.join(
            code_path, "data/gt/tao/tao_training"
        ),  # Location of GT data
        "TRACKERS_FOLDER": os.path.join(
            code_path, "data/trackers/tao/tao_training"
        ),  # Trackers location
        "OUTPUT_FOLDER": None,  # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "TRACKERS_TO_EVAL": ['TETer'],  # Filenames of trackers to eval (if None, all in folder)
        "CLASSES_TO_EVAL": None,  # Classes to eval (if None, all classes)
        "SPLIT_TO_EVAL": "training",  # Valid: 'training', 'val'
        "PRINT_CONFIG": True,  # Whether to print current config
        "TRACKER_SUB_FOLDER": "data",  # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",  # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "TRACKER_DISPLAY_NAMES": None,  # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "MAX_DETECTIONS": 0,  # Number of maximal allowed detections per image (0 for unlimited)
        "USE_MASK": False,  # Whether to use mask data for evaluation
    }
    return default_config
def init_config(config, default_config, name=None):
    """Fill missing entries of *config* with defaults; print it when requested.

    A ``None`` config is replaced by *default_config* entirely. If *name* is
    given and the resulting config has ``PRINT_CONFIG`` set, the config is
    printed to stdout.
    """
    if config is None:
        config = default_config
    else:
        for key, default_value in default_config.items():
            config.setdefault(key, default_value)
    if name and config["PRINT_CONFIG"]:
        print("\n%s Config:" % name)
        for c in config:
            print("%-20s : %-30s" % (c, config[c]))
    return config
def update_config(config):
    """
    Parse the arguments of a script and updates the config values for a given value if specified in the arguments.

    Values are coerced to the type of the existing config entry: bool entries
    accept only the literal strings "True"/"False", int entries are parsed
    with `int()`, list-valued (or None) entries accept one or more values
    via ``nargs="+"``, and everything else is kept as a string.

    :param config: the config to update
    :return: the updated config (modified in place)
    """
    parser = argparse.ArgumentParser()
    for setting, default in config.items():
        # List-valued (or unset) options accept one or more values.
        if isinstance(default, list) or default is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = vars(parser.parse_args())
    for setting, value in args.items():
        if value is None:
            # Option not supplied on the command line: keep the existing value.
            continue
        default = config[setting]
        if type(default) is bool:
            if value == "True":
                x = True
            elif value == "False":
                x = False
            else:
                # Fixed message: the original string concatenation was
                # missing the space before "must".
                raise Exception(
                    f"Command line parameter {setting} must be True or False"
                )
        elif type(default) is int:
            x = int(value)
        else:
            # (The original also had an unreachable `type(value) is NoneType`
            # branch here; `value` is known non-None at this point.)
            x = value
        config[setting] = x
    return config
def get_code_path():
    """Return the absolute path of the package root (parent of this module's dir)."""
    this_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(this_dir, ".."))

View File

@@ -0,0 +1,5 @@
# fmt: off
# flake8: noqa
"""Datasets."""
from .coco import COCO
from .tao import TAO

View File

@@ -0,0 +1,379 @@
# fmt: off
# flake8: noqa
import csv
import io
import os
import traceback
import zipfile
from abc import ABC, abstractmethod
from copy import deepcopy
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseDataset(ABC):
@abstractmethod
def __init__(self):
    """Declare the attributes every concrete dataset must populate."""
    self.tracker_list = None  # tracker names to evaluate
    self.seq_list = None  # sequence names in this dataset split
    self.class_list = None  # class names to evaluate
    self.output_fol = None  # root folder for evaluation output
    self.output_sub_fol = None  # sub-folder under each tracker's output folder
    # presumably controls combining per-class results into an overall score
    # and super-category grouping -- confirm against the Evaluator. TODO
    self.should_classes_combine = True
    self.use_super_categories = False
# Functions to implement:
@abstractmethod
def _load_raw_file(self, tracker, seq, is_gt):
    """Load raw ground-truth (is_gt=True) or tracker (is_gt=False) data for one sequence."""
    ...
@_timing.time
@abstractmethod
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess `raw_data` (from get_raw_seq_data) for evaluating class `cls`."""
    ...
@abstractmethod
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
    """Compute pairwise similarity scores between GT and tracker detections of one timestep."""
    ...
# Helper functions for all datasets:
@classmethod
def get_class_name(cls):
    """Return the (sub)class's name."""
    return cls.__name__
def get_name(self):
    """Return this dataset's name (its concrete class name)."""
    return self.get_class_name()
def get_output_fol(self, tracker):
    """Return the output folder for *tracker*'s evaluation results."""
    path_parts = (self.output_fol, tracker, self.output_sub_fol)
    return os.path.join(*path_parts)
def get_display_name(self, tracker):
    """Can be overwritten if the trackers name (in files) is different to how it should be displayed.
    By default this method just returns the trackers name as is.
    """
    return tracker
def get_eval_info(self):
    """Return info about the dataset needed for the Evaluator:
    the (tracker_list, seq_list, class_list) tuple."""
    return self.tracker_list, self.seq_list, self.class_list
@_timing.time
def get_raw_seq_data(self, tracker, seq):
    """Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
    Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
    A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for
    the evaluation of each class.
    This returns a dict which contains the fields:
    [num_timesteps]: integer
    [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
    list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
    [similarity_scores]: list (for each timestep) of 2D NDArrays.
    [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).
    gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.
    Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
    independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
    masks vs 2D boxes vs 3D boxes).
    We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
    we don't wish to calculate this twice.
    We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
    calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
    """
    # Load raw data.
    raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
    raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
    # GT values win on key collisions (unpacked last).
    raw_data = {**raw_tracker_data, **raw_gt_data}  # Merges dictionaries
    # Calculate similarities for each timestep.
    similarity_scores = []
    for _, (gt_dets_t, tracker_dets_t) in enumerate(
        zip(raw_data["gt_dets"], raw_data["tk_dets"])
    ):
        ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
        similarity_scores.append(ious)
    raw_data["similarity_scores"] = similarity_scores
    return raw_data
@staticmethod
def _load_simple_text_file(
file,
time_col=0,
id_col=None,
remove_negative_ids=False,
valid_filter=None,
crowd_ignore_filter=None,
convert_filter=None,
is_zipped=False,
zip_file=None,
force_delimiters=None,
):
"""Function that loads data which is in a commonly used text file format.
Assumes each det is given by one row of a text file.
There is no limit to the number or meaning of each column,
however one column needs to give the timestep of each det (time_col) which is default col 0.
The file dialect (deliminator, num cols, etc) is determined automatically.
This function automatically separates dets by timestep,
and is much faster than alternatives such as np.loadtext or pandas.
If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
These are not excluded from ignore data.
valid_filter can be used to only include certain classes.
It is a dict with ints as keys, and lists as values,
such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
If None, all classes are included.
crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
convert_filter can be used to convert value read to another format.
This is used most commonly to convert classes given as string to a class id.
This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
Optionally, input files could be a zip of multiple text files for storage efficiency.
Returns read_data and ignore_data.
Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
Note that all data is returned as strings, and must be converted to float/int later if needed.
Note that timesteps will not be present in the returned dict keys if there are no dets for them
"""
if remove_negative_ids and id_col is None:
raise TrackEvalException(
"remove_negative_ids is True, but id_col is not given."
)
if crowd_ignore_filter is None:
crowd_ignore_filter = {}
if convert_filter is None:
convert_filter = {}
try:
if is_zipped: # Either open file directly or within a zip.
if zip_file is None:
raise TrackEvalException(
"is_zipped set to True, but no zip_file is given."
)
archive = zipfile.ZipFile(os.path.join(zip_file), "r")
fp = io.TextIOWrapper(archive.open(file, "r"))
else:
fp = open(file)
read_data = {}
crowd_ignore_data = {}
fp.seek(0, os.SEEK_END)
# check if file is empty
if fp.tell():
fp.seek(0)
dialect = csv.Sniffer().sniff(
fp.readline(), delimiters=force_delimiters
) # Auto determine structure.
dialect.skipinitialspace = (
True # Deal with extra spaces between columns
)
fp.seek(0)
reader = csv.reader(fp, dialect)
for row in reader:
try:
# Deal with extra trailing spaces at the end of rows
if row[-1] in "":
row = row[:-1]
timestep = str(int(float(row[time_col])))
# Read ignore regions separately.
is_ignored = False
for ignore_key, ignore_value in crowd_ignore_filter.items():
if row[ignore_key].lower() in ignore_value:
# Convert values in one column (e.g. string to id)
for (
convert_key,
convert_value,
) in convert_filter.items():
row[convert_key] = convert_value[
row[convert_key].lower()
]
# Save data separated by timestep.
if timestep in crowd_ignore_data.keys():
crowd_ignore_data[timestep].append(row)
else:
crowd_ignore_data[timestep] = [row]
is_ignored = True
if (
is_ignored
): # if det is an ignore region, it cannot be a normal det.
continue
# Exclude some dets if not valid.
if valid_filter is not None:
for key, value in valid_filter.items():
if row[key].lower() not in value:
continue
if remove_negative_ids:
if int(float(row[id_col])) < 0:
continue
# Convert values in one column (e.g. string to id)
for convert_key, convert_value in convert_filter.items():
row[convert_key] = convert_value[row[convert_key].lower()]
# Save data separated by timestep.
if timestep in read_data.keys():
read_data[timestep].append(row)
else:
read_data[timestep] = [row]
except Exception:
exc_str_init = (
"In file %s the following line cannot be read correctly: \n"
% os.path.basename(file)
)
exc_str = " ".join([exc_str_init] + row)
raise TrackEvalException(exc_str)
fp.close()
except Exception:
print("Error loading file: %s, printing traceback." % file)
traceback.print_exc()
raise TrackEvalException(
"File %s cannot be read because it is either not present or invalidly formatted"
% os.path.basename(file)
)
return read_data, crowd_ignore_data
@staticmethod
def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
"""Calculates the IOU (intersection over union) between two arrays of segmentation masks.
If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly
used to determine if detections are within crowd ignore region.
:param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
else pycocotools rle encoded format)
:param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
else pycocotools rle encoded format)
:param is_encoded: whether the input is in pycocotools rle encoded format
:param do_ioa: whether to perform IoA computation
:return: the IoU/IoA scores
"""
# Only loaded when run to reduce minimum requirements
from pycocotools import mask as mask_utils
# use pycocotools for run length encoding of masks
if not is_encoded:
masks1 = mask_utils.encode(
np.array(np.transpose(masks1, (1, 2, 0)), order="F")
)
masks2 = mask_utils.encode(
np.array(np.transpose(masks2, (1, 2, 0)), order="F")
)
# use pycocotools for iou computation of rle encoded masks
ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
if len(masks1) == 0 or len(masks2) == 0:
ious = np.asarray(ious).reshape(len(masks1), len(masks2))
assert (ious >= 0 - np.finfo("float").eps).all()
assert (ious <= 1 + np.finfo("float").eps).all()
return ious
@staticmethod
def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
"""Calculates the IOU (intersection over union) between two arrays of boxes.
Allows variable box formats ('xywh' and 'x0y0x1y1').
If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
used to determine if detections are within crowd ignore region.
"""
if box_format in "xywh":
# layout: (x0, y0, w, h)
bboxes1 = deepcopy(bboxes1)
bboxes2 = deepcopy(bboxes2)
bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
elif box_format not in "x0y0x1y1":
raise (TrackEvalException("box_format %s is not implemented" % box_format))
# layout: (x0, y0, x1, y1)
min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
min_[..., 3] - max_[..., 1], 0
)
area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
bboxes1[..., 3] - bboxes1[..., 1]
)
if do_ioa:
ioas = np.zeros_like(intersection)
valid_mask = area1 > 0 + np.finfo("float").eps
ioas[valid_mask, :] = (
intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
)
return ioas
else:
area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
bboxes2[..., 3] - bboxes2[..., 1]
)
union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
intersection[union <= 0 + np.finfo("float").eps] = 0
union[union <= 0 + np.finfo("float").eps] = 1
ious = intersection / union
return ious
@staticmethod
def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
"""Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
threshold corresponds to a 1m distance threshold for TPs.
"""
dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
sim = np.maximum(0, 1 - dist / zero_distance)
return sim
@staticmethod
def _check_unique_ids(data, after_preproc=False):
"""Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
gt_ids = data["gt_ids"]
tracker_ids = data["tk_ids"]
for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
if len(tracker_ids_t) > 0:
unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
if np.max(counts) != 1:
duplicate_ids = unique_ids[counts > 1]
exc_str_init = (
"Tracker predicts the same ID more than once in a single timestep "
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
)
exc_str = (
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
)
if after_preproc:
exc_str_init += (
"\n Note that this error occurred after preprocessing (but not before), "
"so ids may not be as in file, and something seems wrong with preproc."
)
raise TrackEvalException(exc_str)
if len(gt_ids_t) > 0:
unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
if np.max(counts) != 1:
duplicate_ids = unique_ids[counts > 1]
exc_str_init = (
"Ground-truth has the same ID more than once in a single timestep "
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
)
exc_str = (
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
)
if after_preproc:
exc_str_init += (
"\n Note that this error occurred after preprocessing (but not before), "
"so ids may not be as in file, and something seems wrong with preproc."
)
raise TrackEvalException(exc_str)

View File

@@ -0,0 +1,637 @@
# fmt: off
# flake8: noqa
"""COCO Dataset."""
import copy
import itertools
import json
import os
from collections import defaultdict
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing, utils
from ..config import get_default_dataset_config, init_config
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class COCO(_BaseDataset):
"""Tracking datasets in COCO format."""
def __init__(self, config=None):
"""Initialize dataset, checking that all required files are present."""
super().__init__()
# Fill non-given config values with defaults
self.config = init_config(config, get_default_dataset_config(), self.get_name())
self.gt_fol = self.config["GT_FOLDER"]
self.tracker_fol = self.config["TRACKERS_FOLDER"]
self.should_classes_combine = True
self.use_super_categories = False
self.use_mask = self.config["USE_MASK"]
self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
self.output_fol = self.config["OUTPUT_FOLDER"]
if self.output_fol is None:
self.output_fol = self.tracker_fol
self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
if self.gt_fol.endswith(".json"):
self.gt_data = json.load(open(self.gt_fol, "r"))
else:
gt_dir_files = [
file for file in os.listdir(self.gt_fol) if file.endswith(".json")
]
if len(gt_dir_files) != 1:
raise TrackEvalException(
f"{self.gt_fol} does not contain exactly one json file."
)
with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
self.gt_data = json.load(f)
# fill missing video ids
self._fill_video_ids_inplace(self.gt_data["annotations"])
# get sequences to eval and sequence information
self.seq_list = [
vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
]
self.seq_name2seqid = {
vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
}
# compute mappings from videos to annotation data
self.video2gt_track, self.video2gt_image = self._compute_vid_mappings(
self.gt_data["annotations"]
)
# compute sequence lengths
self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
for img in self.gt_data["images"]:
self.seq_lengths[img["video_id"]] += 1
self.seq2images2timestep = self._compute_image_to_timestep_mappings()
self.seq2cls = {
vid["id"]: {
"pos_cat_ids": list(
{track["category_id"] for track in self.video2gt_track[vid["id"]]}
),
}
for vid in self.gt_data["videos"]
}
# Get classes to eval
considered_vid_ids = [self.seq_name2seqid[vid] for vid in self.seq_list]
seen_cats = set(
[
cat_id
for vid_id in considered_vid_ids
for cat_id in self.seq2cls[vid_id]["pos_cat_ids"]
]
)
# only classes with ground truth are evaluated in TAO
self.valid_classes = [
cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
]
cls_name2clsid_map = {
cls["name"]: cls["id"] for cls in self.gt_data["categories"]
}
if self.config["CLASSES_TO_EVAL"]:
self.class_list = [
cls.lower() if cls.lower() in self.valid_classes else None
for cls in self.config["CLASSES_TO_EVAL"]
]
if not all(self.class_list):
valid_cls = ", ".join(self.valid_classes)
raise TrackEvalException(
"Attempted to evaluate an invalid class. Only classes "
f"{valid_cls} are valid (classes present in ground truth"
" data)."
)
else:
self.class_list = [cls for cls in self.valid_classes]
self.cls_name2clsid = {
k: v for k, v in cls_name2clsid_map.items() if k in self.class_list
}
self.clsid2cls_name = {
v: k for k, v in cls_name2clsid_map.items() if k in self.class_list
}
# get trackers to eval
if self.config["TRACKERS_TO_EVAL"] is None:
self.tracker_list = os.listdir(self.tracker_fol)
else:
self.tracker_list = self.config["TRACKERS_TO_EVAL"]
if self.config["TRACKER_DISPLAY_NAMES"] is None:
self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
len(self.config["TK_DISPLAY_NAMES"]) == len(self.tracker_list)
):
self.tracker_to_disp = dict(
zip(self.tracker_list, self.config["TK_DISPLAY_NAMES"])
)
else:
raise TrackEvalException(
"List of tracker files and tracker display names do not match."
)
self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
for tracker in self.tracker_list:
if self.tracker_sub_fol.endswith(".json"):
with open(os.path.join(self.tracker_sub_fol)) as f:
curr_data = json.load(f)
else:
tr_dir = os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
tr_dir_files = [
file for file in os.listdir(tr_dir) if file.endswith(".json")
]
if len(tr_dir_files) != 1:
raise TrackEvalException(
f"{tr_dir} does not contain exactly one json file."
)
with open(os.path.join(tr_dir, tr_dir_files[0])) as f:
curr_data = json.load(f)
# limit detections if MAX_DETECTIONS > 0
if self.config["MAX_DETECTIONS"]:
curr_data = self._limit_dets_per_image(curr_data)
# fill missing video ids
self._fill_video_ids_inplace(curr_data)
# make track ids unique over whole evaluation set
self._make_tk_ids_unique(curr_data)
# get tracker sequence information
curr_vids2tracks, curr_vids2images = self._compute_vid_mappings(curr_data)
self.tracker_data[tracker]["vids_to_tracks"] = curr_vids2tracks
self.tracker_data[tracker]["vids_to_images"] = curr_vids2images
def get_display_name(self, tracker):
return self.tracker_to_disp[tracker]
    def _load_raw_file(self, tracker, seq, is_gt):
        """Load a file (gt or tracker) in the TAO format
        If is_gt, this returns a dict which contains the fields:
        [gt_ids, gt_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets]: list (for each timestep) of lists of detections.
        if not is_gt, this returns a dict which contains the fields:
        [tk_ids, tk_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [tk_dets]: list (for each timestep) of lists of detections.
        Also sets "num_timesteps" and "seq" on the returned dict.
        """
        seq_id = self.seq_name2seqid[seq]
        # file location: pick the per-video image/annotation groups computed in
        # __init__ (GT side or this tracker's side).
        if is_gt:
            imgs = self.video2gt_image[seq_id]
        else:
            imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
        # convert data to required format
        num_timesteps = self.seq_lengths[seq_id]
        img_to_timestep = self.seq2images2timestep[seq_id]
        data_keys = ["ids", "classes", "dets"]
        # if not is_gt:
        #     data_keys += ["tk_confidences"]
        raw_data = {key: [None] * num_timesteps for key in data_keys}
        for img in imgs:
            # some tracker data contains images without any ground truth info,
            # these are ignored
            if img["id"] not in img_to_timestep:
                continue
            t = img_to_timestep[img["id"]]
            anns = img["annotations"]
            # NOTE(review): assumes every image entry has at least one
            # annotation — anns[0] would raise IndexError otherwise; the
            # grouping in _compute_vid_mappings only creates entries with at
            # least one annotation, so this seems to hold. TODO confirm.
            tk_str = utils.get_track_id_str(anns[0])
            if self.use_mask:
                # When using mask, extract segmentation data
                raw_data["dets"][t] = [ann.get("segmentation") for ann in anns]
            else:
                # When using bbox, extract bbox data
                raw_data["dets"][t] = np.atleast_2d([ann["bbox"] for ann in anns]).astype(
                    float
                )
            raw_data["ids"][t] = np.atleast_1d([ann[tk_str] for ann in anns]).astype(
                int
            )
            raw_data["classes"][t] = np.atleast_1d(
                [ann["category_id"] for ann in anns]
            ).astype(int)
            # if not is_gt:
            #     raw_data["tk_confidences"][t] = np.atleast_1d(
            #         [ann["score"] for ann in anns]
            #     ).astype(float)
        # Backfill timesteps with no detections with empty arrays.
        # NOTE(review): the (0, 4) bbox-shaped placeholder is used even in mask
        # mode, where populated timesteps hold lists of segmentations — verify
        # downstream consumers handle this mixed representation.
        for t, d in enumerate(raw_data["dets"]):
            if d is None:
                raw_data["dets"][t] = np.empty((0, 4)).astype(float)
                raw_data["ids"][t] = np.empty(0).astype(int)
                raw_data["classes"][t] = np.empty(0).astype(int)
            # if not is_gt:
            #     raw_data["tk_confidences"][t] = np.empty(0)
        # Rename the generic keys to gt_*/tk_* depending on the source.
        if is_gt:
            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
        else:
            key_map = {"ids": "tk_ids", "classes": "tk_classes", "dets": "tk_dets"}
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)
        raw_data["num_timesteps"] = num_timesteps
        raw_data["seq"] = seq
        return raw_data
    def get_preprocessed_seq_data_thr(self, raw_data, cls, assignment=None):
        """Preprocess data for a single sequence for a single class.
        Inputs:
            raw_data: dict containing the data for the sequence already
                read in by get_raw_seq_data().
            cls: class to be evaluated.
            assignment: optional per-timestep dict mapping gt ids to the
                tracker ids they were assigned to; used to drop tracker ids
                already claimed by GT of other classes.
        Outputs:
            gt_ids:
                list (for each timestep) of ids of GT tracks
            tk_ids:
                list (for each timestep) of ids of predicted tracks (all for TP
                matching (Det + AssocA))
            tk_overlap_ids:
                list (for each timestep) of ids of predicted tracks that overlap
                with GTs
            tk_dets:
                list (for each timestep) of lists of detections that
                corresponding to the tk_ids
            tk_classes:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            tk_confidences:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            sim_scores:
                similarity score between gt_ids and tk_ids.
        """
        if cls != "all":
            cls_id = self.cls_name2clsid[cls]
        data_keys = [
            "gt_ids",
            "tk_ids",
            "gt_id_map",
            "tk_id_map",
            "gt_dets",
            "gt_classes",
            "gt_class_name",
            "tk_overlap_classes",
            "tk_overlap_ids",
            "tk_class_eval_tk_ids",
            "tk_dets",
            "tk_classes",
            # "tk_confidences",
            "tk_exh_ids",
            "sim_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tk_ids = []
        num_gt_dets = 0
        num_tk_cls_dets = 0
        num_tk_overlap_dets = 0
        # IoU threshold above which a predicted track counts as overlapping a GT.
        overlap_ious_thr = 0.5
        loc_and_asso_tk_ids = []
        exh_class_tk_ids = []
        # First pass: collect, per timestep, the tracker ids that overlap GT of
        # this class and those predicted as exactly this class.
        for t in range(raw_data["num_timesteps"]):
            # only extract relevant dets for this class for preproc and eval
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            # select GT that is not in the evaluating classes
            if assignment is not None and assignment:
                all_gt_ids = list(assignment[t].keys())
                gt_ids_in = raw_data["gt_ids"][t][gt_class_mask]
                gt_ids_out = set(all_gt_ids) - set(gt_ids_in)
                tk_ids_out = set([assignment[t][key] for key in list(gt_ids_out)])
            # compute overlapped tracks and add their ids to overlap_tk_ids
            # NOTE(review): sim_scores is simply raw_data["similarity_scores"]
            # (re-bound every iteration); the second loop below reuses the last
            # binding, which is the same object.
            sim_scores = raw_data["similarity_scores"]
            overlap_ids_masks = (sim_scores[t][gt_class_mask] >= overlap_ious_thr).any(
                axis=0
            )
            overlap_tk_ids_t = raw_data["tk_ids"][t][overlap_ids_masks]
            if assignment is not None and assignment:
                # drop tracker ids already assigned to GT of other classes
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t) - tk_ids_out)
            else:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t))
            loc_and_asso_tk_ids += data["tk_overlap_ids"][t]
            data["tk_exh_ids"][t] = []
            if cls == "all":
                continue
            # add the track ids of exclusive annotated class to exh_class_tk_ids
            tk_exh_mask = np.atleast_1d(raw_data["tk_classes"][t] == cls_id)
            tk_exh_mask = tk_exh_mask.astype(bool)
            exh_class_tk_ids_t = raw_data["tk_ids"][t][tk_exh_mask]
            exh_class_tk_ids.append(exh_class_tk_ids_t)
            data["tk_exh_ids"][t] = exh_class_tk_ids_t
        # remove tk_ids that has been assigned to GT belongs to other classes.
        loc_and_asso_tk_ids = list(set(loc_and_asso_tk_ids))
        # remove all unwanted unmatched tracker detections
        for t in range(raw_data["num_timesteps"]):
            # add gt to the data
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            # NOTE(review): the next two lines reference cls_id unconditionally,
            # which would raise NameError when cls == "all" (cls_id is only
            # bound for named classes) — confirm "all" is never passed here.
            data["gt_classes"][t] = cls_id
            data["gt_class_name"][t] = cls
            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            if self.use_mask:
                gt_dets = [raw_data['gt_dets'][t][ind] for ind in range(len(gt_class_mask)) if gt_class_mask[ind]]
            else:
                gt_dets = raw_data["gt_dets"][t][gt_class_mask]
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets
            # filter pred and only keep those that highly overlap with GTs
            tk_mask = np.isin(
                raw_data["tk_ids"][t], np.array(loc_and_asso_tk_ids), assume_unique=True
            )
            tk_overlap_mask = np.isin(
                raw_data["tk_ids"][t],
                np.array(data["tk_overlap_ids"][t]),
                assume_unique=True,
            )
            tk_ids = raw_data["tk_ids"][t][tk_mask]
            if self.use_mask:
                tk_dets = [raw_data['tk_dets'][t][ind] for ind in range(len(tk_mask)) if
                           tk_mask[ind]]
            else:
                tk_dets = raw_data["tk_dets"][t][tk_mask]
            tracker_classes = raw_data["tk_classes"][t][tk_mask]
            # add overlap classes for computing the FP for Cls term
            tracker_overlap_classes = raw_data["tk_classes"][t][tk_overlap_mask]
            # tracker_confidences = raw_data["tk_confidences"][t][tk_mask]
            sim_scores_masked = sim_scores[t][gt_class_mask, :][:, tk_mask]
            # add filtered prediction to the data
            data["tk_classes"][t] = tracker_classes
            data["tk_overlap_classes"][t] = tracker_overlap_classes
            data["tk_ids"][t] = tk_ids
            data["tk_dets"][t] = tk_dets
            # data["tk_confidences"][t] = tracker_confidences
            data["sim_scores"][t] = sim_scores_masked
            data["tk_class_eval_tk_ids"][t] = set(
                list(data["tk_overlap_ids"][t]) + list(data["tk_exh_ids"][t])
            )
            # count total number of detections
            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            # the unique track ids are for association.
            unique_tk_ids += list(np.unique(data["tk_ids"][t]))
            num_tk_overlap_dets += len(data["tk_overlap_ids"][t])
            num_tk_cls_dets += len(data["tk_class_eval_tk_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])
        # re-label IDs such that there are no empty IDs
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            # dense relabelling: original id -> position in sorted unique ids
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            # keep the inverse mapping (new id -> original id) for reporting
            data["gt_id_map"] = {}
            for gt_id in unique_gt_ids:
                new_gt_id = gt_id_map[gt_id].astype(int)
                data["gt_id_map"][new_gt_id] = gt_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
        if len(unique_tk_ids) > 0:
            unique_tk_ids = np.unique(unique_tk_ids)
            tk_id_map = np.nan * np.ones((np.max(unique_tk_ids) + 1))
            tk_id_map[unique_tk_ids] = np.arange(len(unique_tk_ids))
            data["tk_id_map"] = {}
            for track_id in unique_tk_ids:
                new_track_id = tk_id_map[track_id].astype(int)
                data["tk_id_map"][new_track_id] = track_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["tk_ids"][t]) > 0:
                    data["tk_ids"][t] = tk_id_map[data["tk_ids"][t]].astype(int)
                if len(data["tk_overlap_ids"][t]) > 0:
                    data["tk_overlap_ids"][t] = tk_id_map[
                        data["tk_overlap_ids"][t]
                    ].astype(int)
        # record overview statistics.
        data["num_tk_cls_dets"] = num_tk_cls_dets
        data["num_tk_overlap_dets"] = num_tk_overlap_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tk_ids"] = len(unique_tk_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]
        self._check_unique_ids(data)
        return data
@_timing.time
def get_preprocessed_seq_data(
self, raw_data, cls, assignment=None, thresholds=[50, 75]
):
"""Preprocess data for a single sequence for a single class."""
data = {}
if thresholds is None:
thresholds = [50, 75]
elif isinstance(thresholds, int):
thresholds = [thresholds]
for thr in thresholds:
assignment_thr = None
if assignment is not None:
assignment_thr = assignment[thr]
data[thr] = self.get_preprocessed_seq_data_thr(
raw_data, cls, assignment_thr
)
return data
def _calculate_similarities(self, gt_dets_t, tk_dets_t):
"""Compute similarity scores."""
if self.use_mask:
similarity_scores = self._calculate_mask_ious(gt_dets_t, tk_dets_t, is_encoded=True, do_ioa=False)
else:
similarity_scores = self._calculate_box_ious(gt_dets_t, tk_dets_t)
return similarity_scores
def _compute_vid_mappings(self, annotations):
"""Computes mappings from videos to corresponding tracks and images."""
vids_to_tracks = {}
vids_to_imgs = {}
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
# compute an mapping from image IDs to images
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
tk_str = utils.get_track_id_str(annotations[0])
for ann in annotations:
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
vid = ann["video_id"]
if ann["video_id"] not in vids_to_tracks.keys():
vids_to_tracks[ann["video_id"]] = list()
if ann["video_id"] not in vids_to_imgs.keys():
vids_to_imgs[ann["video_id"]] = list()
# fill in vids_to_tracks
tid = ann[tk_str]
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
try:
index1 = exist_tids.index(tid)
except ValueError:
index1 = -1
if tid not in exist_tids:
curr_track = {
"id": tid,
"category_id": ann["category_id"],
"video_id": vid,
"annotations": [ann],
}
vids_to_tracks[vid].append(curr_track)
else:
vids_to_tracks[vid][index1]["annotations"].append(ann)
# fill in vids_to_imgs
img_id = ann["image_id"]
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
try:
index2 = exist_img_ids.index(img_id)
except ValueError:
index2 = -1
if index2 == -1:
curr_img = {"id": img_id, "annotations": [ann]}
vids_to_imgs[vid].append(curr_img)
else:
vids_to_imgs[vid][index2]["annotations"].append(ann)
# sort annotations by frame index and compute track area
for vid, tracks in vids_to_tracks.items():
for track in tracks:
track["annotations"] = sorted(
track["annotations"],
key=lambda x: images[x["image_id"]]["frame_id"],
)
# compute average area
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
track["annotations"]
)
# ensure all videos are present
for vid_id in vid_ids:
if vid_id not in vids_to_tracks.keys():
vids_to_tracks[vid_id] = []
if vid_id not in vids_to_imgs.keys():
vids_to_imgs[vid_id] = []
return vids_to_tracks, vids_to_imgs
def _compute_image_to_timestep_mappings(self):
"""Computes a mapping from images to timestep in sequence."""
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
for vid in seq_to_imgs_to_timestep:
curr_imgs = [img["id"] for img in self.video2gt_image[vid]]
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_id"])
seq_to_imgs_to_timestep[vid] = {
curr_imgs[i]: i for i in range(len(curr_imgs))
}
return seq_to_imgs_to_timestep
def _limit_dets_per_image(self, annotations):
"""Limits the number of detections for each image.
Adapted from https://github.com/TAO-Dataset/.
"""
max_dets = self.config["MAX_DETECTIONS"]
img_ann = defaultdict(list)
for ann in annotations:
img_ann[ann["image_id"]].append(ann)
for img_id, _anns in img_ann.items():
if len(_anns) <= max_dets:
continue
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
img_ann[img_id] = _anns[:max_dets]
return [ann for anns in img_ann.values() for ann in anns]
def _fill_video_ids_inplace(self, annotations):
"""Fills in missing video IDs inplace.
Adapted from https://github.com/TAO-Dataset/.
"""
missing_video_id = [x for x in annotations if "video_id" not in x]
if missing_video_id:
image_id_to_video_id = {
x["id"]: x["video_id"] for x in self.gt_data["images"]
}
for x in missing_video_id:
x["video_id"] = image_id_to_video_id[x["image_id"]]
@staticmethod
def _make_tk_ids_unique(annotations):
"""Makes track IDs unqiue over the whole annotation set.
Adapted from https://github.com/TAO-Dataset/.
"""
track_id_videos = {}
track_ids_to_update = set()
max_track_id = 0
tk_str = utils.get_track_id_str(annotations[0])
for ann in annotations:
t = int(ann[tk_str])
if t not in track_id_videos:
track_id_videos[t] = ann["video_id"]
if ann["video_id"] != track_id_videos[t]:
# track id is assigned to multiple videos
track_ids_to_update.add(t)
max_track_id = max(max_track_id, t)
if track_ids_to_update:
print("true")
next_id = itertools.count(max_track_id + 1)
new_tk_ids = defaultdict(lambda: next(next_id))
for ann in annotations:
t = ann[tk_str]
v = ann["video_id"]
if t in track_ids_to_update:
ann[tk_str] = new_tk_ids[t, v]
return len(track_ids_to_update)

View File

@@ -0,0 +1,659 @@
# fmt: off
# flake8: noqa
"""TAO Dataset."""
import copy
import itertools
import json
import os
from collections import defaultdict
import numpy as np
from .. import _timing
from ..config import get_default_dataset_config, init_config
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class TAO(_BaseDataset):
"""Dataset class for TAO tracking"""
def __init__(self, config=None):
"""Initialize dataset, checking that all required files are present."""
super().__init__()
# Fill non-given config values with defaults
self.config = init_config(config, get_default_dataset_config(), self.get_name())
self.gt_fol = self.config["GT_FOLDER"]
self.tracker_fol = self.config["TRACKERS_FOLDER"]
self.should_classes_combine = True
self.use_super_categories = False
self.use_mask = self.config["USE_MASK"]
self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
self.output_fol = self.config["OUTPUT_FOLDER"]
if self.output_fol is None:
self.output_fol = self.tracker_fol
self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
if self.gt_fol.endswith(".json"):
self.gt_data = json.load(open(self.gt_fol, "r"))
else:
gt_dir_files = [
file for file in os.listdir(self.gt_fol) if file.endswith(".json")
]
if len(gt_dir_files) != 1:
raise TrackEvalException(
f"{self.gt_fol} does not contain exactly one json file."
)
with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
self.gt_data = json.load(f)
# merge categories marked with a merged tag in TAO dataset
self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])
# get sequences to eval and sequence information
self.seq_list = [
vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
]
self.seq_name2seqid = {
vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
}
# compute mappings from videos to annotation data
self.video2gt_track, self.video2gt_image = self._compute_vid_mappings(
self.gt_data["annotations"]
)
# compute sequence lengths
self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
for img in self.gt_data["images"]:
self.seq_lengths[img["video_id"]] += 1
self.seq2images2timestep = self._compute_image_to_timestep_mappings()
self.seq2cls = {
vid["id"]: {
"pos_cat_ids": list(
{track["category_id"] for track in self.video2gt_track[vid["id"]]}
),
"neg_cat_ids": vid["neg_category_ids"],
"not_exh_labeled_cat_ids": vid["not_exhaustive_category_ids"],
}
for vid in self.gt_data["videos"]
}
# Get classes to eval
considered_vid_ids = [self.seq_name2seqid[vid] for vid in self.seq_list]
seen_cats = set(
[
cat_id
for vid_id in considered_vid_ids
for cat_id in self.seq2cls[vid_id]["pos_cat_ids"]
]
)
# only classes with ground truth are evaluated in TAO
self.valid_classes = [
cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
]
cls_name2clsid_map = {
cls["name"]: cls["id"] for cls in self.gt_data["categories"]
}
if self.config["CLASSES_TO_EVAL"]:
self.class_list = [
cls.lower() if cls.lower() in self.valid_classes else None
for cls in self.config["CLASSES_TO_EVAL"]
]
if not all(self.class_list):
valid_cls = ", ".join(self.valid_classes)
raise TrackEvalException(
"Attempted to evaluate an invalid class. Only classes "
f"{valid_cls} are valid (classes present in ground truth"
" data)."
)
else:
self.class_list = [cls for cls in self.valid_classes]
self.cls_name2clsid = {
k: v for k, v in cls_name2clsid_map.items() if k in self.class_list
}
self.clsid2cls_name = {
v: k for k, v in cls_name2clsid_map.items() if k in self.class_list
}
# get trackers to eval
print(self.config["TRACKERS_TO_EVAL"] )
if self.config["TRACKERS_TO_EVAL"] is None:
self.tracker_list = os.listdir(self.tracker_fol)
else:
self.tracker_list = self.config["TRACKERS_TO_EVAL"]
if self.config["TRACKER_DISPLAY_NAMES"] is None:
self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
len(self.config["TK_DISPLAY_NAMES"]) == len(self.tracker_list)
):
self.tracker_to_disp = dict(
zip(self.tracker_list, self.config["TK_DISPLAY_NAMES"])
)
else:
raise TrackEvalException(
"List of tracker files and tracker display names do not match."
)
self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
for tracker in self.tracker_list:
if self.tracker_sub_fol.endswith(".json"):
with open(os.path.join(self.tracker_sub_fol)) as f:
curr_data = json.load(f)
else:
tr_dir = os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
tr_dir_files = [
file for file in os.listdir(tr_dir) if file.endswith(".json")
]
if len(tr_dir_files) != 1:
raise TrackEvalException(
f"{tr_dir} does not contain exactly one json file."
)
with open(os.path.join(tr_dir, tr_dir_files[0])) as f:
curr_data = json.load(f)
# limit detections if MAX_DETECTIONS > 0
if self.config["MAX_DETECTIONS"]:
curr_data = self._limit_dets_per_image(curr_data)
# fill missing video ids
self._fill_video_ids_inplace(curr_data)
# make track ids unique over whole evaluation set
self._make_tk_ids_unique(curr_data)
# merge categories marked with a merged tag in TAO dataset
self._merge_categories(curr_data)
# get tracker sequence information
curr_vids2tracks, curr_vids2images = self._compute_vid_mappings(curr_data)
self.tracker_data[tracker]["vids_to_tracks"] = curr_vids2tracks
self.tracker_data[tracker]["vids_to_images"] = curr_vids2images
def get_display_name(self, tracker):
return self.tracker_to_disp[tracker]
    def _load_raw_file(self, tracker, seq, is_gt):
        """Load a file (gt or tracker) in the TAO format.

        If is_gt, this returns a dict which contains the fields:
        [gt_ids, gt_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets]: list (for each timestep) of lists of detections.
        if not is_gt, this returns a dict which contains the fields:
        [tk_ids, tk_classes, tk_confidences]:
            list (for each timestep) of 1D NDArrays (for each det).
        [tk_dets]: list (for each timestep) of lists of detections.

        Also attaches per-sequence bookkeeping: num_timesteps, the
        negative / not-exhaustively-labeled category id lists, and seq.
        """
        seq_id = self.seq_name2seqid[seq]
        # file location: GT images come from the GT index, tracker images
        # from the per-tracker mapping built in __init__
        if is_gt:
            imgs = self.video2gt_image[seq_id]
        else:
            imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
        # convert data to required format
        num_timesteps = self.seq_lengths[seq_id]
        img_to_timestep = self.seq2images2timestep[seq_id]
        data_keys = ["ids", "classes", "dets"]
        if not is_gt:
            data_keys += ["tk_confidences"]
        # one slot per timestep; slots stay None until an image fills them
        raw_data = {key: [None] * num_timesteps for key in data_keys}
        for img in imgs:
            # some tracker data contains images without any ground truth info,
            # these are ignored
            if img["id"] not in img_to_timestep:
                continue
            t = img_to_timestep[img["id"]]
            anns = img["annotations"]
            if self.use_mask:
                # When using mask, extract segmentation data
                raw_data["dets"][t] = [ann.get("segmentation") for ann in anns]
            else:
                # When using bbox, extract bbox data
                raw_data["dets"][t] = np.atleast_2d([ann["bbox"] for ann in anns]).astype(
                    float
                )
            raw_data["ids"][t] = np.atleast_1d(
                [ann["track_id"] for ann in anns]
            ).astype(int)
            raw_data["classes"][t] = np.atleast_1d(
                [ann["category_id"] for ann in anns]
            ).astype(int)
            if not is_gt:
                raw_data["tk_confidences"][t] = np.atleast_1d(
                    [ann["score"] for ann in anns]
                ).astype(float)
        # fill timesteps that had no annotations with empty arrays
        # NOTE(review): empty timesteps always get a (0, 4) bbox-shaped array,
        # even in mask mode — confirm downstream similarity code handles this
        for t, d in enumerate(raw_data["dets"]):
            if d is None:
                raw_data["dets"][t] = np.empty((0, 4)).astype(float)
                raw_data["ids"][t] = np.empty(0).astype(int)
                raw_data["classes"][t] = np.empty(0).astype(int)
                if not is_gt:
                    raw_data["tk_confidences"][t] = np.empty(0)
        # rename generic keys to gt_* / tk_* depending on the source
        if is_gt:
            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
        else:
            key_map = {"ids": "tk_ids", "classes": "tk_classes", "dets": "tk_dets"}
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)
        raw_data["num_timesteps"] = num_timesteps
        # per-sequence negative / not-exhaustively-annotated category lists
        raw_data["neg_cat_ids"] = self.seq2cls[seq_id]["neg_cat_ids"]
        raw_data["not_exh_labeled_cls"] = self.seq2cls[seq_id][
            "not_exh_labeled_cat_ids"
        ]
        raw_data["seq"] = seq
        return raw_data
    def get_preprocessed_seq_data_thr(self, raw_data, cls, assignment=None):
        """Preprocess data for a single sequence for a single class.

        Inputs:
            raw_data: dict containing the data for the sequence already
                read in by get_raw_seq_data().
            cls: class to be evaluated ("all" evaluates class-agnostically).
            assignment: optional per-timestep {gt_id: tk_id} global TP
                assignment; matched predictions belonging to other classes
                are excluded from this class's overlap set.
        Outputs:
            gt_ids:
                list (for each timestep) of ids of GT tracks
            tk_ids:
                list (for each timestep) of ids of predicted tracks (all for TP
                matching (Det + AssocA))
            tk_overlap_ids:
                list (for each timestep) of ids of predicted tracks that overlap
                with GTs
            tk_neg_ids:
                list (for each timestep) of ids of predicted tracks that with
                the class id on the negative list for the current sequence.
            tk_exh_ids:
                list (for each timestep) of ids of predicted tracks that do not
                overlap with existing GTs but have the class id on the
                exhaustive annotated class list for the current sequence.
            tk_dets:
                list (for each timestep) of lists of detections that
                corresponding to the tk_ids
            tk_classes:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            tk_confidences:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            sim_scores:
                similarity score between gt_ids and tk_ids.
        """
        if cls != "all":
            cls_id = self.cls_name2clsid[cls]
        data_keys = [
            "gt_ids",
            "tk_ids",
            "gt_id_map",
            "tk_id_map",
            "gt_dets",
            "gt_classes",
            "gt_class_name",
            "tk_overlap_classes",
            "tk_overlap_ids",
            "tk_neg_ids",
            "tk_exh_ids",
            "tk_class_eval_tk_ids",
            "tk_dets",
            "tk_classes",
            "tk_confidences",
            "sim_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tk_ids = []
        num_gt_dets = 0
        num_tk_cls_dets = 0
        num_tk_overlap_dets = 0
        # minimum IoU for a prediction to count as "overlapping" a GT
        overlap_ious_thr = 0.5
        loc_and_asso_tk_ids = []
        # First pass: collect, per timestep, the predicted track ids that
        # overlap a GT of this class (minus ids already assigned to GTs of
        # other classes when a global assignment is provided).
        for t in range(raw_data["num_timesteps"]):
            # only extract relevant dets for this class for preproc and eval
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            # select GT that is not in the evaluating classes
            if assignment is not None and assignment:
                all_gt_ids = list(assignment[t].keys())
                gt_ids_in = raw_data["gt_ids"][t][gt_class_mask]
                gt_ids_out = set(all_gt_ids) - set(gt_ids_in)
                tk_ids_out = set([assignment[t][key] for key in list(gt_ids_out)])
            # compute overlapped tracks and add their ids to overlap_tk_ids
            # NOTE(review): sim_scores is bound here and reused by the second
            # loop below; with num_timesteps == 0 it would be unbound — the
            # code assumes at least one timestep
            sim_scores = raw_data["similarity_scores"]
            overlap_ids_masks = (sim_scores[t][gt_class_mask] >= overlap_ious_thr).any(
                axis=0
            )
            overlap_tk_ids_t = raw_data["tk_ids"][t][overlap_ids_masks]
            if assignment is not None and assignment:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t) - tk_ids_out)
            else:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t))
            loc_and_asso_tk_ids += data["tk_overlap_ids"][t]
            # placeholders: negative / exhaustive id lists are not populated
            # in this implementation (kept for downstream key compatibility)
            data["tk_exh_ids"][t] = []
            data["tk_neg_ids"][t] = []
            if cls == "all":
                continue
        # remove tk_ids that has been assigned to GT belongs to other classes.
        loc_and_asso_tk_ids = list(set(loc_and_asso_tk_ids))
        # remove all unwanted unmatched tracker detections
        # Second pass: keep only GT of this class and the predictions whose
        # track id (anywhere in the sequence) overlapped this class's GT.
        for t in range(raw_data["num_timesteps"]):
            # add gt to the data
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            data["gt_classes"][t] = cls_id
            data["gt_class_name"][t] = cls
            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            if self.use_mask:
                gt_dets = [raw_data['gt_dets'][t][ind] for ind in range(len(gt_class_mask)) if gt_class_mask[ind]]
            else:
                gt_dets = raw_data["gt_dets"][t][gt_class_mask]
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets
            # filter pred and only keep those that highly overlap with GTs
            tk_mask = np.isin(
                raw_data["tk_ids"][t], np.array(loc_and_asso_tk_ids), assume_unique=True
            )
            tk_overlap_mask = np.isin(
                raw_data["tk_ids"][t],
                np.array(data["tk_overlap_ids"][t]),
                assume_unique=True,
            )
            tk_ids = raw_data["tk_ids"][t][tk_mask]
            if self.use_mask:
                tk_dets = [raw_data['tk_dets'][t][ind] for ind in range(len(tk_mask)) if
                           tk_mask[ind]]
            else:
                tk_dets = raw_data["tk_dets"][t][tk_mask]
            tracker_classes = raw_data["tk_classes"][t][tk_mask]
            # add overlap classes for computing the FP for Cls term
            tracker_overlap_classes = raw_data["tk_classes"][t][tk_overlap_mask]
            tracker_confidences = raw_data["tk_confidences"][t][tk_mask]
            sim_scores_masked = sim_scores[t][gt_class_mask, :][:, tk_mask]
            # add filtered prediction to the data
            data["tk_classes"][t] = tracker_classes
            data["tk_overlap_classes"][t] = tracker_overlap_classes
            data["tk_ids"][t] = tk_ids
            data["tk_dets"][t] = tk_dets
            data["tk_confidences"][t] = tracker_confidences
            data["sim_scores"][t] = sim_scores_masked
            # ids participating in the classification term evaluation
            data["tk_class_eval_tk_ids"][t] = set(
                list(data["tk_overlap_ids"][t])
                + list(data["tk_neg_ids"][t])
                + list(data["tk_exh_ids"][t])
            )
            # count total number of detections
            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            # the unique track ids are for association.
            unique_tk_ids += list(np.unique(data["tk_ids"][t]))
            num_tk_overlap_dets += len(data["tk_overlap_ids"][t])
            num_tk_cls_dets += len(data["tk_class_eval_tk_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])
        # re-label IDs such that there are no empty IDs
        # (gt_id_map / tk_id_map remember relabeled id -> original id)
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            data["gt_id_map"] = {}
            for gt_id in unique_gt_ids:
                new_gt_id = gt_id_map[gt_id].astype(int)
                data["gt_id_map"][new_gt_id] = gt_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
        if len(unique_tk_ids) > 0:
            unique_tk_ids = np.unique(unique_tk_ids)
            tk_id_map = np.nan * np.ones((np.max(unique_tk_ids) + 1))
            tk_id_map[unique_tk_ids] = np.arange(len(unique_tk_ids))
            data["tk_id_map"] = {}
            for track_id in unique_tk_ids:
                new_track_id = tk_id_map[track_id].astype(int)
                data["tk_id_map"][new_track_id] = track_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["tk_ids"][t]) > 0:
                    data["tk_ids"][t] = tk_id_map[data["tk_ids"][t]].astype(int)
                if len(data["tk_overlap_ids"][t]) > 0:
                    data["tk_overlap_ids"][t] = tk_id_map[
                        data["tk_overlap_ids"][t]
                    ].astype(int)
        # record overview statistics.
        data["num_tk_cls_dets"] = num_tk_cls_dets
        data["num_tk_overlap_dets"] = num_tk_overlap_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tk_ids"] = len(unique_tk_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]
        self._check_unique_ids(data)
        return data
@_timing.time
def get_preprocessed_seq_data(
self, raw_data, cls, assignment=None, thresholds=[50, 75]
):
"""Preprocess data for a single sequence for a single class."""
data = {}
if thresholds is None:
thresholds = [50]
elif isinstance(thresholds, int):
thresholds = [thresholds]
for thr in thresholds:
assignment_thr = None
if assignment is not None:
assignment_thr = assignment[thr]
data[thr] = self.get_preprocessed_seq_data_thr(
raw_data, cls, assignment_thr
)
return data
def _calculate_similarities(self, gt_dets_t, tk_dets_t):
"""Compute similarity scores."""
if self.use_mask:
similarity_scores = self._calculate_mask_ious(gt_dets_t, tk_dets_t, is_encoded=True, do_ioa=False)
else:
similarity_scores = self._calculate_box_ious(gt_dets_t, tk_dets_t)
return similarity_scores
def _merge_categories(self, annotations):
"""Merges categories with a merged tag.
Adapted from https://github.com/TAO-Dataset.
"""
merge_map = {}
for category in self.gt_data["categories"]:
if "merged" in category:
for to_merge in category["merged"]:
merge_map[to_merge["id"]] = category["id"]
for ann in annotations:
ann["category_id"] = merge_map.get(ann["category_id"], ann["category_id"])
def _compute_vid_mappings(self, annotations):
"""Computes mappings from videos to corresponding tracks and images."""
vids_to_tracks = {}
vids_to_imgs = {}
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
# compute an mapping from image IDs to images
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
for ann in annotations:
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
vid = ann["video_id"]
if ann["video_id"] not in vids_to_tracks.keys():
vids_to_tracks[ann["video_id"]] = list()
if ann["video_id"] not in vids_to_imgs.keys():
vids_to_imgs[ann["video_id"]] = list()
# fill in vids_to_tracks
tid = ann["track_id"]
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
try:
index1 = exist_tids.index(tid)
except ValueError:
index1 = -1
if tid not in exist_tids:
curr_track = {
"id": tid,
"category_id": ann["category_id"],
"video_id": vid,
"annotations": [ann],
}
vids_to_tracks[vid].append(curr_track)
else:
vids_to_tracks[vid][index1]["annotations"].append(ann)
# fill in vids_to_imgs
img_id = ann["image_id"]
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
try:
index2 = exist_img_ids.index(img_id)
except ValueError:
index2 = -1
if index2 == -1:
curr_img = {"id": img_id, "annotations": [ann]}
vids_to_imgs[vid].append(curr_img)
else:
vids_to_imgs[vid][index2]["annotations"].append(ann)
# sort annotations by frame index and compute track area
for vid, tracks in vids_to_tracks.items():
for track in tracks:
track["annotations"] = sorted(
track["annotations"],
key=lambda x: images[x["image_id"]]["frame_index"],
)
# compute average area
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
track["annotations"]
)
# ensure all videos are present
for vid_id in vid_ids:
if vid_id not in vids_to_tracks.keys():
vids_to_tracks[vid_id] = []
if vid_id not in vids_to_imgs.keys():
vids_to_imgs[vid_id] = []
return vids_to_tracks, vids_to_imgs
def _compute_image_to_timestep_mappings(self):
"""Computes a mapping from images to timestep in sequence."""
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
for vid in seq_to_imgs_to_timestep:
curr_imgs = [img["id"] for img in self.video2gt_image[vid]]
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_index"])
seq_to_imgs_to_timestep[vid] = {
curr_imgs[i]: i for i in range(len(curr_imgs))
}
return seq_to_imgs_to_timestep
def _limit_dets_per_image(self, annotations):
"""Limits the number of detections for each image.
Adapted from https://github.com/TAO-Dataset/.
"""
max_dets = self.config["MAX_DETECTIONS"]
img_ann = defaultdict(list)
for ann in annotations:
img_ann[ann["image_id"]].append(ann)
for img_id, _anns in img_ann.items():
if len(_anns) <= max_dets:
continue
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
img_ann[img_id] = _anns[:max_dets]
return [ann for anns in img_ann.values() for ann in anns]
def _fill_video_ids_inplace(self, annotations):
"""Fills in missing video IDs inplace.
Adapted from https://github.com/TAO-Dataset/.
"""
missing_video_id = [x for x in annotations if "video_id" not in x]
if missing_video_id:
image_id_to_video_id = {
x["id"]: x["video_id"] for x in self.gt_data["images"]
}
for x in missing_video_id:
x["video_id"] = image_id_to_video_id[x["image_id"]]
@staticmethod
def _make_tk_ids_unique(annotations):
"""Makes track IDs unqiue over the whole annotation set.
Adapted from https://github.com/TAO-Dataset/.
"""
track_id_videos = {}
track_ids_to_update = set()
max_track_id = 0
for ann in annotations:
t = ann["track_id"]
if t not in track_id_videos:
track_id_videos[t] = ann["video_id"]
if ann["video_id"] != track_id_videos[t]:
# track id is assigned to multiple videos
track_ids_to_update.add(t)
max_track_id = max(max_track_id, t)
if track_ids_to_update:
print("true")
next_id = itertools.count(max_track_id + 1)
new_tk_ids = defaultdict(lambda: next(next_id))
for ann in annotations:
t = ann["track_id"]
v = ann["video_id"]
if t in track_ids_to_update:
ann["track_id"] = new_tk_ids[t, v]
return len(track_ids_to_update)

View File

@@ -0,0 +1,275 @@
# fmt: off
# flake8: noqa
import copy
import os
import pickle
import time
import traceback
from functools import partial
from multiprocessing.pool import Pool
import numpy as np
from . import _timing, utils
from .config import get_default_eval_config, init_config
from .utils import TrackEvalException
class Evaluator:
    """Evaluator class for evaluating different metrics for each datasets."""

    def __init__(self, config=None):
        """Initialize the evaluator with a config file."""
        self.config = init_config(config, get_default_eval_config(), "Eval")
        # Only run timing analysis if not run in parallel.
        if self.config["TIME_PROGRESS"] and not self.config["USE_PARALLEL"]:
            _timing.DO_TIMING = True
            if self.config["DISPLAY_LESS_PROGRESS"]:
                _timing.DISPLAY_LESS_PROGRESS = True

    @_timing.time
    def evaluate(self, dataset_list, metrics_list):
        """Evaluate a set of metrics on a set of datasets.

        Returns:
            (output_res, output_msg): per-dataset results and per-tracker
            status messages, keyed by dataset name.
        """
        config = self.config
        metric_names = utils.validate_metrics_list(metrics_list)
        dataset_names = [dataset.get_name() for dataset in dataset_list]
        output_res = {}
        output_msg = {}
        for dataset, dname in zip(dataset_list, dataset_names):
            # Get dataset info about what to evaluate
            output_res[dname] = {}
            output_msg[dname] = {}
            tracker_list, seq_list, class_list = dataset.get_eval_info()
            print(
                f"\nEvaluating {len(tracker_list)} tracker(s) on "
                f"{len(seq_list)} sequence(s) for {len(class_list)} class(es)"
                f" on {dname} dataset using the following "
                f'metrics: {", ".join(metric_names)}\n'
            )
            # Evaluate each tracker
            for tracker in tracker_list:
                try:
                    output_res, output_msg = self.evaluate_tracker(
                        tracker,
                        dataset,
                        dname,
                        class_list,
                        metrics_list,
                        metric_names,
                        seq_list,
                        output_res,
                        output_msg,
                    )
                except Exception as err:
                    output_res[dname][tracker] = None
                    # isinstance (not type equality) so subclasses of
                    # TrackEvalException also surface their message
                    if isinstance(err, TrackEvalException):
                        output_msg[dname][tracker] = str(err)
                    else:
                        output_msg[dname][tracker] = "Unknown error occurred."
                    print("Tracker %s was unable to be evaluated." % tracker)
                    print(err)
                    traceback.print_exc()
                    if config["LOG_ON_ERROR"] is not None:
                        with open(config["LOG_ON_ERROR"], "a") as f:
                            print(dname, file=f)
                            print(tracker, file=f)
                            print(traceback.format_exc(), file=f)
                            print("\n\n\n", file=f)
                    if config["BREAK_ON_ERROR"]:
                        raise err
                    elif config["RETURN_ON_ERROR"]:
                        return output_res, output_msg
        return output_res, output_msg

    def evaluate_tracker(
        self,
        tracker,
        dataset,
        dname,
        class_list,
        metrics_list,
        metric_names,
        seq_list,
        output_res,
        output_msg,
    ):
        """Evaluate each sequence in parallel or in series, then combine
        per-sequence results over sequences, classes and super-categories,
        print/persist summaries, and record the outcome in output_res /
        output_msg."""
        print("\nEvaluating %s\n" % tracker)
        time_start = time.time()
        config = self.config
        if config["USE_PARALLEL"]:
            with Pool(config["NUM_PARALLEL_CORES"]) as pool:
                _eval_sequence = partial(
                    eval_sequence,
                    dataset=dataset,
                    tracker=tracker,
                    class_list=class_list,
                    metrics_list=metrics_list,
                    metric_names=metric_names,
                )
                results = pool.map(_eval_sequence, seq_list)
                res = dict(zip(seq_list, results))
        else:
            res = {}
            for curr_seq in sorted(seq_list):
                res[curr_seq] = eval_sequence(
                    curr_seq, dataset, tracker, class_list, metrics_list, metric_names
                )
        # collecting combined cls keys (cls averaged, det averaged, super classes)
        cls_keys = []
        res["COMBINED_SEQ"] = {}
        # combine sequences for each class
        for c_cls in class_list:
            res["COMBINED_SEQ"][c_cls] = {}
            for metric, mname in zip(metrics_list, metric_names):
                curr_res = {
                    seq_key: seq_value[c_cls][mname]
                    for seq_key, seq_value in res.items()
                    if seq_key != "COMBINED_SEQ"
                }
                # combine results over all sequences and then over all classes
                res["COMBINED_SEQ"][c_cls][mname] = metric.combine_sequences(curr_res)
        # combine classes
        if dataset.should_classes_combine:
            if config["OUTPUT_PER_SEQ_RES"]:
                video_keys = res.keys()
            else:
                video_keys = ["COMBINED_SEQ"]
            for v_key in video_keys:
                cls_keys += ["average"]
                res[v_key]["average"] = {}
                for metric, mname in zip(metrics_list, metric_names):
                    cls_res = {
                        cls_key: cls_value[mname]
                        for cls_key, cls_value in res[v_key].items()
                        if cls_key not in cls_keys
                    }
                    res[v_key]["average"][
                        mname
                    ] = metric.combine_classes_class_averaged(
                        cls_res, ignore_empty=True
                    )
        # combine classes to super classes
        if dataset.use_super_categories:
            for cat, sub_cats in dataset.super_categories.items():
                cls_keys.append(cat)
                res["COMBINED_SEQ"][cat] = {}
                for metric, mname in zip(metrics_list, metric_names):
                    cat_res = {
                        cls_key: cls_value[mname]
                        for cls_key, cls_value in res["COMBINED_SEQ"].items()
                        if cls_key in sub_cats
                    }
                    res["COMBINED_SEQ"][cat][
                        mname
                    ] = metric.combine_classes_det_averaged(cat_res)
        # Print and output results in various formats
        if config["TIME_PROGRESS"]:
            print(
                f"\nAll sequences for {tracker} finished in"
                f" {time.time() - time_start} seconds"
            )
        output_fol = dataset.get_output_fol(tracker)
        os.makedirs(output_fol, exist_ok=True)
        # take a mean of each field of each thr
        # NOTE(review): when OUTPUT_PER_SEQ_RES is false, all_res is keyed by
        # class while summary_keys is ["COMBINED_SEQ"]; the TETA branch below
        # would then index a missing key — confirm intended config usage
        if config["OUTPUT_PER_SEQ_RES"]:
            all_res = copy.deepcopy(res)
            summary_keys = res.keys()
        else:
            all_res = copy.deepcopy(res["COMBINED_SEQ"])
            summary_keys = ["COMBINED_SEQ"]
        thr_key_list = [50]
        for s_key in summary_keys:
            for metric, mname in zip(metrics_list, metric_names):
                if mname != "TETA":
                    if s_key == "COMBINED_SEQ":
                        metric.print_table(
                            {"COMBINED_SEQ": res["COMBINED_SEQ"][cls_keys[0]][mname]},
                            tracker,
                            cls_keys[0],
                        )
                    continue
                for c_cls in res[s_key].keys():
                    for thr in thr_key_list:
                        all_res[s_key][c_cls][mname][thr] = metric._summary_row(
                            res[s_key][c_cls][mname][thr]
                        )
                    # mean over all per-threshold summary rows for this class
                    x = (
                        np.array(list(all_res[s_key][c_cls]["TETA"].values()))
                        .astype("float")
                        .mean(axis=0)
                    )
                    all_res_summary = list(x.round(decimals=2).astype("str"))
                    all_res[s_key][c_cls][mname]["ALL"] = all_res_summary
                if config["OUTPUT_SUMMARY"] and s_key == "COMBINED_SEQ":
                    for t in thr_key_list:
                        metric.print_summary_table(
                            all_res[s_key][cls_keys[0]][mname][t],
                            t,
                            tracker,
                            cls_keys[0],
                        )
                if config["OUTPUT_TEM_RAW_DATA"]:
                    out_file = os.path.join(output_fol, "teta_summary_results.pth")
                    # use a context manager so the dump file is closed
                    with open(out_file, "wb") as pkl_file:
                        pickle.dump(all_res, pkl_file)
                    print("Saved the TETA summary results.")
        # output
        # NOTE(review): mname, s_key and t leak out of the loops above and are
        # only well-defined when a TETA summary was produced — confirm
        output_res[dname][mname] = all_res[s_key][cls_keys[0]][mname][t]
        output_msg[dname][tracker] = "Success"
        return output_res, output_msg
@_timing.time
def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
    """Function for evaluating a single sequence.

    Returns a dict: seq_res[cls][metric_name] with the per-class results,
    where TETA results are additionally patched with the per-class Cls_FP
    counts accumulated across all classes.
    """
    raw_data = dataset.get_raw_seq_data(tracker, seq)
    seq_res = {}
    if "TETA" in metric_names:
        thresholds = [50]
        # class-agnostic pass used to fix a global TP assignment that all
        # per-class evaluations below share
        data_all_class = dataset.get_preprocessed_seq_data(
            raw_data, "all", thresholds=thresholds
        )
        teta = metrics_list[metric_names.index("TETA")]
        assignment = teta.compute_global_assignment(data_all_class)
    # create a dict to save Cls_FP for each class in different thr.
    # NOTE(review): `thresholds` and `assignment` are only bound inside the
    # TETA branch above, so this function assumes "TETA" is always among the
    # requested metrics — confirm
    cls_fp = {
        key: {
            cls: np.zeros((len(np.arange(0.5, 0.99, 0.05)))) for cls in class_list
        }
        for key in thresholds
    }
    for cls in class_list:
        seq_res[cls] = {}
        data = dataset.get_preprocessed_seq_data(raw_data, cls, assignment, thresholds)
        for metric, mname in zip(metrics_list, metric_names):
            if mname == "TETA":
                seq_res[cls][mname], cls_fp, _ = metric.eval_sequence(
                    data, cls, dataset.clsid2cls_name, cls_fp
                )
            else:
                seq_res[cls][mname] = metric.eval_sequence(data)
    # fold the cross-class classification FPs back into each class's result
    if "TETA" in metric_names:
        for thr in thresholds:
            for cls in class_list:
                seq_res[cls]["TETA"][thr]["Cls_FP"] += cls_fp[thr][cls]
    return seq_res

View File

@@ -0,0 +1,4 @@
# fmt: off
# flake8: noqa
from .teta import TETA

View File

@@ -0,0 +1,148 @@
# fmt: off
# flake8: noqa
from abc import ABC, abstractmethod
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseMetric(ABC):
    """Abstract base class for metrics: defines the combine/print/summary
    plumbing shared by all concrete metrics."""

    @abstractmethod
    def __init__(self):
        # field-kind registries; subclasses populate these so the generic
        # summary/detail helpers below know how to format each field
        self.plottable = False
        self.integer_fields = []
        self.float_fields = []
        self.array_labels = []
        self.integer_array_fields = []
        self.float_array_fields = []
        self.fields = []
        self.summary_fields = []
        self.registered = False

    #####################################################################
    # Abstract functions for subclasses to implement

    @_timing.time
    @abstractmethod
    def eval_sequence(self, data):
        ...

    @abstractmethod
    def combine_sequences(self, all_res):
        ...

    @abstractmethod
    def combine_classes_class_averaged(self, all_res, ignore_empty=False):
        ...

    @abstractmethod
    def combine_classes_det_averaged(self, all_res):
        ...

    def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
        """Plot results, only valid for metrics with self.plottable."""
        if self.plottable:
            raise NotImplementedError(
                f"plot_results is not implemented for metric {self.get_name()}"
            )
        else:
            pass

    #####################################################################
    # Helper functions which are useful for all metrics:

    @classmethod
    def get_name(cls):
        """Metric name used in result dicts (the class name itself)."""
        return cls.__name__

    @staticmethod
    def _combine_sum(all_res, field):
        """Combine sequence results via sum"""
        return sum([all_res[k][field] for k in all_res.keys()])

    @staticmethod
    def _combine_weighted_av(all_res, field, comb_res, weight_field):
        """Combine sequence results via weighted average."""
        return sum(
            [all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]
        ) / np.maximum(1.0, comb_res[weight_field])

    def print_table(self, table_res, tracker, cls):
        """Print table of results for all sequences."""
        print("")
        metric_name = self.get_name()
        self._row_print(
            [metric_name + ": " + tracker + "-" + cls] + self.summary_fields
        )
        for seq, results in sorted(table_res.items()):
            if seq == "COMBINED_SEQ":
                continue
            summary_res = self._summary_row(results)
            self._row_print([seq] + summary_res)
        # combined row is always printed last
        summary_res = self._summary_row(table_res["COMBINED_SEQ"])
        self._row_print(["COMBINED"] + summary_res)

    def _summary_row(self, results_):
        """Format one result dict into a list of strings, one per summary
        field (floats are printed as percentages)."""
        vals = []
        for h in self.summary_fields:
            if h in self.float_array_fields:
                vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
            elif h in self.float_fields:
                vals.append("{0:1.5g}".format(100 * float(results_[h])))
            elif h in self.integer_fields:
                vals.append("{0:d}".format(int(results_[h])))
            else:
                raise NotImplementedError(
                    "Summary function not implemented for this field type."
                )
        return vals

    @staticmethod
    def _row_print(*argv):
        """Print results in evenly spaced rows, with more space in first row."""
        if len(argv) == 1:
            argv = argv[0]
        to_print = "%-35s" % argv[0]
        for v in argv[1:]:
            to_print += "%-10s" % str(v)
        print(to_print)

    def summary_results(self, table_res):
        """Return a simple summary of final results for a tracker."""
        return dict(
            zip(self.summary_fields, self._summary_row(table_res["COMBINED_SEQ"]),)
        )

    def detailed_results(self, table_res):
        """Return detailed final results for a tracker."""
        # Get detailed field information: array fields expand into one
        # column per alpha label plus an AUC (mean) column
        detailed_fields = self.float_fields + self.integer_fields
        for h in self.float_array_fields + self.integer_array_fields:
            for alpha in [int(100 * x) for x in self.array_labels]:
                detailed_fields.append(h + "___" + str(alpha))
            detailed_fields.append(h + "___AUC")
        # Get detailed results
        detailed_results = {}
        for seq, res in table_res.items():
            detailed_row = self._detailed_row(res)
            if len(detailed_row) != len(detailed_fields):
                raise TrackEvalException(
                    f"Field names and data have different sizes "
                    f"({len(detailed_row)} and {len(detailed_fields)})"
                )
            detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
        return detailed_results

    def _detailed_row(self, res):
        """Flatten one result dict into the column order built by
        detailed_results (scalars first, then per-alpha values + mean)."""
        detailed_row = []
        for h in self.float_fields + self.integer_fields:
            detailed_row.append(res[h])
        for h in self.float_array_fields + self.integer_array_fields:
            for i, _ in enumerate([int(100 * x) for x in self.array_labels]):
                detailed_row.append(res[h][i])
            detailed_row.append(np.mean(res[h]))
        return detailed_row

View File

@@ -0,0 +1,399 @@
# fmt: off
# flake8: noqa
"""Track Every Thing Accuracy metric."""
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing
from ._base_metric import _BaseMetric
EPS = np.finfo("float").eps # epsilon
class TETA(_BaseMetric):
"""TETA metric."""
def __init__(self, exhaustive=False, config=None):
"""Initialize metric."""
super().__init__()
self.plottable = True
self.array_labels = np.arange(0.0, 0.99, 0.05)
self.cls_array_labels = np.arange(0.5, 0.99, 0.05)
self.integer_array_fields = [
"Loc_TP",
"Loc_FN",
"Loc_FP",
"Cls_TP",
"Cls_FN",
"Cls_FP",
]
self.float_array_fields = (
["TETA", "LocA", "AssocA", "ClsA"]
+ ["LocRe", "LocPr"]
+ ["AssocRe", "AssocPr"]
+ ["ClsRe", "ClsPr"]
)
self.fields = self.float_array_fields + self.integer_array_fields
self.summary_fields = self.float_array_fields
self.exhaustive = exhaustive
    def compute_global_assignment(self, data_thr, alpha=0.5):
        """Compute global assignment of TP.

        Args:
            data_thr: dict keyed by threshold of preprocessed class-agnostic
                sequence data.
            alpha: similarity threshold for accepting a match.

        Returns:
            res[thr][t]: {original gt track id -> matched original predicted
            track id} for each timestep t.
        """
        res = {
            thr: {t: {} for t in range(data_thr[thr]["num_timesteps"])}
            for thr in data_thr
        }
        for thr in data_thr:
            data = data_thr[thr]
            # return empty result if tracker or gt sequence is empty
            # NOTE(review): this returns from inside the threshold loop, so
            # any later thresholds are left unassigned when an earlier one is
            # empty — confirm whether `continue` was intended
            if data["num_tk_overlap_dets"] == 0 or data["num_gt_dets"] == 0:
                return res
            # global alignment score
            ga_score, _, _ = self.compute_global_alignment_score(data)
            # calculate scores for each timestep
            for t, (gt_ids_t, tk_ids_t) in enumerate(
                zip(data["gt_ids"], data["tk_ids"])
            ):
                # get matches optimizing for TETA; a scalar alpha yields
                # single-entry row/col lists, hence the [0] below
                amatch_rows, amatch_cols = self.compute_matches(
                    data, t, ga_score, gt_ids_t, tk_ids_t, alpha=alpha
                )
                # translate relabeled ids back to original ids for the output
                gt_ids = [data["gt_id_map"][tid] for tid in gt_ids_t[amatch_rows[0]]]
                matched_ids = [
                    data["tk_id_map"][tid] for tid in tk_ids_t[amatch_cols[0]]
                ]
                res[thr][t] = dict(zip(gt_ids, matched_ids))
        return res
    def eval_sequence_single_thr(self, data, cls, cid2clsname, cls_fp_thr, thr):
        """Computes TETA metric for one threshold for one sequence.

        Args:
            data: preprocessed per-class sequence data for this threshold.
            cls: class name being evaluated.
            cid2clsname: category id -> class name mapping (for Cls_FP).
            cls_fp_thr: running per-class Cls_FP accumulators (mutated).
            thr: integer IoU threshold in percent (used for FP candidates).

        Returns:
            (res, cls_fp_thr, class_info_list).
        """
        res = {}
        class_info_list = []
        # classification fields use the 10-entry >=0.5 alpha grid; all other
        # fields use the full 20-entry grid
        for field in self.float_array_fields + self.integer_array_fields:
            if field.startswith("Cls"):
                res[field] = np.zeros(len(self.cls_array_labels), dtype=float)
            else:
                res[field] = np.zeros((len(self.array_labels)), dtype=float)
        # return empty result if tracker or gt sequence is empty
        if data["num_tk_overlap_dets"] == 0:
            res["Loc_FN"] = data["num_gt_dets"] * np.ones(
                (len(self.array_labels)), dtype=float
            )
            if self.exhaustive:
                cls_fp_thr[cls] = data["num_tk_cls_dets"] * np.ones(
                    (len(self.cls_array_labels)), dtype=float
                )
            res = self._compute_final_fields(res)
            return res, cls_fp_thr, class_info_list
        if data["num_gt_dets"] == 0:
            if self.exhaustive:
                cls_fp_thr[cls] = data["num_tk_cls_dets"] * np.ones(
                    (len(self.cls_array_labels)), dtype=float
                )
            res = self._compute_final_fields(res)
            return res, cls_fp_thr, class_info_list
        # global alignment score
        ga_score, gt_id_count, tk_id_count = self.compute_global_alignment_score(data)
        matches_counts = [np.zeros_like(ga_score) for _ in self.array_labels]
        # calculate scores for each timestep
        for t, (gt_ids_t, tk_ids_t, tk_overlap_ids_t, tk_cls_ids_t) in enumerate(
            zip(
                data["gt_ids"],
                data["tk_ids"],
                data["tk_overlap_ids"],
                data["tk_class_eval_tk_ids"],
            )
        ):
            # deal with the case that there are no gt_det/tk_det in a timestep
            if len(gt_ids_t) == 0:
                if self.exhaustive:
                    cls_fp_thr[cls] += len(tk_cls_ids_t)
                continue
            # get matches optimizing for TETA (one match set per alpha)
            amatch_rows, amatch_cols = self.compute_matches(
                data, t, ga_score, gt_ids_t, tk_ids_t, list(self.array_labels)
            )
            # map overlap_ids to original ids.
            if len(tk_overlap_ids_t) != 0:
                # locate each overlap id's column in tk_ids_t via argsort +
                # searchsorted, then keep ids overlapping above thr as
                # localization-FP candidates
                sorter = np.argsort(tk_ids_t)
                indexes = sorter[
                    np.searchsorted(tk_ids_t, tk_overlap_ids_t, sorter=sorter)
                ]
                sim_t = data["sim_scores"][t][:, indexes]
                fpl_candidates = tk_overlap_ids_t[(sim_t >= (thr / 100)).any(axis=0)]
                fpl_candidates_ori_ids_t = np.array(
                    [data["tk_id_map"][tid] for tid in fpl_candidates]
                )
            else:
                fpl_candidates_ori_ids_t = []
            if self.exhaustive:
                cls_fp_thr[cls] += len(tk_cls_ids_t) - len(tk_overlap_ids_t)
            # calculate and accumulate basic statistics
            for a, alpha in enumerate(self.array_labels):
                match_row, match_col = amatch_rows[a], amatch_cols[a]
                num_matches = len(match_row)
                matched_ori_ids = set(
                    [data["tk_id_map"][tid] for tid in tk_ids_t[match_col]]
                )
                match_tk_cls = data["tk_classes"][t][match_col]
                wrong_tk_cls = match_tk_cls[match_tk_cls != data["gt_classes"][t]]
                num_class_and_det_matches = np.sum(
                    match_tk_cls == data["gt_classes"][t]
                )
                if alpha >= 0.5:
                    # a - 10 maps the 20-entry alpha index (alphas >= 0.5
                    # start at index 10) onto the 10-entry Cls arrays
                    for cid in wrong_tk_cls:
                        if cid in cid2clsname:
                            cname = cid2clsname[cid]
                            cls_fp_thr[cname][a - 10] += 1
                    res["Cls_TP"][a - 10] += num_class_and_det_matches
                    res["Cls_FN"][a - 10] += num_matches - num_class_and_det_matches
                res["Loc_TP"][a] += num_matches
                res["Loc_FN"][a] += len(gt_ids_t) - num_matches
                # overlap candidates that were not matched count as Loc FPs
                res["Loc_FP"][a] += len(set(fpl_candidates_ori_ids_t) - matched_ori_ids)
                if num_matches > 0:
                    matches_counts[a][gt_ids_t[match_row], tk_ids_t[match_col]] += 1
        # calculate AssocA, AssocRe, AssocPr
        self.compute_association_scores(res, matches_counts, gt_id_count, tk_id_count)
        # calculate final scores
        res = self._compute_final_fields(res)
        return res, cls_fp_thr, class_info_list
def compute_global_alignment_score(self, data):
"""Computes global alignment score."""
num_matches = np.zeros((data["num_gt_ids"], data["num_tk_ids"]))
gt_id_count = np.zeros((data["num_gt_ids"], 1))
tk_id_count = np.zeros((1, data["num_tk_ids"]))
# loop through each timestep and accumulate global track info.
for t, (gt_ids_t, tk_ids_t) in enumerate(zip(data["gt_ids"], data["tk_ids"])):
# count potential matches between ids in each time step
# these are normalized, weighted by match similarity
sim = data["sim_scores"][t]
sim_iou_denom = sim.sum(0, keepdims=True) + sim.sum(1, keepdims=True) - sim
sim_iou = np.zeros_like(sim)
mask = sim_iou_denom > (0 + EPS)
sim_iou[mask] = sim[mask] / sim_iou_denom[mask]
num_matches[gt_ids_t[:, None], tk_ids_t[None, :]] += sim_iou
# calculate total number of dets for each gt_id and tk_id.
gt_id_count[gt_ids_t] += 1
tk_id_count[0, tk_ids_t] += 1
# Calculate overall Jaccard alignment score between IDs
ga_score = num_matches / (gt_id_count + tk_id_count - num_matches)
return ga_score, gt_id_count, tk_id_count
def compute_matches(self, data, t, ga_score, gt_ids, tk_ids, alpha):
    """Compute matches at timestep t based on the global alignment score.

    The assignment cost combines the global alignment score with the
    per-frame similarity, so frame-level ties are broken in favour of
    globally consistent tracks.

    Args:
        data: sequence data dict (provides "sim_scores").
        t: timestep index.
        ga_score: matrix from compute_global_alignment_score.
        gt_ids / tk_ids: id index arrays present at timestep t.
        alpha: one similarity threshold, or a list of thresholds.

    Returns:
        (alpha_match_rows, alpha_match_cols): per-threshold lists of matched
        row/column indices into gt_ids / tk_ids.
    """
    sim = data["sim_scores"][t]
    score_mat = ga_score[gt_ids[:, None], tk_ids[None, :]] * sim
    # Hungarian algorithm to find best matches
    match_rows, match_cols = linear_sum_assignment(-score_mat)
    if not isinstance(alpha, list):
        alpha = [alpha]
    alpha_match_rows, alpha_match_cols = [], []
    for a in alpha:
        # keep only assignments whose raw per-frame similarity clears the
        # threshold (EPS makes the comparison robust to float rounding)
        matched_mask = sim[match_rows, match_cols] >= a - EPS
        alpha_match_rows.append(match_rows[matched_mask])
        alpha_match_cols.append(match_cols[matched_mask])
    return alpha_match_rows, alpha_match_cols
def compute_association_scores(self, res, matches_counts, gt_id_count, tk_id_count):
    """Calculate association scores for each alpha.

    First calculate scores per gt_id/tk_id combo,
    and then average over the number of detections.
    Writes "AssocA", "AssocRe" and "AssocPr" into `res` in place, one entry
    per alpha in self.array_labels.
    """
    for a, _ in enumerate(self.array_labels):
        matches_count = matches_counts[a]
        # Jaccard association per (gt, tk) pair ...
        ass_a = matches_count / np.maximum(
            1, gt_id_count + tk_id_count - matches_count
        )
        # ... averaged over matched detections (weighted by match counts)
        res["AssocA"][a] = np.sum(matches_count * ass_a) / np.maximum(
            1, res["Loc_TP"][a]
        )
        ass_re = matches_count / np.maximum(1, gt_id_count)
        res["AssocRe"][a] = np.sum(matches_count * ass_re) / np.maximum(
            1, res["Loc_TP"][a]
        )
        ass_pr = matches_count / np.maximum(1, tk_id_count)
        res["AssocPr"][a] = np.sum(matches_count * ass_pr) / np.maximum(
            1, res["Loc_TP"][a]
        )
@_timing.time
def eval_sequence(self, data, cls, cls_id_name_mapping, cls_fp):
    """Evaluate a single sequence, running the single-threshold evaluation
    once per IoU threshold present in `data`.

    Returns (per-threshold results, updated cls_fp, per-threshold class info).
    """
    per_thr_res = {}
    per_thr_class_info = {}
    for threshold in data:
        thr_res, thr_cls_fp, thr_cls_info = self.eval_sequence_single_thr(
            data[threshold], cls, cls_id_name_mapping, cls_fp[threshold], threshold
        )
        per_thr_res[threshold] = thr_res
        cls_fp[threshold] = thr_cls_fp
        per_thr_class_info[threshold] = thr_cls_info
    return per_thr_res, cls_fp, per_thr_class_info
def combine_sequences(self, all_res):
    """Combines metrics across all sequences, one combination per IoU
    threshold found in the per-sequence results.

    Args:
        all_res: dict mapping sequence key -> {threshold: metrics dict}.

    Returns:
        dict mapping threshold -> combined metrics over all sequences.
    """
    # Thresholds are read off the first sequence; default to [50] when there
    # are no sequences at all.
    if all_res:
        thresholds = list(next(iter(all_res.values())).keys())
    else:
        thresholds = [50]
    combined = {}
    for thr in thresholds:
        per_seq = {seq_key: seq_res[thr] for seq_key, seq_res in all_res.items()}
        combined[thr] = self._combine_sequences_thr(per_seq)
    return combined
def _combine_sequences_thr(self, all_res):
    """Combines sequences over each threshold.

    Integer count fields are summed across sequences; the association fields
    are averaged weighted by "Loc_TP"; derived fields are then recomputed.
    """
    res = {}
    for field in self.integer_array_fields:
        res[field] = self._combine_sum(all_res, field)
    for field in ["AssocRe", "AssocPr", "AssocA"]:
        res[field] = self._combine_weighted_av(
            all_res, field, res, weight_field="Loc_TP"
        )
    res = self._compute_final_fields(res)
    return res
def combine_classes_class_averaged(self, all_res, ignore_empty=False):
    """Combines metrics across all classes by averaging over classes.

    If 'ignore_empty' is True, then it only sums over classes
    with at least one gt or predicted detection.
    """
    # Thresholds are read off the first class; default to [50] when there
    # are no classes at all.
    if all_res:
        thresholds = list(next(iter(all_res.values())).keys())
    else:
        thresholds = [50]
    combined = {}
    for thr in thresholds:
        per_cls = {cls_key: cls_res[thr] for cls_key, cls_res in all_res.items()}
        combined[thr] = self._combine_classes_class_averaged_thr(
            per_cls, ignore_empty=ignore_empty
        )
    return combined
def _combine_classes_class_averaged_thr(self, all_res, ignore_empty=False):
    """Combines classes over each threshold.

    Integer count fields are summed across classes; float metric fields are
    averaged with equal class weight (macro average). When `ignore_empty` is
    True, classes with no gt or predicted detections are excluded first.
    """
    res = {}

    def check_empty(val):
        """Returns True if empty (no detections at any alpha)."""
        return not (val["Loc_TP"] + val["Loc_FN"] + val["Loc_FP"] > 0 + EPS).any()

    for field in self.integer_array_fields:
        if ignore_empty:
            res_field = {k: v for k, v in all_res.items() if not check_empty(v)}
        else:
            res_field = {k: v for k, v in all_res.items()}
        res[field] = self._combine_sum(res_field, field)
    for field in self.float_array_fields:
        if ignore_empty:
            res_field = [v[field] for v in all_res.values() if not check_empty(v)]
        else:
            res_field = [v[field] for v in all_res.values()]
        # equal-weight (macro) average over classes
        res[field] = np.mean(res_field, axis=0)
    return res
def combine_classes_det_averaged(self, all_res):
    """Combines metrics across all classes by averaging over detections."""
    # Thresholds are read off the first class; default to [50] when there
    # are no classes at all.
    if all_res:
        thresholds = list(next(iter(all_res.values())).keys())
    else:
        thresholds = [50]
    combined = {}
    for thr in thresholds:
        per_cls = {cls_key: cls_res[thr] for cls_key, cls_res in all_res.items()}
        combined[thr] = self._combine_classes_det_averaged_thr(per_cls)
    return combined
def _combine_classes_det_averaged_thr(self, all_res):
    """Combines detections over each threshold.

    Unlike the class-averaged variant, counts are pooled over all classes
    before the derived fields are recomputed (micro average over detections).
    """
    res = {}
    for field in self.integer_array_fields:
        res[field] = self._combine_sum(all_res, field)
    for field in ["AssocRe", "AssocPr", "AssocA"]:
        res[field] = self._combine_weighted_av(
            all_res, field, res, weight_field="Loc_TP"
        )
    res = self._compute_final_fields(res)
    return res
@staticmethod
def _compute_final_fields(res):
"""Calculate final metric values.
This function is used both for both per-sequence calculation,
and in combining values across sequences.
"""
# LocA
res["LocRe"] = res["Loc_TP"] / np.maximum(1, res["Loc_TP"] + res["Loc_FN"])
res["LocPr"] = res["Loc_TP"] / np.maximum(1, res["Loc_TP"] + res["Loc_FP"])
res["LocA"] = res["Loc_TP"] / np.maximum(
1, res["Loc_TP"] + res["Loc_FN"] + res["Loc_FP"]
)
# ClsA
res["ClsRe"] = res["Cls_TP"] / np.maximum(1, res["Cls_TP"] + res["Cls_FN"])
res["ClsPr"] = res["Cls_TP"] / np.maximum(1, res["Cls_TP"] + res["Cls_FP"])
res["ClsA"] = res["Cls_TP"] / np.maximum(
1, res["Cls_TP"] + res["Cls_FN"] + res["Cls_FP"]
)
res["ClsRe"] = np.mean(res["ClsRe"])
res["ClsPr"] = np.mean(res["ClsPr"])
res["ClsA"] = np.mean(res["ClsA"])
res["TETA"] = (res["LocA"] + res["AssocA"] + res["ClsA"]) / 3
return res
def print_summary_table(self, thr_res, thr, tracker, cls):
    """Print a two-row summary table: a header row naming the metric,
    threshold, tracker and class, followed by the combined results row."""
    print("")
    header_cell = f"{self.get_name()}{str(thr)}: {tracker}-{cls}"
    self._row_print([header_cell] + self.summary_fields)
    self._row_print(["COMBINED"] + thr_res)

View File

@@ -0,0 +1,46 @@
# fmt: off
# flake8: noqa
import csv
import os
from collections import OrderedDict
def validate_metrics_list(metrics_list):
    """Get names of metric class and ensures they are unique, further checks that the fields within each metric class
    do not have overlapping names.

    Args:
        metrics_list: iterable of metric objects exposing `get_name()` and `fields`.

    Returns:
        The list of metric names, in input order.

    Raises:
        TrackEvalException: on duplicate metric names or duplicate field names.
    """
    metric_names = [metric.get_name() for metric in metrics_list]
    # duplicate metric names would make results ambiguous
    if len(set(metric_names)) != len(metric_names):
        raise TrackEvalException(
            "Code being run with multiple metrics of the same name"
        )
    all_fields = []
    for metric in metrics_list:
        all_fields.extend(metric.fields)
    # field names must be globally unique across all metrics
    if len(set(all_fields)) != len(all_fields):
        raise TrackEvalException(
            "Code being run with multiple metrics with fields of the same name"
        )
    return metric_names
def get_track_id_str(ann):
    """Get name of track ID in annotation.

    The known key names are checked in priority order ("track_id", then
    "instance_id", then "scalabel_id") and the first one present wins.

    Args:
        ann: a single annotation dict.

    Returns:
        The key under which this annotation stores its track identity.

    Raises:
        TrackEvalException: if the annotation carries no known track ID key.
    """
    # Priority order matters: prefer the canonical "track_id" when present.
    for tk_str in ("track_id", "instance_id", "scalabel_id"):
        if tk_str in ann:
            return tk_str
    # Previously `assert False, ...`, which is silently stripped under
    # `python -O` (the function would then raise UnboundLocalError at the
    # return site); raise an explicit, catchable exception instead.
    raise TrackEvalException("No track/instance ID.")
class TrackEvalException(Exception):
    """Custom exception for catching expected errors."""

    pass

View File

@@ -0,0 +1,146 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import copy
import json
import logging
import numpy as np
import pycocotools.mask as mask_util
from pycocotools.coco import COCO
from typing_extensions import override
class YTVIS(COCO):
    """
    Helper class for reading YT-VIS annotations

    Normalizes YT-VIS-style video annotations (videos, per-frame "bboxes" /
    "segmentations" / "areas") into the COCO schema so the inherited COCO
    indexing and lookup machinery works unchanged.
    """

    @override
    def __init__(self, annotation_file: str = None, ignore_gt_cats: bool = True):
        """
        Args:
            annotation_file: Path to the annotation file
            ignore_gt_cats: If True, we ignore the ground truth categories and replace them with a dummy "object" category. This is useful for Phrase AP evaluation.
        """
        self.ignore_gt_cats = ignore_gt_cats
        super().__init__(annotation_file=annotation_file)

    @override
    def createIndex(self):
        """Rename YT-VIS fields to their COCO equivalents, then build the
        standard COCO index via the parent implementation."""
        # We rename some keys to match the COCO format before creating the index.
        if "annotations" in self.dataset:
            for ann in self.dataset["annotations"]:
                if "video_id" in ann:
                    # COCO indexes by "image_id"; here it carries the video id
                    ann["image_id"] = int(ann.pop("video_id"))
                if self.ignore_gt_cats:
                    # collapse all classes onto the single dummy "object" category
                    ann["category_id"] = -1
                else:
                    ann["category_id"] = int(ann["category_id"])
                if "bboxes" in ann:
                    # note that in some datasets we load under this YTVIS class,
                    # some "bboxes" could be None for when the GT object is invisible,
                    # so we replace them with [0, 0, 0, 0]
                    ann["bboxes"] = [
                        bbox if bbox is not None else [0, 0, 0, 0]
                        for bbox in ann["bboxes"]
                    ]
                if "areas" in ann:
                    # similar to "bboxes", some areas could be None for when the GT
                    # object is invisible, so we replace them with 0
                    areas = [a if a is not None else 0 for a in ann["areas"]]
                    # Compute average area of tracklet
                    ann["area"] = np.mean(areas)
        if "videos" in self.dataset:
            for vid in self.dataset["videos"]:
                vid["id"] = int(vid["id"])
            # COCO expects an "images" list; videos play that role here
            self.dataset["images"] = self.dataset.pop("videos")
        if self.ignore_gt_cats:
            self.dataset["categories"] = [
                {"supercategory": "object", "id": -1, "name": "object"}
            ]
        else:
            for cat in self.dataset["categories"]:
                cat["id"] = int(cat["id"])
        super().createIndex()

    @override
    def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
        """Same as COCO.getAnnIds, except areaRng is intentionally ignored
        (annotation "area" is the tracklet's average area, not per-frame)."""
        if len(areaRng) > 0:
            logging.warning(
                "Note that we filter out objects based on their *average* area across the video, not per frame area"
            )
        return super().getAnnIds(imgIds=imgIds, catIds=catIds, iscrowd=iscrowd)

    @override
    def showAnns(self, anns, draw_bbox=False):
        raise NotImplementedError("Showing annotations is not supported")

    @override
    def loadRes(self, resFile):
        """Load prediction results into a new YTVIS object.

        Adapted from COCO.loadRes to support tracklets/masklets: per-frame
        "bboxes"/"segmentations"/"areas" lists instead of single values.

        Args:
            resFile: path to a JSON results file, or an already-loaded list
                of result dicts (numpy arrays are not supported).
        """
        # Adapted from COCO.loadRes to support tracklets/masklets
        res = YTVIS(ignore_gt_cats=self.ignore_gt_cats)
        res.dataset["images"] = [img for img in self.dataset["images"]]
        if type(resFile) == str:
            with open(resFile) as f:
                anns = json.load(f)
        elif type(resFile) == np.ndarray:
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert type(anns) == list, "results is not an array of objects"
        annsImgIds = [ann["image_id"] for ann in anns]
        assert set(annsImgIds) == (
            set(annsImgIds) & set(self.getImgIds())
        ), "Results do not correspond to current coco set"
        # NOTE(review): `anns[0]` below assumes a non-empty results list —
        # an empty prediction file would raise IndexError here; confirm
        # callers guarantee at least one result.
        if "bboxes" in anns[0] and not anns[0]["bboxes"] == []:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                bbs = [(bb if bb is not None else [0, 0, 0, 0]) for bb in ann["bboxes"]]
                xxyy = [[bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] for bb in bbs]
                if not "segmentations" in ann:
                    # synthesize a rectangular polygon per frame from the box
                    ann["segmentations"] = [
                        [[x1, y1, x1, y2, x2, y2, x2, y1]] for (x1, x2, y1, y2) in xxyy
                    ]
                ann["areas"] = [bb[2] * bb[3] for bb in bbs]
                # NOTE: We also compute average area of a tracklet across video, allowing us to compute area based mAP.
                ann["area"] = np.mean(ann["areas"])
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "segmentations" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                # derive per-frame boxes/areas from the RLE masks
                ann["bboxes"] = [
                    mask_util.toBbox(segm) for segm in ann["segmentations"]
                ]
                if "areas" not in ann:
                    ann["areas"] = [
                        mask_util.area(segm) for segm in ann["segmentations"]
                    ]
                # NOTE: We also compute average area of a tracklet across video, allowing us to compute area based mAP.
                ann["area"] = np.mean(ann["areas"])
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        res.dataset["annotations"] = anns
        res.createIndex()
        return res

    @override
    def download(self, tarDir=None, imgIds=[]):
        raise NotImplementedError

    @override
    def loadNumpyAnnotations(self, data):
        raise NotImplementedError("We don't support numpy annotations for now")

    @override
    def annToRLE(self, ann):
        raise NotImplementedError("We expect masks to be already in RLE format")

    @override
    def annToMask(self, ann):
        raise NotImplementedError("We expect masks to be already in RLE format")

411
sam3/eval/ytvis_eval.py Normal file
View File

@@ -0,0 +1,411 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import copy
import gc
import logging
import os
from collections import defaultdict
from operator import xor
from pathlib import Path
from typing import List, Optional
import numpy as np
import pycocotools.mask as mask_util
import torch
from pycocotools.cocoeval import COCOeval
from sam3.eval.cgf1_eval import CGF1Eval
from sam3.eval.coco_eval_offline import convert_to_xywh
from sam3.model.box_ops import box_xywh_inter_union
from sam3.train.masks_ops import rle_encode
from sam3.train.utils import distributed as dist
from typing_extensions import override
try:
import rapidjson as json
except ModuleNotFoundError:
import json
from iopath.common.file_io import g_pathmgr
class YTVISevalMixin:
    """
    Identical to COCOeval but adapts computeIoU to compute IoU between tracklets/masklets.
    """

    @override
    def _prepare(self):
        """
        Copied from cocoeval.py but doesn't convert masks to RLEs (we assume they already are RLEs)
        """
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
        # set ignore flag
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            # NOTE(review): this next line unconditionally overwrites the
            # value read above with the crowd flag, making the previous
            # assignment a no-op — confirm this is intended (it differs from
            # upstream cocoeval.py, which ORs the two).
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
        for dt in dts:
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results

    def computeIoU(self, imgId, catId):
        """
        Compute IoU between tracklets. Copied from cocoeval.py but adapted for videos (in YT-VIS format)

        Returns an (num_dt, num_gt) array of video-level IoUs: box IoUs are
        summed over frames before dividing; mask IoUs accumulate per-frame
        RLE intersection/union areas.
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 or len(dt) == 0:
            return []
        # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
        # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
        assert hasattr(self, "sort_inds_by_scores_in_iou"), (
            "subclasses that inherits YTVISevalMixin should set `self.sort_inds_by_scores_in_iou` "
            "(True for class mAP and phrase AP, False for demo F1)"
        )
        if self.sort_inds_by_scores_in_iou:
            inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
            dt = [dt[i] for i in inds]
            if len(dt) > p.maxDets[-1]:
                dt = dt[0 : p.maxDets[-1]]
        if p.iouType == "segm":
            g = [g["segmentations"] for g in gt]
            d = [d["segmentations"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bboxes"] for g in gt]
            d = [d["bboxes"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        def iou_tracklets(preds, gts):
            # Video-level box IoU: per-frame intersections/unions are summed
            # over time before taking the ratio.
            preds = torch.tensor(preds)
            gts = torch.tensor(gts)
            inter, union = box_xywh_inter_union(
                preds.unsqueeze(1), gts.unsqueeze(0)
            )  # Num preds x Num GTS x Num frames
            inter = inter.sum(-1)
            union = union.sum(-1)
            assert (
                union > 0
            ).all(), (
                "There exists a tracklet with zero GTs across time. This is suspicious"
            )
            return inter / union

        def iou_masklets(preds, gts):
            # Video-level mask IoU over per-frame RLEs; frames where either
            # side is missing (falsy) contribute only to the union.
            inter = 0
            union = 0
            for p_i, gt_i in zip(preds, gts):
                if p_i and gt_i:
                    # Compute areas of intersection and union
                    inter += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=True)
                    )
                    union += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=False)
                    )
                elif gt_i:
                    union += mask_util.area(gt_i)
                elif p_i:
                    union += mask_util.area(p_i)
            if union > 0:
                iou = inter / union
                assert iou >= 0 and iou <= 1, "Encountered an error in IoU computation"
            else:
                # both masklets empty everywhere: defined as perfect overlap
                assert np.isclose(inter, 0) and np.isclose(
                    union, 0
                ), "Encountered an error in IoU computation"
                iou = 1
            return iou

        if p.iouType == "segm":
            ious = [[iou_masklets(d_i, g_i) for g_i in g] for d_i in d]
        else:
            ious = iou_tracklets(d, g)
        return np.array(ious)
class YTVISeval(YTVISevalMixin, COCOeval):
    """COCOeval specialized to video tracklets/masklets (YT-VIS format)."""

    # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
    sort_inds_by_scores_in_iou = True
class VideoDemoF1Eval(YTVISevalMixin, CGF1Eval):
    """cgF1-style evaluation on video tracklets/masklets."""

    # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
    sort_inds_by_scores_in_iou = False
class YTVISResultsWriter:
"""
Gather and dumps predictions in YT-VIS format.
Expected flow of API calls: reset() -> N * update() -> compute_synced()
"""
def __init__(
    self,
    dump_file: str,
    postprocessor,
    gather_pred_via_filesys=False,
    pred_file_evaluators: Optional[List] = None,
    save_per_frame_scores: bool = False,
    write_eval_metrics_file: bool = True,
    eval_metrics_file_suffix: str = ".sam3_eval_metrics",
):
    """
    Args:
        dump_file: path of the YT-VIS-format prediction file to write.
        postprocessor: object whose `process_results` turns raw model
            outputs into per-video prediction dicts.
        gather_pred_via_filesys: gather predictions to rank 0 through the
            filesystem instead of a collective op.
        pred_file_evaluators: evaluation hooks run on the dumped file.
        save_per_frame_scores: also store per-frame scores per tracklet.
        write_eval_metrics_file: additionally write evaluation metrics to a
            sidecar file (`dump_file + eval_metrics_file_suffix`).
        eval_metrics_file_suffix: suffix for that sidecar metrics file.
    """
    self.dump_file = dump_file
    self.dump = []
    self.postprocessor = postprocessor
    self.gather_pred_via_filesys = gather_pred_via_filesys
    # Only the main process creates the output directory.
    if dist.is_main_process():
        out_dir = os.path.dirname(self.dump_file)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)
            logging.info(f"Creating folder: {out_dir}")
    # the evaluation hooks to be applied to the prediction files
    self.pred_file_evaluators = pred_file_evaluators or []
    self.save_per_frame_scores = save_per_frame_scores
    # in addition to the prediction file, we also write the evaluation
    # metrics for easier debugging and analysis (stored in a separate file
    # so the dumped prediction file stays in plain YT-VIS format)
    self.write_eval_metrics_file = write_eval_metrics_file
    if self.write_eval_metrics_file:
        self.eval_metrics_file = self.dump_file + eval_metrics_file_suffix
        os.makedirs(os.path.dirname(self.eval_metrics_file), exist_ok=True)
def _dump_vid_preds(self, results):
dumped_results = copy.deepcopy(results)
self.dump.extend(dumped_results)
def prepare(self, predictions):
    """Convert per-video postprocessed predictions into YT-VIS result dicts.

    Args:
        predictions: dict mapping video_id -> prediction dict carrying
            "boxes", "scores", "labels", and exactly one of "masks" (dense
            tensors) or "masks_rle" (pre-encoded RLEs); optionally
            "per_frame_scores" when self.save_per_frame_scores is set.

    Returns:
        A flat list of YT-VIS-format dicts, one per predicted tracklet.
    """
    ytvis_results = []
    for video_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        for k in ["boxes", "scores", "labels"]:
            assert (
                k in prediction
            ), f"Expected predictions to have `{k}` key, available keys are {prediction.keys()}"
        if self.save_per_frame_scores:
            assert (
                "per_frame_scores" in prediction
            ), f"Expected predictions to have `per_frame_scores` key, available keys are {prediction.keys()}"
        # exactly one of the two mask representations must be present
        assert xor(
            "masks" in prediction, "masks_rle" in prediction
        ), f"Expected predictions to have either `masks` key or `masks_rle` key, available keys are {prediction.keys()}"
        boxes = prediction["boxes"]
        boxes = convert_to_xywh(boxes).tolist()
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        if "masks" in prediction:
            masks = prediction["masks"].squeeze(2)
            assert (
                masks.ndim == 4
            ), "Expected masks to be of shape(N_preds,T_frames,H,W)"
            # per-frame pixel counts, then RLE-encode every masklet
            areas = [mask.flatten(1).sum(1).tolist() for mask in masks]
            rles = [rle_encode(masklet) for masklet in masks]
            # memory clean
            del masks
            del prediction["masks"]
        elif "masks_rle" in prediction:
            rles = prediction.pop("masks_rle")
            # a None RLE marks a frame where the object is absent -> area 0
            areas = [
                [0 if rle is None else rle.pop("area") for rle in rles_per_obj]
                for rles_per_obj in rles
            ]
        else:
            raise ValueError(
                "Expected either `masks` or `masks_rle` key in the predictions."
            )
        new_results = [
            {
                "video_id": video_id,
                "category_id": track_label,
                "bboxes": track_boxes,
                "score": track_score,
                "segmentations": track_masks,
                "areas": track_areas,
            }
            for (
                track_boxes,
                track_masks,
                track_areas,
                track_score,
                track_label,
            ) in zip(boxes, rles, areas, scores, labels)
        ]
        # Optionally, save per-frame scores
        if self.save_per_frame_scores:
            per_frame_scores = prediction["per_frame_scores"].tolist()
            for res, track_per_frame_scores in zip(new_results, per_frame_scores):
                res["per_frame_scores"] = track_per_frame_scores
        ytvis_results.extend(new_results)
    return ytvis_results
def set_sync_device(self, device: torch.device):
    """Record the device to use when synchronizing results across processes."""
    self._sync_device = device
def update(self, *args, **kwargs):
predictions = self.postprocessor.process_results(*args, **kwargs)
results = self.prepare(predictions)
self._dump_vid_preds(results)
def _dump_preds(self):
    """Write the buffered predictions to `self.dump_file` as JSON.

    Only the main process writes; every rank clears its buffer afterwards.

    Returns:
        The dumped file path (str) on the main process, None elsewhere.
    """
    if not dist.is_main_process():
        # non-main ranks only release their buffered predictions
        self.dump = []
        gc.collect()
        return
    dumped_file = Path(self.dump_file)
    logging.info(f"YTVIS evaluator: Dumping predictions to {dumped_file}")
    with g_pathmgr.open(str(dumped_file), "w") as f:
        json.dump(self.dump, f)
    # free the (potentially large) buffer once it is on disk
    self.dump = []
    gc.collect()
    return str(dumped_file)
def synchronize_between_processes(self):
    """Gather predictions from all processes, dropping cross-GPU duplicates.

    See `_dedup_pre_gather` for why duplicates can occur. After this call,
    `self.dump` holds the deduplicated union of every rank's predictions.
    """
    logging.info("YT-VIS evaluator: Synchronizing between processes")
    dump_dict = self._dedup_pre_gather(self.dump)
    if self.gather_pred_via_filesys:
        # presumably avoids collective-op size limits for very large dumps
        # — see the dist helper for the actual mechanism
        dump_dict_all_gpus = dist.gather_to_rank_0_via_filesys(dump_dict)
    else:
        dump_dict_all_gpus = dist.all_gather(dump_dict, force_cpu=True)
    self.dump = self._dedup_post_gather(dump_dict_all_gpus)
    logging.info(f"Gathered all {len(self.dump)} predictions")
def _dedup_pre_gather(self, predictions):
"""
Organize the predictions as a dict-of-list using (video_id, category_id) as keys
for deduplication after gathering them across GPUs.
During evaluation, PyTorch data loader under `drop_last: False` would wrap
around the dataset length to be a multiple of world size (GPU num) and duplicate
the remaining batches. This causes the same test sample to appear simultaneously
in multiple GPUs, resulting in duplicated predictions being saved into prediction
files. These duplicates are then counted as false positives under detection mAP
metrics (since a ground truth can be matched with only one prediction).
For example, if there are 4 GPUs and 6 samples [A1, A2, B1, B2, C1, C2], the data
loader (under `drop_last: False`) would load it by wrapping it around like
`[A1, A2, B1, B2, C1, C2, *A1*, *A2*]` to make a multiple of 4 and then split it as
- GPU 0: A1, C1
- GPU 1: A2, C2
- GPU 3: B1, **A1**
- GPU 4: B2, **A2**
(as in DistributedSampler in https://github.com/pytorch/pytorch/blob/521588519da9f4876d90ddd7a17c10d0eca89dc6/torch/utils/data/distributed.py#L116-L124)
so the predictions on A1 and A2 will occur twice in the final gathered outputs
in the prediction file (and counted as false positives). This also affects our
YT-VIS official val evaluation, but to a lesser extent than YT-VIS dev since
the latter is much smaller and more susceptible to false positives.
So we to deduplicate this. The tricky part is that we cannot deduplicate them
simply using video id, given that we are sharding the classes in each video
across multiple batches (with 20 prompts per batch) in our "orig_cats" eval dbs.
The solution is to deduplicate based on (video_id, category_id) tuple as keys.
We organize the predictions as a dict-of-list using (video_id, category_id) as
keys on each GPU, with the list of masklets under this (video_id, category_id)
on this GPU as values. Then, we all-gather this dict-of-list across GPUs and
if a key (video_id, category_id) appears in multiple GPUs, we only take the
prediction masklet list from one GPU.
"""
prediction_dict = defaultdict(list)
for p in predictions:
prediction_dict[(p["video_id"], p["category_id"])].append(p)
return prediction_dict
def _dedup_post_gather(self, list_of_prediction_dict):
"""
Deduplicate the predictions from all GPUs. See `_dedup_pre_gather` for details.
"""
dedup_prediction_dict = {}
duplication_keys = []
for prediction_dict in list_of_prediction_dict:
for k, v in prediction_dict.items():
if k not in dedup_prediction_dict:
dedup_prediction_dict[k] = v
else:
duplication_keys.append(k)
logging.info(
f"skipped {len(duplication_keys)} duplicated predictions in YTVISResultsWriter "
f"with the following (video_id, category_id) tuples: {duplication_keys}"
)
dedup_predictions = sum(dedup_prediction_dict.values(), [])
return dedup_predictions
def compute_synced(
    self,
):
    """Gather predictions across processes, dump them, run the evaluators.

    Returns:
        dict of metric name -> value on the main process ({"": 0.0} if no
        evaluator produced anything); {"": 0.0} on all other ranks.
    """
    self.synchronize_between_processes()
    dumped_file = self._dump_preds()
    if not dist.is_main_process():
        return {"": 0.0}
    # run evaluation hooks on the prediction file
    meters = {}
    all_video_np_level_results = defaultdict(dict)
    for evaluator in self.pred_file_evaluators:
        gc.collect()
        results, video_np_level_results = evaluator.evaluate(dumped_file)
        meters.update(results)
        # merge per-(video, prompt) metrics contributed by each evaluator
        for (video_id, category_id), res in video_np_level_results.items():
            all_video_np_level_results[(video_id, category_id)].update(res)
    gc.collect()
    if self.write_eval_metrics_file:
        # convert the nested dict of {(video_id, category_id): per_sample_metric_dict}
        # to a list of per-sample metric dicts (with video_id and category_id) for JSON,
        # as JSON doesn't allow using tuples like (video_id, category_id) as dict keys
        video_np_level_metrics = [
            {"video_id": video_id, "category_id": category_id, **res}
            for (video_id, category_id), res in all_video_np_level_results.items()
        ]
        eval_metrics = {
            "dataset_level_metrics": meters,
            "video_np_level_metrics": video_np_level_metrics,
        }
        with g_pathmgr.open(self.eval_metrics_file, "w") as f:
            json.dump(eval_metrics, f)
        logging.info(
            f"YTVIS evaluator: Dumped evaluation metrics to {self.eval_metrics_file}"
        )
    if len(meters) == 0:
        meters = {"": 0.0}
    return meters
def compute(self):
return {"": 0.0}
def reset(self, *args, **kwargs):
self.dump = []