Initial commit

fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
2025-11-18 23:07:42 -08:00
commit a13e358df4
504 changed files with 122758 additions and 0 deletions
--- a/sam3/eval/hota_eval_toolkit/trackeval/datasets/__init__.py
+++ b/sam3/eval/hota_eval_toolkit/trackeval/datasets/__init__.py
@@ -0,0 +1,4 @@
+# flake8: noqa
+
+from .tao_ow import TAO_OW
+from .youtube_vis import YouTubeVIS
--- a/sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py
+++ b/sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py
@@ -0,0 +1,379 @@
+# flake8: noqa
+
+import csv
+import io
+import os
+import traceback
+import zipfile
+from abc import ABC, abstractmethod
+from copy import deepcopy
+
+import numpy as np
+
+from .. import _timing
+from ..utils import TrackEvalException
+
+
+class _BaseDataset(ABC):
+    """Abstract base class for datasets; provides shared loading and similarity helpers."""
+
+    @abstractmethod
+    def __init__(self):
+        self.tracker_list = None
+        self.seq_list = None
+        self.class_list = None
+        self.output_fol = None
+        self.output_sub_fol = None
+        self.should_classes_combine = True
+        self.use_super_categories = False
+
+    # Functions to implement:
+
+    @staticmethod
+    @abstractmethod
+    def get_default_dataset_config(): ...
+
+    @abstractmethod
+    def _load_raw_file(self, tracker, seq, is_gt): ...
+
+    @_timing.time
+    @abstractmethod
+    def get_preprocessed_seq_data(self, raw_data, cls): ...
+
+    @abstractmethod
+    def _calculate_similarities(self, gt_dets_t, tracker_dets_t): ...
+
+    # Helper functions for all datasets:
+
+    @classmethod
+    def get_class_name(cls):
+        """Return the name of the class this is called on."""
+        return cls.__name__
+
+    def get_name(self):
+        """Return the dataset name, which is the class name."""
+        return self.get_class_name()
+
+    def get_output_fol(self, tracker):
+        """Return the output folder for a tracker: output_fol/tracker/output_sub_fol."""
+        return os.path.join(self.output_fol, tracker, self.output_sub_fol)
+
+    def get_display_name(self, tracker):
+        """Can be overwritten if the trackers name (in files) is different to how it should be displayed.
+        By default this method just returns the trackers name as is.
+        """
+        return tracker
+
+    def get_eval_info(self):
+        """Return info about the dataset needed for the Evaluator"""
+        return self.tracker_list, self.seq_list, self.class_list
+
+    @_timing.time
+    def get_raw_seq_data(self, tracker, seq):
+        """Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
+        Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
+        A later function (get_preprocessed_seq_data) will perform such preprocessing and extract relevant information for
+        the evaluation of each class.
+
+        This returns a dict which contains the fields:
+        [num_timesteps]: integer
+        [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
+                                                                list (for each timestep) of 1D NDArrays (for each det).
+        [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
+        [similarity_scores]: list (for each timestep) of 2D NDArrays.
+        [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).
+
+        gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.
+
+        Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
+        independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
+        masks vs 2D boxes vs 3D boxes).
+        We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
+        we don't wish to calculate this twice.
+        We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
+        calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
+        """
+        # Load raw data.
+        raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
+        raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
+        raw_data = {**raw_tracker_data, **raw_gt_data}  # Merges dictionaries (gt wins on shared keys)
+
+        # Calculate similarities for each timestep.
+        similarity_scores = []
+        for gt_dets_t, tracker_dets_t in zip(
+            raw_data["gt_dets"], raw_data["tracker_dets"]
+        ):
+            ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
+            similarity_scores.append(ious)
+        raw_data["similarity_scores"] = similarity_scores
+        return raw_data
+
+    @staticmethod
+    def _load_simple_text_file(
+        file,
+        time_col=0,
+        id_col=None,
+        remove_negative_ids=False,
+        valid_filter=None,
+        crowd_ignore_filter=None,
+        convert_filter=None,
+        is_zipped=False,
+        zip_file=None,
+        force_delimiters=None,
+    ):
+        """Function that loads data which is in a commonly used text file format.
+        Assumes each det is given by one row of a text file.
+        There is no limit to the number or meaning of each column,
+        however one column needs to give the timestep of each det (time_col) which is default col 0.
+
+        The file dialect (delimiter, num cols, etc) is determined automatically.
+        This function automatically separates dets by timestep,
+        and is much faster than alternatives such as np.loadtxt or pandas.
+
+        If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
+        These are not excluded from ignore data.
+
+        valid_filter can be used to only include certain classes.
+        It is a dict with ints as keys, and lists as values,
+        such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
+        If None, all classes are included.
+
+        crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
+
+        convert_filter can be used to convert value read to another format.
+        This is used most commonly to convert classes given as string to a class id.
+        This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
+
+        Optionally, input files could be a zip of multiple text files for storage efficiency.
+
+        Returns read_data and ignore_data.
+        Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
+        Note that all data is returned as strings, and must be converted to float/int later if needed.
+        Note that timesteps will not be present in the returned dict keys if there are no dets for them.
+        Raises TrackEvalException if the file is missing or cannot be parsed.
+        """
+
+        if remove_negative_ids and id_col is None:
+            raise TrackEvalException(
+                "remove_negative_ids is True, but id_col is not given."
+            )
+        if crowd_ignore_filter is None:
+            crowd_ignore_filter = {}
+        if convert_filter is None:
+            convert_filter = {}
+        fp = archive = None  # Opened inside the try; released in the finally block.
+        try:
+            if is_zipped:  # Either open file directly or within a zip.
+                if zip_file is None:
+                    raise TrackEvalException(
+                        "is_zipped set to True, but no zip_file is given."
+                    )
+                archive = zipfile.ZipFile(os.path.join(zip_file), "r")
+                fp = io.TextIOWrapper(archive.open(file, "r"))
+            else:
+                fp = open(file)
+            read_data = {}
+            crowd_ignore_data = {}
+            fp.seek(0, os.SEEK_END)
+            # check if file is empty
+            if fp.tell():
+                fp.seek(0)
+                dialect = csv.Sniffer().sniff(
+                    fp.readline(), delimiters=force_delimiters
+                )  # Auto determine structure.
+                dialect.skipinitialspace = True  # Ignore spaces right after a delimiter
+                fp.seek(0)
+                reader = csv.reader(fp, dialect)
+                for row in reader:
+                    try:
+                        # Deal with extra trailing spaces at the end of rows
+                        if row[-1] == "":
+                            row = row[:-1]
+                        timestep = str(int(float(row[time_col])))
+                        # Read ignore regions separately.
+                        is_ignored = False
+                        for ignore_key, ignore_value in crowd_ignore_filter.items():
+                            if row[ignore_key].lower() in ignore_value:
+                                # Convert values in one column (e.g. string to id)
+                                for col, mapping in convert_filter.items():
+                                    row[col] = mapping[row[col].lower()]
+                                # Save data separated by timestep.
+                                crowd_ignore_data.setdefault(timestep, []).append(row)
+                                is_ignored = True
+                        # A det in an ignore region cannot also be a normal det.
+                        if is_ignored:
+                            continue
+                        # Exclude dets that fail any valid_filter entry (skips the whole row).
+                        if valid_filter is not None and any(
+                            row[k].lower() not in v for k, v in valid_filter.items()
+                        ):
+                            continue
+                        if remove_negative_ids and int(float(row[id_col])) < 0:
+                            continue
+                        # Convert values in one column (e.g. string to id)
+                        for col, mapping in convert_filter.items():
+                            row[col] = mapping[row[col].lower()]
+                        # Save data separated by timestep.
+                        read_data.setdefault(timestep, []).append(row)
+                    except Exception:
+                        exc_str_init = (
+                            "In file %s the following line cannot be read correctly: \n"
+                            % os.path.basename(file)
+                        )
+                        exc_str = " ".join([exc_str_init] + row)
+                        raise TrackEvalException(exc_str)
+        except Exception:
+            print("Error loading file: %s, printing traceback." % file)
+            traceback.print_exc()
+            raise TrackEvalException(
+                "File %s cannot be read because it is either not present or invalidly formatted"
+                % os.path.basename(file)
+            )
+        finally:
+            # Always release the file handle (and zip archive), even on error.
+            if fp is not None:
+                fp.close()
+            if archive is not None:
+                archive.close()
+        return read_data, crowd_ignore_data
+
+    @staticmethod
+    def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
+        """Calculates the IOU (intersection over union) between two arrays of segmentation masks.
+        If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
+        arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
+        If do_ioa (intersection over area), then calculates the intersection over the area of masks1 - this is commonly
+        used to determine if detections are within crowd ignore region.
+        :param masks1:  first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
+                        else pycocotools rle encoded format)
+        :param masks2:  second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
+                        else pycocotools rle encoded format)
+        :param is_encoded: whether the input is in pycocotools rle encoded format
+        :param do_ioa: whether to perform IoA computation
+        :return: the IoU/IoA scores
+        """
+
+        # Only loaded when run to reduce minimum requirements
+        from pycocotools import mask as mask_utils
+
+        # use pycocotools for run length encoding of masks
+        if not is_encoded:
+            masks1 = mask_utils.encode(
+                np.array(np.transpose(masks1, (1, 2, 0)), order="F")
+            )
+            masks2 = mask_utils.encode(
+                np.array(np.transpose(masks2, (1, 2, 0)), order="F")
+            )
+
+        # use pycocotools for iou computation of rle encoded masks
+        ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
+        if len(masks1) == 0 or len(masks2) == 0:
+            ious = np.asarray(ious).reshape(len(masks1), len(masks2))
+        assert (ious >= 0 - np.finfo("float").eps).all()
+        assert (ious <= 1 + np.finfo("float").eps).all()
+
+        return ious
+
+    @staticmethod
+    def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
+        """Calculates the IOU (intersection over union) between two arrays of boxes.
+        Allows variable box formats ('xywh' and 'x0y0x1y1').
+        If do_ioa (intersection over area), then calculates the intersection over the area of boxes1 - this is commonly
+        used to determine if detections are within crowd ignore region.
+        Raises TrackEvalException for any other box_format.
+        """
+        if box_format == "xywh":  # exact match; `in` would also accept substrings
+            # layout: (x0, y0, w, h)
+            bboxes1 = deepcopy(bboxes1)
+            bboxes2 = deepcopy(bboxes2)
+
+            bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
+            bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
+            bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
+            bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
+        elif box_format != "x0y0x1y1":
+            raise (TrackEvalException("box_format %s is not implemented" % box_format))
+
+        # layout: (x0, y0, x1, y1)
+        min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
+        max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
+        intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
+            min_[..., 3] - max_[..., 1], 0
+        )
+        area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
+            bboxes1[..., 3] - bboxes1[..., 1]
+        )
+
+        if do_ioa:
+            ioas = np.zeros_like(intersection)
+            valid_mask = area1 > 0 + np.finfo("float").eps
+            ioas[valid_mask, :] = (
+                intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
+            )
+
+            return ioas
+        else:
+            area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
+                bboxes2[..., 3] - bboxes2[..., 1]
+            )
+            union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
+            intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
+            intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
+            intersection[union <= 0 + np.finfo("float").eps] = 0
+            union[union <= 0 + np.finfo("float").eps] = 1
+            ious = intersection / union
+            return ious
+
+    @staticmethod
+    def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
+        """Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
+        measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
+        The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
+        threshold corresponds to a 1m distance threshold for TPs.
+        """
+        dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
+        sim = np.maximum(0, 1 - dist / zero_distance)
+        return sim
+
+    @staticmethod
+    def _check_unique_ids(data, after_preproc=False):
+        """Check the requirement that the tracker_ids and gt_ids are unique per timestep.
+        Raises TrackEvalException naming the duplicated ids if an id repeats within a timestep.
+        """
+        gt_ids = data["gt_ids"]
+        tracker_ids = data["tracker_ids"]
+        for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
+            if len(tracker_ids_t) > 0:
+                unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
+                if np.max(counts) != 1:
+                    duplicate_ids = unique_ids[counts > 1]
+                    exc_str_init = (
+                        "Tracker predicts the same ID more than once in a single timestep "
+                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
+                    )
+                    exc_str = (
+                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
+                    )
+                    if after_preproc:
+                        exc_str += (
+                            "\n Note that this error occurred after preprocessing (but not before), "
+                            "so ids may not be as in file, and something seems wrong with preproc."
+                        )
+                    raise TrackEvalException(exc_str)
+            if len(gt_ids_t) > 0:
+                unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
+                if np.max(counts) != 1:
+                    duplicate_ids = unique_ids[counts > 1]
+                    exc_str_init = (
+                        "Ground-truth has the same ID more than once in a single timestep "
+                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
+                    )
+                    exc_str = (
+                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
+                    )
+                    if after_preproc:
+                        exc_str += (
+                            "\n Note that this error occurred after preprocessing (but not before), "
+                            "so ids may not be as in file, and something seems wrong with preproc."
+                        )
+                    raise TrackEvalException(exc_str)
--- a/sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py
+++ b/sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py
@@ -0,0 +1,891 @@
+# flake8: noqa
+
+import itertools
+import json
+import os
+from collections import defaultdict
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+from .. import _timing, utils
+from ..utils import TrackEvalException
+from ._base_dataset import _BaseDataset
+
+
+class TAO_OW(_BaseDataset):
+    """Dataset class for TAO tracking"""
+
+    @staticmethod
+    def get_default_dataset_config():
+        """Return the default configuration of the TAO_OW dataset.
+
+        Folder defaults are built from utils.get_code_path().
+        """
+        root = utils.get_code_path()
+        gt_folder = os.path.join(root, "data/gt/tao/tao_training")
+        trackers_folder = os.path.join(root, "data/trackers/tao/tao_training")
+        return {
+            "GT_FOLDER": gt_folder,  # Location of GT data
+            "TRACKERS_FOLDER": trackers_folder,  # Trackers location
+            "OUTPUT_FOLDER": None,  # Where to save eval results (if None, same as TRACKERS_FOLDER)
+            "TRACKERS_TO_EVAL": None,  # Filenames of trackers to eval (if None, all in folder)
+            "CLASSES_TO_EVAL": None,  # Classes to eval (if None, all classes)
+            "SPLIT_TO_EVAL": "training",  # Valid: 'training', 'val'
+            "PRINT_CONFIG": True,  # Whether to print current config
+            "TRACKER_SUB_FOLDER": "data",  # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
+            "OUTPUT_SUB_FOLDER": "",  # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
+            "TRACKER_DISPLAY_NAMES": None,  # Names of trackers to display, if None: TRACKERS_TO_EVAL
+            "MAX_DETECTIONS": 300,  # Number of maximal allowed detections per image (0 for unlimited)
+            "SUBSET": "all",
+        }
+
+    def __init__(self, config=None):
+        """Initialise dataset: load the GT and tracker json files and build per-video lookup tables"""
+        super().__init__()
+        # Fill non-given config values with defaults
+        self.config = utils.init_config(
+            config, self.get_default_dataset_config(), self.get_name()
+        )
+        self.gt_fol = self.config["GT_FOLDER"]
+        self.tracker_fol = self.config["TRACKERS_FOLDER"]
+        self.should_classes_combine = True
+        self.use_super_categories = False
+
+        self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
+        self.output_fol = self.config["OUTPUT_FOLDER"]
+        if self.output_fol is None:
+            self.output_fol = self.tracker_fol
+        self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
+
+        gt_dir_files = [
+            file for file in os.listdir(self.gt_fol) if file.endswith(".json")
+        ]
+        if len(gt_dir_files) != 1:
+            raise TrackEvalException(
+                self.gt_fol + " does not contain exactly one json file."
+            )
+
+        with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
+            self.gt_data = json.load(f)
+
+        self.subset = self.config["SUBSET"]
+        if self.subset != "all":
+            # Split GT data into `known`, `unknown` or `distractor`
+            self._split_known_unknown_distractor()
+            self.gt_data = self._filter_gt_data(self.gt_data)
+
+        # merge categories marked with a merged tag in TAO dataset
+        self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])
+
+        # Get sequences to eval and sequence information ('/' in video names becomes '-')
+        self.seq_list = [
+            vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
+        ]
+        self.seq_name_to_seq_id = {
+            vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
+        }
+        # compute mappings from videos to annotation data
+        self.videos_to_gt_tracks, self.videos_to_gt_images = self._compute_vid_mappings(
+            self.gt_data["annotations"]
+        )
+        # compute sequence lengths (number of GT images listed per video)
+        self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
+        for img in self.gt_data["images"]:
+            self.seq_lengths[img["video_id"]] += 1
+        self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings()
+        self.seq_to_classes = {
+            vid["id"]: {
+                "pos_cat_ids": list(
+                    {
+                        track["category_id"]
+                        for track in self.videos_to_gt_tracks[vid["id"]]
+                    }
+                ),
+                "neg_cat_ids": vid["neg_category_ids"],
+                "not_exhaustively_labeled_cat_ids": vid["not_exhaustive_category_ids"],
+            }
+            for vid in self.gt_data["videos"]
+        }
+
+        # Get classes to eval
+        considered_vid_ids = [self.seq_name_to_seq_id[vid] for vid in self.seq_list]
+        seen_cats = set(
+            [
+                cat_id
+                for vid_id in considered_vid_ids
+                for cat_id in self.seq_to_classes[vid_id]["pos_cat_ids"]
+            ]
+        )
+        # only classes with ground truth are evaluated in TAO
+        self.valid_classes = [
+            cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
+        ]
+        # cls_name_to_cls_id_map = {cls['name']: cls['id'] for cls in self.gt_data['categories']}
+
+        if self.config["CLASSES_TO_EVAL"]:
+            # self.class_list = [cls.lower() if cls.lower() in self.valid_classes else None
+            #                    for cls in self.config['CLASSES_TO_EVAL']]
+            self.class_list = ["object"]  # class-agnostic: the requested class names are not used
+            if not all(self.class_list):  # cannot trigger: the list above has no falsy entry
+                raise TrackEvalException(
+                    "Attempted to evaluate an invalid class. Only classes "
+                    + ", ".join(self.valid_classes)
+                    + " are valid (classes present in ground truth data)."
+                )
+        else:
+            # self.class_list = [cls for cls in self.valid_classes]
+            self.class_list = ["object"]  # class-agnostic
+        # self.class_name_to_class_id = {k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list}
+        self.class_name_to_class_id = {"object": 1}  # class-agnostic
+
+        # Get trackers to eval
+        if self.config["TRACKERS_TO_EVAL"] is None:
+            self.tracker_list = os.listdir(self.tracker_fol)  # every entry in the folder
+        else:
+            self.tracker_list = self.config["TRACKERS_TO_EVAL"]
+
+        if self.config["TRACKER_DISPLAY_NAMES"] is None:
+            self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
+        elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
+            len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
+        ):
+            self.tracker_to_disp = dict(
+                zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
+            )
+        else:
+            raise TrackEvalException(
+                "List of tracker files and tracker display names do not match."
+            )
+
+        self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
+
+        for tracker in self.tracker_list:
+            tr_dir_files = [
+                file
+                for file in os.listdir(
+                    os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
+                )
+                if file.endswith(".json")
+            ]
+            if len(tr_dir_files) != 1:
+                raise TrackEvalException(
+                    os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
+                    + " does not contain exactly one json file."
+                )
+            with open(
+                os.path.join(
+                    self.tracker_fol, tracker, self.tracker_sub_fol, tr_dir_files[0]
+                )
+            ) as f:
+                curr_data = json.load(f)
+
+            # limit detections if MAX_DETECTIONS > 0
+            if self.config["MAX_DETECTIONS"]:
+                curr_data = self._limit_dets_per_image(curr_data)
+
+            # fill missing video ids
+            self._fill_video_ids_inplace(curr_data)
+
+            # make track ids unique over whole evaluation set
+            self._make_track_ids_unique(curr_data)
+
+            # merge categories marked with a merged tag in TAO dataset
+            self._merge_categories(curr_data)
+
+            # get tracker sequence information
+            curr_videos_to_tracker_tracks, curr_videos_to_tracker_images = (
+                self._compute_vid_mappings(curr_data)
+            )
+            self.tracker_data[tracker]["vids_to_tracks"] = curr_videos_to_tracker_tracks
+            self.tracker_data[tracker]["vids_to_images"] = curr_videos_to_tracker_images
+
+    def get_display_name(self, tracker):
+        return self.tracker_to_disp[tracker]  # display-name mapping built in __init__
+
+    def _load_raw_file(self, tracker, seq, is_gt):
+        """Load a file (gt or tracker) in the TAO format
+
+        If is_gt, this returns a dict which contains the fields:
+        [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
+        [gt_dets]: list (for each timestep) of lists of detections.
+        [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
+                                keys and corresponding segmentations as values) for each track
+        [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values
+                                as keys and lists (for each track) as values
+
+        if not is_gt, this returns a dict which contains the fields:
+        [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
+        [tracker_dets]: list (for each timestep) of lists of detections.
+        [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
+                                keys and corresponding segmentations as values) for each track
+        [classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values
+                                                                                           as keys and lists as values
+        [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
+        """
+        seq_id = self.seq_name_to_seq_id[seq]
+        # File location
+        if is_gt:
+            imgs = self.videos_to_gt_images[seq_id]
+        else:
+            imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
+
+        # Convert data to required format
+        num_timesteps = self.seq_lengths[seq_id]
+        img_to_timestep = self.seq_to_images_to_timestep[seq_id]
+        data_keys = ["ids", "classes", "dets"]
+        if not is_gt:
+            data_keys += ["tracker_confidences"]
+        raw_data = {key: [None] * num_timesteps for key in data_keys}
+        for img in imgs:
+            # some tracker data contains images without any ground truth information, these are ignored
+            try:
+                t = img_to_timestep[img["id"]]
+            except KeyError:
+                continue
+            annotations = img["annotations"]
+            raw_data["dets"][t] = np.atleast_2d(
+                [ann["bbox"] for ann in annotations]
+            ).astype(float)
+            raw_data["ids"][t] = np.atleast_1d(
+                [ann["track_id"] for ann in annotations]
+            ).astype(int)
+            raw_data["classes"][t] = np.atleast_1d([1 for _ in annotations]).astype(
+                int
+            )  # class-agnostic
+            if not is_gt:
+                raw_data["tracker_confidences"][t] = np.atleast_1d(
+                    [ann["score"] for ann in annotations]
+                ).astype(float)
+
+        for t, d in enumerate(raw_data["dets"]):
+            if d is None:
+                raw_data["dets"][t] = np.empty((0, 4)).astype(float)
+                raw_data["ids"][t] = np.empty(0).astype(int)
+                raw_data["classes"][t] = np.empty(0).astype(int)
+                if not is_gt:
+                    raw_data["tracker_confidences"][t] = np.empty(0)
+
+        if is_gt:
+            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
+        else:
+            key_map = {
+                "ids": "tracker_ids",
+                "classes": "tracker_classes",
+                "dets": "tracker_dets",
+            }
+        for k, v in key_map.items():
+            raw_data[v] = raw_data.pop(k)
+
+        # all_classes = [self.class_name_to_class_id[cls] for cls in self.class_list]
+        all_classes = [1]  # class-agnostic
+
+        if is_gt:
+            classes_to_consider = all_classes
+            all_tracks = self.videos_to_gt_tracks[seq_id]
+        else:
+            # classes_to_consider = self.seq_to_classes[seq_id]['pos_cat_ids'] \
+            #                       + self.seq_to_classes[seq_id]['neg_cat_ids']
+            classes_to_consider = all_classes  # class-agnostic
+            all_tracks = self.tracker_data[tracker]["vids_to_tracks"][seq_id]
+
+        # classes_to_tracks = {cls: [track for track in all_tracks if track['category_id'] == cls]
+        #                      if cls in classes_to_consider else [] for cls in all_classes}
+        classes_to_tracks = {
+            cls: [track for track in all_tracks] if cls in classes_to_consider else []
+            for cls in all_classes
+        }  # class-agnostic
+
+        # mapping from classes to track information
+        raw_data["classes_to_tracks"] = {
+            cls: [
+                {
+                    det["image_id"]: np.atleast_1d(det["bbox"])
+                    for det in track["annotations"]
+                }
+                for track in tracks
+            ]
+            for cls, tracks in classes_to_tracks.items()
+        }
+        raw_data["classes_to_track_ids"] = {
+            cls: [track["id"] for track in tracks]
+            for cls, tracks in classes_to_tracks.items()
+        }
+        raw_data["classes_to_track_areas"] = {
+            cls: [track["area"] for track in tracks]
+            for cls, tracks in classes_to_tracks.items()
+        }
+        raw_data["classes_to_track_lengths"] = {
+            cls: [len(track["annotations"]) for track in tracks]
+            for cls, tracks in classes_to_tracks.items()
+        }
+
+        if not is_gt:
+            raw_data["classes_to_dt_track_scores"] = {
+                cls: np.array(
+                    [
+                        np.mean([float(x["score"]) for x in track["annotations"]])
+                        for track in tracks
+                    ]
+                )
+                for cls, tracks in classes_to_tracks.items()
+            }
+
+        if is_gt:
+            key_map = {
+                "classes_to_tracks": "classes_to_gt_tracks",
+                "classes_to_track_ids": "classes_to_gt_track_ids",
+                "classes_to_track_lengths": "classes_to_gt_track_lengths",
+                "classes_to_track_areas": "classes_to_gt_track_areas",
+            }
+        else:
+            key_map = {
+                "classes_to_tracks": "classes_to_dt_tracks",
+                "classes_to_track_ids": "classes_to_dt_track_ids",
+                "classes_to_track_lengths": "classes_to_dt_track_lengths",
+                "classes_to_track_areas": "classes_to_dt_track_areas",
+            }
+        for k, v in key_map.items():
+            raw_data[v] = raw_data.pop(k)
+
+        raw_data["num_timesteps"] = num_timesteps
+        raw_data["neg_cat_ids"] = self.seq_to_classes[seq_id]["neg_cat_ids"]
+        raw_data["not_exhaustively_labeled_cls"] = self.seq_to_classes[seq_id][
+            "not_exhaustively_labeled_cat_ids"
+        ]
+        raw_data["seq"] = seq
+        return raw_data
+
+    @_timing.time
+    def get_preprocessed_seq_data(self, raw_data, cls):
+        """Preprocess data for a single sequence for a single class ready for evaluation.
+        Inputs:
+             - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
+             - cls is the class to be evaluated.
+        Outputs:
+             - data is a dict containing all of the information that metrics need to perform evaluation.
+                It contains the following fields:
+                    [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
+                    [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
+                    [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
+                    [similarity_scores]: list (for each timestep) of 2D NDArrays.
+        Notes:
+            General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
+                1) Extract only detections relevant for the class to be evaluated (including distractor detections).
+                2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
+                    distractor class, or otherwise marked as to be removed.
+                3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
+                    other criteria (e.g. are too small).
+                4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
+            After the above preprocessing steps, this function also calculates the number of gt and tracker detections
+                and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
+                unique within each timestep.
+        TAO:
+            In TAO, the 4 preproc steps are as follow:
+                1) All classes present in the ground truth data are evaluated separately.
+                2) No matched tracker detections are removed.
+                3) Unmatched tracker detections are removed if there is not ground truth data and the class does not
+                    belong to the categories marked as negative for this sequence. Additionally, unmatched tracker
+                    detections for classes which are marked as not exhaustively labeled are removed.
+                4) No gt detections are removed.
+            Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
+            and the tracks from the tracker data are sorted according to the tracker confidence.
+        """
+        # Resolve the class name to its id and look up how this sequence labels that class;
+        # both flags drive the removal of unmatched tracker detections below.
+        cls_id = self.class_name_to_class_id[cls]
+        is_not_exhaustively_labeled = cls_id in raw_data["not_exhaustively_labeled_cls"]
+        is_neg_category = cls_id in raw_data["neg_cat_ids"]
+
+        data_keys = [
+            "gt_ids",
+            "tracker_ids",
+            "gt_dets",
+            "tracker_dets",
+            "tracker_confidences",
+            "similarity_scores",
+        ]
+        # One slot per timestep for every per-frame field.
+        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
+        unique_gt_ids = []
+        unique_tracker_ids = []
+        num_gt_dets = 0
+        num_tracker_dets = 0
+        for t in range(raw_data["num_timesteps"]):
+            # Only extract relevant dets for this class for preproc and eval (cls)
+            gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
+            gt_class_mask = gt_class_mask.astype(bool)
+            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
+            gt_dets = raw_data["gt_dets"][t][gt_class_mask]
+
+            tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
+            tracker_class_mask = tracker_class_mask.astype(bool)
+            tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
+            tracker_dets = raw_data["tracker_dets"][t][tracker_class_mask]
+            tracker_confidences = raw_data["tracker_confidences"][t][tracker_class_mask]
+            # Rows are indexed by gt detections, columns by tracker detections.
+            similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
+                :, tracker_class_mask
+            ]
+
+            # Match tracker and gt dets (with hungarian algorithm).
+            # Every tracker detection starts out as unmatched; matched columns are removed below.
+            unmatched_indices = np.arange(tracker_ids.shape[0])
+            if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0:
+                matching_scores = similarity_scores.copy()
+                # Similarities below the 0.5 threshold are zeroed so they can never count as a match.
+                matching_scores[matching_scores < 0.5 - np.finfo("float").eps] = 0
+                # NOTE(review): presumably scipy.optimize.linear_sum_assignment (import not visible in
+                # this chunk); it minimises cost, hence the negated scores -- confirm.
+                match_rows, match_cols = linear_sum_assignment(-matching_scores)
+                # Discard assignments that only pair up zeroed (sub-threshold) entries.
+                actually_matched_mask = (
+                    matching_scores[match_rows, match_cols] > 0 + np.finfo("float").eps
+                )
+                match_cols = match_cols[actually_matched_mask]
+                unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0)
+
+            # Decide which unmatched tracker detections to drop:
+            #  - no gt of this class in the frame and the class is not a listed negative category, or
+            #  - the class is not exhaustively labeled in this sequence.
+            # Otherwise all tracker detections are kept.
+            if gt_ids.shape[0] == 0 and not is_neg_category:
+                to_remove_tracker = unmatched_indices
+            elif is_not_exhaustively_labeled:
+                to_remove_tracker = unmatched_indices
+            else:
+                to_remove_tracker = np.array([], dtype=int)
+
+            # remove all unwanted unmatched tracker detections
+            data["tracker_ids"][t] = np.delete(tracker_ids, to_remove_tracker, axis=0)
+            data["tracker_dets"][t] = np.delete(tracker_dets, to_remove_tracker, axis=0)
+            data["tracker_confidences"][t] = np.delete(
+                tracker_confidences, to_remove_tracker, axis=0
+            )
+            # Keep the similarity matrix columns aligned with the remaining tracker detections.
+            similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1)
+
+            data["gt_ids"][t] = gt_ids
+            data["gt_dets"][t] = gt_dets
+            data["similarity_scores"][t] = similarity_scores
+
+            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
+            unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
+            num_tracker_dets += len(data["tracker_ids"][t])
+            num_gt_dets += len(data["gt_ids"][t])
+
+        # Re-label IDs such that there are no empty IDs
+        # The *_id_map arrays are lookup tables indexed by the original id; entries for ids that
+        # never occur stay NaN and are never read.
+        if len(unique_gt_ids) > 0:
+            unique_gt_ids = np.unique(unique_gt_ids)
+            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
+            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
+            for t in range(raw_data["num_timesteps"]):
+                if len(data["gt_ids"][t]) > 0:
+                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
+        if len(unique_tracker_ids) > 0:
+            unique_tracker_ids = np.unique(unique_tracker_ids)
+            tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
+            tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
+            for t in range(raw_data["num_timesteps"]):
+                if len(data["tracker_ids"][t]) > 0:
+                    data["tracker_ids"][t] = tracker_id_map[
+                        data["tracker_ids"][t]
+                    ].astype(int)
+
+        # Record overview statistics.
+        data["num_tracker_dets"] = num_tracker_dets
+        data["num_gt_dets"] = num_gt_dets
+        data["num_tracker_ids"] = len(unique_tracker_ids)
+        data["num_gt_ids"] = len(unique_gt_ids)
+        data["num_timesteps"] = raw_data["num_timesteps"]
+        data["seq"] = raw_data["seq"]
+
+        # get track representations
+        # These per-class track lists are copied from raw_data as-is; the per-frame filtering
+        # performed above is not applied to them.
+        data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
+        data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
+        data["gt_track_lengths"] = raw_data["classes_to_gt_track_lengths"][cls_id]
+        data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
+        data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
+        data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
+        data["dt_track_lengths"] = raw_data["classes_to_dt_track_lengths"][cls_id]
+        data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
+        data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
+        data["not_exhaustively_labeled"] = is_not_exhaustively_labeled
+        data["iou_type"] = "bbox"
+
+        # sort tracker data tracks by tracker confidence scores
+        # Scores are negated to get descending order; mergesort is stable, so tracks with equal
+        # scores keep their original relative order.
+        if data["dt_tracks"]:
+            idx = np.argsort(
+                [-score for score in data["dt_track_scores"]], kind="mergesort"
+            )
+            data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
+            data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
+            data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
+            data["dt_track_lengths"] = [data["dt_track_lengths"][i] for i in idx]
+            data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
+        # Ensure that ids are unique per timestep.
+        self._check_unique_ids(data)
+
+        return data
+
+    def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
+        """Return the similarity scores between the gt and tracker detections of one timestep.
+
+        Delegates directly to self._calculate_box_ious with its default options.
+        """
+        return self._calculate_box_ious(gt_dets_t, tracker_dets_t)
+
+    def _merge_categories(self, annotations):
+        """
+        Rewrites the category of every annotation whose category was merged into another one.
+        Categories in self.gt_data carrying a "merged" entry list the categories folded into them.
+        Adapted from https://github.com/TAO-Dataset
+        :param annotations: the annotations in which the classes should be merged (modified in place)
+        :return: None
+        """
+        # id of a merged-away category -> id of the category that absorbed it
+        remap = {
+            absorbed["id"]: target["id"]
+            for target in self.gt_data["categories"]
+            for absorbed in target.get("merged", ())
+        }
+
+        for ann in annotations:
+            current = ann["category_id"]
+            ann["category_id"] = remap.get(current, current)
+
+    def _compute_vid_mappings(self, annotations):
+        """
+        Computes mappings from videos to corresponding tracks and images.
+
+        Every annotation is modified in place: an "area" entry equal to bbox[2] * bbox[3] is added.
+        Within a video, tracks and images are listed in order of first appearance in `annotations`.
+        Each track's annotations are sorted by the frame index of their image, and the track's
+        "area" is the mean of its annotation areas. Every video in self.gt_data["videos"] gets an
+        entry in both mappings (an empty list when it has no annotations).
+
+        :param annotations: the annotations for which the mapping should be generated
+        :return: the video-to-track-mapping, the video-to-image-mapping
+        """
+        vids_to_tracks = {}
+        vids_to_imgs = {}
+        vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
+
+        # compute a mapping from image IDs to images
+        images = {image["id"]: image for image in self.gt_data["images"]}
+
+        # Lookup tables keyed by (video id, track id) / (video id, image id). They replace a
+        # per-annotation linear scan over the lists built so far, while the lists themselves
+        # still record first-appearance order.
+        track_lookup = {}
+        img_lookup = {}
+
+        for ann in annotations:
+            ann["area"] = ann["bbox"][2] * ann["bbox"][3]
+
+            vid = ann["video_id"]
+            vid_tracks = vids_to_tracks.setdefault(vid, [])
+            vid_imgs = vids_to_imgs.setdefault(vid, [])
+
+            # Fill in vids_to_tracks
+            tid = ann["track_id"]
+            track = track_lookup.get((vid, tid))
+            if track is None:
+                # The first annotation of a track determines the track's category.
+                track = {
+                    "id": tid,
+                    "category_id": ann["category_id"],
+                    "video_id": vid,
+                    "annotations": [],
+                }
+                track_lookup[(vid, tid)] = track
+                vid_tracks.append(track)
+            track["annotations"].append(ann)
+
+            # Fill in vids_to_imgs
+            img_id = ann["image_id"]
+            img = img_lookup.get((vid, img_id))
+            if img is None:
+                img = {"id": img_id, "annotations": []}
+                img_lookup[(vid, img_id)] = img
+                vid_imgs.append(img)
+            img["annotations"].append(ann)
+
+        # sort annotations by frame index and compute track area
+        for tracks in vids_to_tracks.values():
+            for track in tracks:
+                track["annotations"] = sorted(
+                    track["annotations"],
+                    key=lambda x: images[x["image_id"]]["frame_index"],
+                )
+                # Compute average area (a track always holds at least one annotation)
+                track["area"] = sum(x["area"] for x in track["annotations"]) / len(
+                    track["annotations"]
+                )
+
+        # Ensure all videos are present
+        for vid_id in vid_ids:
+            vids_to_tracks.setdefault(vid_id, [])
+            vids_to_imgs.setdefault(vid_id, [])
+
+        return vids_to_tracks, vids_to_imgs
+
+    def _compute_image_to_timestep_mappings(self):
+        """
+        Builds, for every video in self.gt_data, a dict from image id to timestep.
+        The timestep of an image is its position after sorting the video's ground-truth images
+        (self.videos_to_gt_images) by frame index.
+        :return: the image-to-timestep-mapping, keyed by video id
+        """
+        frame_index_of = {
+            image["id"]: image["frame_index"] for image in self.gt_data["images"]
+        }
+
+        seq_to_imgs_to_timestep = {}
+        for video in self.gt_data["videos"]:
+            vid = video["id"]
+            ordered_ids = sorted(
+                (img["id"] for img in self.videos_to_gt_images[vid]),
+                key=lambda img_id: frame_index_of[img_id],
+            )
+            seq_to_imgs_to_timestep[vid] = {
+                img_id: timestep for timestep, img_id in enumerate(ordered_ids)
+            }
+
+        return seq_to_imgs_to_timestep
+
+    def _limit_dets_per_image(self, annotations):
+        """
+        Keeps at most config['MAX_DETECTIONS'] detections per image, preferring the highest scores.
+        The result is grouped by image in order of first appearance; images within the limit keep
+        their detections in input order. Adapted from https://github.com/TAO-Dataset/
+        :param annotations: the annotations in which the detections should be limited
+        :return: the annotations with limited detections
+        """
+        cap = self.config["MAX_DETECTIONS"]
+
+        per_image = defaultdict(list)
+        for det in annotations:
+            per_image[det["image_id"]].append(det)
+
+        kept = []
+        for dets in per_image.values():
+            if len(dets) > cap:
+                # Too many detections for this image: keep only the top-scoring ones.
+                dets = sorted(dets, key=lambda d: d["score"], reverse=True)[:cap]
+            kept.extend(dets)
+        return kept
+
+    def _fill_video_ids_inplace(self, annotations):
+        """
+        Adds a "video_id" to every annotation lacking one, looked up through the annotation's
+        image in self.gt_data["images"]. Adapted from https://github.com/TAO-Dataset/
+        :param annotations: the annotations for which the videos IDs should be filled inplace
+        :return: None
+        """
+        lacking = [ann for ann in annotations if "video_id" not in ann]
+        if not lacking:
+            # Nothing to fill in, so the image lookup table is not needed.
+            return
+
+        video_of_image = {}
+        for image in self.gt_data["images"]:
+            video_of_image[image["id"]] = image["video_id"]
+
+        for ann in lacking:
+            ann["video_id"] = video_of_image[ann["image_id"]]
+
+    @staticmethod
+    def _make_track_ids_unique(annotations):
+        """
+        Makes the track IDs unique over the whole annotation set. Adapted from https://github.com/TAO-Dataset/
+
+        A track id that occurs in more than one video is replaced, in place, by a fresh id per
+        (track id, video id) pair. Fresh ids are allocated above the largest existing track id,
+        in order of first appearance.
+
+        :param annotations: the annotation set (modified in place)
+        :return: the number of updated IDs, i.e. of original track ids that were shared across videos
+        """
+        track_id_videos = {}
+        track_ids_to_update = set()
+        max_track_id = 0
+        for ann in annotations:
+            t = ann["track_id"]
+            if t not in track_id_videos:
+                track_id_videos[t] = ann["video_id"]
+
+            if ann["video_id"] != track_id_videos[t]:
+                # Track id is assigned to multiple videos
+                track_ids_to_update.add(t)
+            max_track_id = max(max_track_id, t)
+
+        if track_ids_to_update:
+            # Hand out new ids lazily: the first lookup of a (track id, video id) pair draws the
+            # next free id, later lookups of the same pair reuse it.
+            next_id = itertools.count(max_track_id + 1)
+            new_track_ids = defaultdict(lambda: next(next_id))
+            for ann in annotations:
+                t = ann["track_id"]
+                v = ann["video_id"]
+                if t in track_ids_to_update:
+                    ann["track_id"] = new_track_ids[t, v]
+        return len(track_ids_to_update)
+
+    def _split_known_unknown_distractor(self):
+        """Populate self.knowns, self.distractors and self.unknowns with category-id sets.
+
+        The known and distractor ids are hard-coded; the unknowns are every id in 1..1999 that is
+        neither known nor a distractor.
+        """
+        # 2000 is larger than the max category id in TAO-OW.
+        all_ids = set(range(1, 2000))
+
+        # `knowns` includes 78 TAO_category_ids that corresponds to 78 COCO classes.
+        # (The other 2 COCO classes do not have corresponding classes in TAO).
+        self.knowns = {
+            4, 13, 1038, 544, 1057, 34, 35, 36, 41, 45,
+            58, 60, 579, 1091, 1097, 1099, 78, 79, 81, 91,
+            1115, 1117, 95, 1122, 99, 1132, 621, 1135, 625, 118,
+            1144, 126, 642, 1155, 133, 1162, 139, 154, 174, 185,
+            699, 1215, 714, 717, 1229, 211, 729, 221, 229, 747,
+            235, 237, 779, 276, 805, 299, 829, 852, 347, 371,
+            382, 896, 392, 926, 937, 428, 429, 961, 452, 979,
+            980, 982, 475, 480, 993, 1001, 502, 1018,
+        }  # fmt: skip
+
+        # `distractors` is defined as in the paper "Opening up Open-World Tracking"
+        self.distractors = {
+            20, 63, 108, 180, 188, 204, 212, 247, 303, 403,
+            407, 415, 490, 504, 507, 513, 529, 567, 569, 588,
+            672, 691, 702, 708, 711, 720, 736, 737, 798, 813,
+            815, 827, 831, 851, 877, 883, 912, 971, 976, 1130,
+            1133, 1134, 1169, 1184, 1220,
+        }  # fmt: skip
+
+        self.unknowns = all_ids - (self.knowns | self.distractors)
+
+    def _filter_gt_data(self, raw_gt_data):
+        """
+        Restrict the ground truth to the category subset selected by self.subset.
+
+        Annotations, tracks and categories are kept only when their category id belongs to the
+        selected set (self.knowns, self.distractors or self.unknowns). Videos, images, info and
+        licenses are passed through.
+
+        Args:
+            raw_gt_data: directly loaded from json.
+
+        Returns:
+            filtered gt_data
+
+        Raises:
+            Exception: if self.subset is not one of "known", "distractor" or "unknown".
+        """
+        subset = self.subset
+        if subset == "known":
+            wanted_cats = self.knowns
+        elif subset == "distractor":
+            wanted_cats = self.distractors
+        elif subset == "unknown":
+            wanted_cats = self.unknowns
+        else:
+            raise Exception("The parameter `SUBSET` is incorrect")
+
+        # Video-level filtering is currently disabled: no video is ever excluded, so this set
+        # stays empty and the membership checks below always pass.
+        skipped_videos = set()
+
+        def _keep(record):
+            # A record survives when its video is not excluded and its category is selected.
+            return (
+                record["video_id"] not in skipped_videos
+                and record["category_id"] in wanted_cats
+            )
+
+        return {
+            "videos": raw_gt_data["videos"],
+            "annotations": [ann for ann in raw_gt_data["annotations"] if _keep(ann)],
+            "tracks": [track for track in raw_gt_data["tracks"] if _keep(track)],
+            "images": [
+                image
+                for image in raw_gt_data["images"]
+                if image["video_id"] not in skipped_videos
+            ],
+            "categories": [
+                cat for cat in raw_gt_data["categories"] if cat["id"] in wanted_cats
+            ],
+            "info": raw_gt_data["info"],
+            "licenses": raw_gt_data["licenses"],
+        }
--- a/sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py
+++ b/sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py
@@ -0,0 +1,524 @@
+# flake8: noqa
+
+# note: this file has been modified from its original version in TrackEval in
+# https://github.com/JonathonLuiten/TrackEval/blob/master/trackeval/datasets/youtube_vis.py
+# to support the following:
+# 1) bbox evaluation (via `IOU_TYPE`)
+# 2) passing GT and prediction data as Python objects (via `GT_JSON_OBJECT` and `TRACKER_JSON_OBJECT`)
+# 3) specifying a custom dataset name (via `DATASET_NAME`)
+
+import json
+import os
+
+import numpy as np
+
+from .. import _timing, utils
+from ..utils import TrackEvalException
+from ._base_dataset import _BaseDataset
+
+
+class YouTubeVIS(_BaseDataset):
+    """Dataset class for YouTubeVIS tracking"""
+
+    @staticmethod
+    def get_default_dataset_config():
+        """Return a fresh dict holding the default value of every YouTubeVIS config option."""
+        base_dir = utils.get_code_path()
+        return {
+            # Location of GT data
+            "GT_FOLDER": os.path.join(base_dir, "data/gt/youtube_vis/"),
+            # Trackers location
+            "TRACKERS_FOLDER": os.path.join(base_dir, "data/trackers/youtube_vis/"),
+            # Where to save eval results (if None, same as TRACKERS_FOLDER)
+            "OUTPUT_FOLDER": None,
+            # Filenames of trackers to eval (if None, all in folder)
+            "TRACKERS_TO_EVAL": None,
+            # Classes to eval (if None, all classes)
+            "CLASSES_TO_EVAL": None,
+            # Valid: 'train', 'val', 'train_sub_split'
+            "SPLIT_TO_EVAL": "train_sub_split",
+            # Whether to print current config
+            "PRINT_CONFIG": True,
+            # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
+            "OUTPUT_SUB_FOLDER": "",
+            # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
+            "TRACKER_SUB_FOLDER": "data",
+            # Names of trackers to display, if None: TRACKERS_TO_EVAL
+            "TRACKER_DISPLAY_NAMES": None,
+            # Added for video phrase AP evaluation -- allow directly specifying the GT JSON data
+            # and Tracker (result) JSON data as Python objects, without reading from files.
+            "GT_JSON_OBJECT": None,
+            "TRACKER_JSON_OBJECT": None,
+            # Annotation type to evaluate: "segm" or "bbox"
+            "IOU_TYPE": "segm",
+            # Custom dataset name
+            "DATASET_NAME": "video",
+        }
+
+    def __init__(self, config=None):
+        """Initialise dataset, checking that all required files are present
+
+        Ground truth comes either from config["GT_JSON_OBJECT"] (an already-parsed dict) or from the
+        single json file in the GT folder; tracker results come either from
+        config["TRACKER_JSON_OBJECT"] (a list) or from the single json file of each tracker folder.
+
+        :param config: dict of config overrides; missing values are taken from
+                       get_default_dataset_config()
+        :raises TrackEvalException: if the GT folder is missing, a folder does not hold exactly one
+                       json file, an invalid class is requested, or the tracker display names do
+                       not match the tracker list
+        """
+        super().__init__()
+        # Fill non-given config values with defaults
+        self.config = utils.init_config(config, self.get_default_dataset_config())
+        # Plain string concatenation: GT_FOLDER / TRACKERS_FOLDER are expected to end with a path
+        # separator (the defaults do).
+        self.gt_fol = (
+            self.config["GT_FOLDER"] + "youtube_vis_" + self.config["SPLIT_TO_EVAL"]
+        )
+        self.tracker_fol = (
+            self.config["TRACKERS_FOLDER"]
+            + "youtube_vis_"
+            + self.config["SPLIT_TO_EVAL"]
+        )
+        self.use_super_categories = False
+        self.should_classes_combine = True
+        # NOTE(review): validation via assert is skipped when Python runs with -O.
+        assert self.config["IOU_TYPE"] in ["segm", "bbox"]
+        self.iou_type = self.config["IOU_TYPE"]
+        print("=" * 100)
+        print(f"Evaluate annotation type *{self.iou_type}*")
+        self.dataset_name = self.config["DATASET_NAME"]
+
+        # Results are written next to the tracker data unless an output folder is given.
+        self.output_fol = self.config["OUTPUT_FOLDER"]
+        if self.output_fol is None:
+            self.output_fol = self.tracker_fol
+        self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
+        self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
+
+        if self.config["GT_JSON_OBJECT"] is not None:
+            # allow directly specifying the GT JSON data without reading from files
+            gt_json = self.config["GT_JSON_OBJECT"]
+            assert isinstance(gt_json, dict)
+            assert "videos" in gt_json
+            assert "categories" in gt_json
+            assert "annotations" in gt_json
+            self.gt_data = gt_json
+        else:
+            if not os.path.exists(self.gt_fol):
+                print("GT folder not found: " + self.gt_fol)
+                raise TrackEvalException(
+                    "GT folder not found: " + os.path.basename(self.gt_fol)
+                )
+            # The GT folder must hold exactly one json file, which is the annotation file.
+            gt_dir_files = [
+                file for file in os.listdir(self.gt_fol) if file.endswith(".json")
+            ]
+            if len(gt_dir_files) != 1:
+                raise TrackEvalException(
+                    self.gt_fol + " does not contain exactly one json file."
+                )
+
+            with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
+                self.gt_data = json.load(f)
+
+        # Get classes to eval
+        self.valid_classes = [cls["name"] for cls in self.gt_data["categories"]]
+        cls_name_to_cls_id_map = {
+            cls["name"]: cls["id"] for cls in self.gt_data["categories"]
+        }
+
+        if self.config["CLASSES_TO_EVAL"]:
+            # Requested names are lower-cased before being checked against the GT category names;
+            # names that are not found become None and trigger the exception below.
+            self.class_list = [
+                cls.lower() if cls.lower() in self.valid_classes else None
+                for cls in self.config["CLASSES_TO_EVAL"]
+            ]
+            if not all(self.class_list):
+                raise TrackEvalException(
+                    "Attempted to evaluate an invalid class. Only classes "
+                    + ", ".join(self.valid_classes)
+                    + " are valid."
+                )
+        else:
+            self.class_list = [cls["name"] for cls in self.gt_data["categories"]]
+        # Restrict the name -> id mapping to the classes that are actually evaluated.
+        self.class_name_to_class_id = {
+            k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list
+        }
+
+        # Get sequences to eval and check gt files exist
+        # A sequence is named after the first path component of the video's first file name.
+        self.seq_list = [
+            vid["file_names"][0].split("/")[0] for vid in self.gt_data["videos"]
+        ]
+        self.seq_name_to_seq_id = {
+            vid["file_names"][0].split("/")[0]: vid["id"]
+            for vid in self.gt_data["videos"]
+        }
+        # Sequence length is the number of file names listed for the video.
+        self.seq_lengths = {
+            vid["id"]: len(vid["file_names"]) for vid in self.gt_data["videos"]
+        }
+
+        # encode masks and compute track areas
+        self._prepare_gt_annotations()
+
+        # Get trackers to eval
+        if self.config["TRACKER_JSON_OBJECT"] is not None:
+            # allow directly specifying the tracker JSON data without reading from files
+            # The in-memory result set is registered under the fixed tracker name "tracker".
+            tracker_json = self.config["TRACKER_JSON_OBJECT"]
+            assert isinstance(tracker_json, list)
+            self.tracker_list = ["tracker"]
+        elif self.config["TRACKERS_TO_EVAL"] is None:
+            self.tracker_list = os.listdir(self.tracker_fol)
+        else:
+            self.tracker_list = self.config["TRACKERS_TO_EVAL"]
+
+        # NOTE(review): custom display names are only accepted when TRACKERS_TO_EVAL is given
+        # explicitly; with TRACKER_JSON_OBJECT alone (or folder auto-discovery) a non-None
+        # TRACKER_DISPLAY_NAMES falls through to the exception -- confirm this is intended.
+        if self.config["TRACKER_DISPLAY_NAMES"] is None:
+            self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
+        elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
+            len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
+        ):
+            self.tracker_to_disp = dict(
+                zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
+            )
+        else:
+            raise TrackEvalException(
+                "List of tracker files and tracker display names do not match."
+            )
+
+        # counter for globally unique track IDs
+        self.global_tid_counter = 0
+
+        # tracker name -> loaded tracker results
+        self.tracker_data = dict()
+        if self.config["TRACKER_JSON_OBJECT"] is not None:
+            # allow directly specifying the tracker JSON data without reading from files
+            tracker = self.tracker_list[0]
+            self.tracker_data[tracker] = tracker_json
+        else:
+            for tracker in self.tracker_list:
+                tracker_dir_path = os.path.join(
+                    self.tracker_fol, tracker, self.tracker_sub_fol
+                )
+                # Each tracker folder must hold exactly one json file with its results.
+                tr_dir_files = [
+                    file
+                    for file in os.listdir(tracker_dir_path)
+                    if file.endswith(".json")
+                ]
+                if len(tr_dir_files) != 1:
+                    raise TrackEvalException(
+                        tracker_dir_path + " does not contain exactly one json file."
+                    )
+
+                with open(os.path.join(tracker_dir_path, tr_dir_files[0])) as f:
+                    curr_data = json.load(f)
+
+                self.tracker_data[tracker] = curr_data
+
+    def get_display_name(self, tracker):
+        """Return the display name registered for ``tracker``.
+
+        The name is looked up in ``self.tracker_to_disp``; a tracker that is not
+        present in that mapping raises ``KeyError``.
+        """
+        display_names = self.tracker_to_disp
+        return display_names[tracker]
+
+    def _load_raw_file(self, tracker, seq, is_gt):
+        """Collect the tracks of one sequence (gt or tracker) in the YouTubeVIS format.
+
+        The returned dict always contains [num_timesteps] and [seq].
+
+        If is_gt, it additionally contains the fields:
+        [gt_ids, gt_classes]: list (for each timestep) of 1D int NDArrays (for each det).
+        [gt_dets]: list (for each timestep) of lists of detections.
+        [classes_to_gt_tracks]: dictionary with class ids as keys and, for each track of that class, a dictionary
+                                mapping frame index to the detection stored for that frame
+        [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_iscrowd]: dictionaries with class
+                                ids as keys and lists (one entry per track) as values
+
+        If not is_gt, it additionally contains the fields:
+        [tracker_ids, tracker_classes]: list (for each timestep) of 1D int NDArrays (for each det).
+        [tracker_confidences]: list (for each timestep) of 1D float NDArrays (for each det).
+        [tracker_dets]: list (for each timestep) of lists of detections.
+        [classes_to_dt_tracks]: same layout as classes_to_gt_tracks
+        [classes_to_dt_track_ids, classes_to_dt_track_areas]: dictionaries with class ids as keys and lists as values
+        [classes_to_dt_track_scores]: dictionary with class ids as keys and 1D numpy arrays as values
+
+        A detection counts as present at a timestep when the track's entry for that frame is truthy.
+        """
+        seq_id = self.seq_name_to_seq_id[seq]
+
+        # gt tracks are filtered from the loaded annotations; tracker tracks are
+        # prepared (areas, ids) by _get_tracker_seq_tracks
+        if is_gt:
+            seq_tracks = [
+                ann for ann in self.gt_data["annotations"] if ann["video_id"] == seq_id
+            ]
+        else:
+            seq_tracks = self._get_tracker_seq_tracks(tracker, seq_id)
+
+        num_timesteps = self.seq_lengths[seq_id]
+        # per-frame detections are stored under a different key for masks and boxes
+        result_key = "segmentations" if self.iou_type == "segm" else "bboxes"
+
+        ids_per_t = []
+        classes_per_t = []
+        dets_per_t = []
+        confidences_per_t = []
+        for t in range(num_timesteps):
+            # tracks that actually have a detection in frame t
+            visible = [trk for trk in seq_tracks if trk[result_key][t]]
+            dets_per_t.append([trk[result_key][t] for trk in visible])
+            ids_per_t.append(np.atleast_1d([trk["id"] for trk in visible]).astype(int))
+            classes_per_t.append(
+                np.atleast_1d([trk["category_id"] for trk in visible]).astype(int)
+            )
+            if not is_gt:
+                confidences_per_t.append(
+                    np.atleast_1d([trk["score"] for trk in visible]).astype(float)
+                )
+
+        if is_gt:
+            raw_data = {
+                "gt_ids": ids_per_t,
+                "gt_classes": classes_per_t,
+                "gt_dets": dets_per_t,
+            }
+        else:
+            raw_data = {
+                "tracker_confidences": confidences_per_t,
+                "tracker_ids": ids_per_t,
+                "tracker_classes": classes_per_t,
+                "tracker_dets": dets_per_t,
+            }
+
+        # group the sequence's tracks by every class id that is being evaluated
+        wanted_cls_ids = {self.class_name_to_class_id[name] for name in self.class_list}
+        tracks_by_cls = {
+            cls_id: [trk for trk in seq_tracks if trk["category_id"] == cls_id]
+            for cls_id in wanted_cls_ids
+        }
+
+        # mapping from classes to track representations and track information
+        frames_by_cls = {
+            cls_id: [dict(enumerate(trk[result_key])) for trk in group]
+            for cls_id, group in tracks_by_cls.items()
+        }
+        track_ids_by_cls = {
+            cls_id: [trk["id"] for trk in group]
+            for cls_id, group in tracks_by_cls.items()
+        }
+        track_areas_by_cls = {
+            cls_id: [trk["area"] for trk in group]
+            for cls_id, group in tracks_by_cls.items()
+        }
+
+        if is_gt:
+            raw_data["classes_to_gt_track_iscrowd"] = {
+                cls_id: [trk["iscrowd"] for trk in group]
+                for cls_id, group in tracks_by_cls.items()
+            }
+            raw_data["classes_to_gt_tracks"] = frames_by_cls
+            raw_data["classes_to_gt_track_ids"] = track_ids_by_cls
+            raw_data["classes_to_gt_track_areas"] = track_areas_by_cls
+        else:
+            raw_data["classes_to_dt_track_scores"] = {
+                cls_id: np.array([trk["score"] for trk in group])
+                for cls_id, group in tracks_by_cls.items()
+            }
+            raw_data["classes_to_dt_tracks"] = frames_by_cls
+            raw_data["classes_to_dt_track_ids"] = track_ids_by_cls
+            raw_data["classes_to_dt_track_areas"] = track_areas_by_cls
+
+        raw_data["num_timesteps"] = num_timesteps
+        raw_data["seq"] = seq
+        return raw_data
+
+    @_timing.time
+    def get_preprocessed_seq_data(self, raw_data, cls):
+        """Reduce the raw data of one sequence to a single class, ready for evaluation.
+        Inputs:
+             - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
+                It is expected to hold the gt and tracker fields of _load_raw_file() plus a per-timestep
+                "similarity_scores" entry.
+             - cls is the name of the class to be evaluated.
+        Outputs:
+             - data is a dict containing all of the information that metrics need to perform evaluation.
+                It contains the following fields:
+                    [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
+                    [gt_ids, tracker_ids]: list (for each timestep) of 1D NDArrays (for each det).
+                    [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
+                    [similarity_scores]: list (for each timestep) of 2D NDArrays (gt rows, tracker columns).
+                    [gt_tracks, gt_track_ids, gt_track_areas, gt_track_iscrowd]: per-track data of the class.
+                    [dt_tracks, dt_track_ids, dt_track_areas, dt_track_scores]: per-track tracker data of the
+                        class, ordered by descending score.
+                    [seq, iou_type]: sequence name and the constant "mask".
+        Notes:
+            General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
+                1) Extract only detections relevant for the class to be evaluated (including distractor detections).
+                2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
+                    distractor class, or otherwise marked as to be removed.
+                3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
+                    other criteria (e.g. are too small).
+                4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
+            After the above preprocessing steps, this function also calculates the number of gt and tracker detections
+                and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
+                unique within each timestep.
+        YouTubeVIS:
+            Only step 1 is applied here: detections of other classes are dropped, and no matched tracker dets,
+            unmatched tracker dets or gt dets are removed afterwards.
+            Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
+            and the tracks from the tracker data are sorted according to the tracker confidence.
+        """
+
+        def _make_contiguous(ids_per_timestep, seen_ids):
+            """Relabel ids to 0..K-1 in place and return the sorted unique ids (or the empty input)."""
+            if len(seen_ids) == 0:
+                return seen_ids
+            unique_ids = np.unique(seen_ids)
+            # lookup table indexed by old id; ids that never occur stay NaN
+            lookup = np.full(np.max(unique_ids) + 1, np.nan)
+            lookup[unique_ids] = np.arange(len(unique_ids))
+            for t, ids_t in enumerate(ids_per_timestep):
+                if len(ids_t) > 0:
+                    ids_per_timestep[t] = lookup[ids_t].astype(int)
+            return unique_ids
+
+        cls_id = self.class_name_to_class_id[cls]
+        num_timesteps = raw_data["num_timesteps"]
+
+        data = {
+            key: [None] * num_timesteps
+            for key in (
+                "gt_ids",
+                "tracker_ids",
+                "gt_dets",
+                "tracker_dets",
+                "similarity_scores",
+            )
+        }
+        seen_gt_ids = []
+        seen_tracker_ids = []
+        num_gt_dets = 0
+        num_tracker_dets = 0
+
+        for t in range(num_timesteps):
+            # Only extract relevant dets for this class for eval (cls)
+            gt_keep = np.atleast_1d(raw_data["gt_classes"][t] == cls_id).astype(bool)
+            gt_ids_t = raw_data["gt_ids"][t][gt_keep]
+            gt_dets_t = [
+                raw_data["gt_dets"][t][i] for i, keep in enumerate(gt_keep) if keep
+            ]
+
+            tracker_keep = np.atleast_1d(
+                raw_data["tracker_classes"][t] == cls_id
+            ).astype(bool)
+            tracker_ids_t = raw_data["tracker_ids"][t][tracker_keep]
+            tracker_dets_t = [
+                raw_data["tracker_dets"][t][i]
+                for i, keep in enumerate(tracker_keep)
+                if keep
+            ]
+
+            # keep the gt rows first, then the tracker columns of the similarity matrix
+            class_rows = raw_data["similarity_scores"][t][gt_keep, :]
+            data["similarity_scores"][t] = class_rows[:, tracker_keep]
+
+            data["tracker_ids"][t] = tracker_ids_t
+            data["tracker_dets"][t] = tracker_dets_t
+            data["gt_ids"][t] = gt_ids_t
+            data["gt_dets"][t] = gt_dets_t
+
+            seen_gt_ids.extend(np.unique(gt_ids_t))
+            seen_tracker_ids.extend(np.unique(tracker_ids_t))
+            num_tracker_dets += len(tracker_ids_t)
+            num_gt_dets += len(gt_ids_t)
+
+        # Re-label IDs such that there are no empty IDs
+        seen_gt_ids = _make_contiguous(data["gt_ids"], seen_gt_ids)
+        seen_tracker_ids = _make_contiguous(data["tracker_ids"], seen_tracker_ids)
+
+        # Ensure that ids are unique per timestep.
+        self._check_unique_ids(data)
+
+        data.update(
+            {
+                # Record overview statistics.
+                "num_tracker_dets": num_tracker_dets,
+                "num_gt_dets": num_gt_dets,
+                "num_tracker_ids": len(seen_tracker_ids),
+                "num_gt_ids": len(seen_gt_ids),
+                "num_timesteps": num_timesteps,
+                "seq": raw_data["seq"],
+                # get track representations
+                "gt_tracks": raw_data["classes_to_gt_tracks"][cls_id],
+                "gt_track_ids": raw_data["classes_to_gt_track_ids"][cls_id],
+                "gt_track_areas": raw_data["classes_to_gt_track_areas"][cls_id],
+                "gt_track_iscrowd": raw_data["classes_to_gt_track_iscrowd"][cls_id],
+                "dt_tracks": raw_data["classes_to_dt_tracks"][cls_id],
+                "dt_track_ids": raw_data["classes_to_dt_track_ids"][cls_id],
+                "dt_track_areas": raw_data["classes_to_dt_track_areas"][cls_id],
+                "dt_track_scores": raw_data["classes_to_dt_track_scores"][cls_id],
+                # NOTE(review): hard-coded to "mask" even when self.iou_type selects boxes;
+                # confirm that no downstream metric relies on this value for bbox eval.
+                "iou_type": "mask",
+            }
+        )
+
+        # sort tracker data tracks by tracker confidence scores
+        # (mergesort is stable, so equally scored tracks keep their relative order)
+        if data["dt_tracks"]:
+            order = np.argsort(
+                [-score for score in data["dt_track_scores"]], kind="mergesort"
+            )
+            for key in (
+                "dt_track_scores",
+                "dt_tracks",
+                "dt_track_ids",
+                "dt_track_areas",
+            ):
+                data[key] = [data[key][i] for i in order]
+
+        return data
+
+    def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
+        """Compute the similarity between gt and tracker detections of one timestep.
+
+        When ``self.iou_type`` is "segm" the detections are passed unchanged to
+        ``_calculate_mask_ious`` as already-encoded masks. For any other value they
+        are converted to float32 arrays with four columns and scored by
+        ``_calculate_box_ious`` using the "xywh" box format.
+
+        :param gt_dets_t: gt detections of the timestep
+        :param tracker_dets_t: tracker detections of the timestep
+        :return: whatever the selected IoU helper returns
+        """
+        if self.iou_type != "segm":
+            # reshape(-1, 4) keeps the arrays two-dimensional even without detections
+            gt_boxes = np.array(gt_dets_t, dtype=np.float32).reshape(-1, 4)
+            tracker_boxes = np.array(tracker_dets_t, dtype=np.float32).reshape(-1, 4)
+            return self._calculate_box_ious(
+                gt_boxes, tracker_boxes, box_format="xywh", do_ioa=False
+            )
+        return self._calculate_mask_ious(
+            gt_dets_t, tracker_dets_t, is_encoded=True, do_ioa=False
+        )
+
+    def _prepare_gt_annotations(self):
+        """
+        Prepares GT data by rle encoding segmentations and computing the average track area.
+
+        Every annotation in ``self.gt_data["annotations"]`` is modified in place and receives an
+        "area" entry: the mean of its usable per-frame areas, or 0 when there are none.
+
+        For mask evaluation (``self.iou_type == "segm"``) segmentations whose "counts" field is a
+        list are additionally converted with ``pycocotools.mask.frPyObjects``, and the truthy
+        entries of "areas" are averaged. For box evaluation the truthy entries of "areas" are
+        used when present; otherwise the per-frame areas are computed from "bboxes" as
+        ``bbox[2] * bbox[3]``.
+        :return: None
+        """
+        if self.iou_type == "segm":
+            # only loaded when needed to reduce minimum requirements
+            from pycocotools import mask as mask_utils
+
+            for track in self.gt_data["annotations"]:
+                h = track["height"]
+                w = track["width"]
+                for i, seg in enumerate(track["segmentations"]):
+                    if seg is not None and isinstance(seg["counts"], list):
+                        track["segmentations"][i] = mask_utils.frPyObjects(seg, h, w)
+                areas = [a for a in track["areas"] if a]
+                track["area"] = np.array(areas).mean() if areas else 0
+        else:
+            for track in self.gt_data["annotations"]:
+                # For bbox eval, compute areas from bboxes if not already available.
+                # "or []" also covers a key that is present but set to None.
+                areas = [a for a in (track.get("areas") or []) if a]
+                if not areas:
+                    # Skip every falsy placeholder (None or an empty list): _load_raw_file
+                    # uses the same truthiness test to decide that a frame has no
+                    # detection, and indexing an empty box would raise IndexError.
+                    areas = [
+                        bbox[2] * bbox[3]
+                        for bbox in (track.get("bboxes") or [])
+                        if bbox
+                    ]
+                track["area"] = np.array(areas).mean() if areas else 0
+
+    def _get_tracker_seq_tracks(self, tracker, seq_id):
+        """
+        Prepares tracker data for a given sequence. Extracts all annotations for given sequence ID, computes
+        average track area and assigns a track ID.
+
+        The selected track dicts are modified in place: a missing "areas" list is filled in from the
+        per-frame masks ("segm") or boxes (otherwise), "area" is set to the mean of the truthy per-frame
+        areas (0 if there are none) and "id" is taken from the running counter
+        ``self.global_tid_counter``, which is advanced by one per track.
+        :param tracker: the given tracker
+        :param seq_id: the sequence ID
+        :return: the extracted tracks
+        """
+        tracks = [
+            ann for ann in self.tracker_data[tracker] if ann["video_id"] == seq_id
+        ]
+        for track in tracks:
+            if "areas" not in track:
+                # The list must be created here: the key is absent in this branch, so
+                # appending to track["areas"] without creating it raises KeyError.
+                if self.iou_type == "segm":
+                    # only loaded when needed to reduce minimum requirements
+                    from pycocotools import mask as mask_utils
+
+                    track["areas"] = [
+                        mask_utils.area(seg) if seg else None
+                        for seg in track["segmentations"]
+                    ]
+                else:
+                    # entries 2 and 3 of a box are multiplied to get its area;
+                    # frames without a box keep a None placeholder
+                    track["areas"] = [
+                        bbox[2] * bbox[3] if bbox else None for bbox in track["bboxes"]
+                    ]
+            areas = [a for a in track["areas"] if a]
+            track["area"] = np.array(areas).mean() if areas else 0
+            track["id"] = self.global_tid_counter
+            self.global_tid_counter += 1
+        return tracks
+
+    def get_name(self):
+        """Return the dataset name stored on this instance (``self.dataset_name``)."""
+        name = self.dataset_name
+        return name