Initial commit
fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
# flake8: noqa
|
||||
|
||||
from .tao_ow import TAO_OW
|
||||
from .youtube_vis import YouTubeVIS
|
||||
379
sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py
Normal file
379
sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py
Normal file
@@ -0,0 +1,379 @@
|
||||
# flake8: noqa
|
||||
|
||||
import csv
|
||||
import io
|
||||
import os
|
||||
import traceback
|
||||
import zipfile
|
||||
from abc import ABC, abstractmethod
|
||||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing
|
||||
from ..utils import TrackEvalException
|
||||
|
||||
|
||||
class _BaseDataset(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self):
|
||||
self.tracker_list = None
|
||||
self.seq_list = None
|
||||
self.class_list = None
|
||||
self.output_fol = None
|
||||
self.output_sub_fol = None
|
||||
self.should_classes_combine = True
|
||||
self.use_super_categories = False
|
||||
|
||||
# Functions to implement:
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_default_dataset_config(): ...
|
||||
|
||||
@abstractmethod
|
||||
def _load_raw_file(self, tracker, seq, is_gt): ...
|
||||
|
||||
@_timing.time
@abstractmethod
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess the raw data of one sequence for the evaluation of one class.

    Abstract: every concrete dataset has to provide its own implementation.
    raw_data is the dict produced by get_raw_seq_data(); cls is the class to
    be evaluated.
    """
|
||||
|
||||
@abstractmethod
|
||||
def _calculate_similarities(self, gt_dets_t, tracker_dets_t): ...
|
||||
|
||||
# Helper functions for all datasets:
|
||||
|
||||
@classmethod
|
||||
def get_class_name(cls):
|
||||
return cls.__name__
|
||||
|
||||
def get_name(self):
    """Return the name of the dataset, which is the name of its class."""
    dataset_name = self.get_class_name()
    return dataset_name
|
||||
|
||||
def get_output_fol(self, tracker):
    """Return the output folder of a tracker: output_fol/tracker/output_sub_fol."""
    path_parts = (self.output_fol, tracker, self.output_sub_fol)
    return os.path.join(*path_parts)
|
||||
|
||||
def get_display_name(self, tracker):
    """Return the name under which a tracker should be displayed.

    The default implementation returns the tracker's name unchanged. Datasets
    whose trackers are named differently in files than they should be shown
    can override this method.
    """
    display_name = tracker
    return display_name
|
||||
|
||||
def get_eval_info(self):
    """Return the (tracker_list, seq_list, class_list) tuple needed by the Evaluator."""
    trackers = self.tracker_list
    sequences = self.seq_list
    classes = self.class_list
    return trackers, sequences, classes
|
||||
|
||||
@_timing.time
def get_raw_seq_data(self, tracker, seq):
    """Load the raw ground-truth and tracker data of one tracker on one sequence.

    The raw data holds everything needed for both preprocessing and
    evaluation, for all classes. get_preprocessed_seq_data() later extracts
    the information relevant for each single class.

    The returned dict contains the fields:
        [num_timesteps]: integer
        [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets, tracker_dets, gt_crowd_ignore_regions]:
            list (for each timestep) of lists of detections.
        [similarity_scores]: list (for each timestep) of 2D NDArrays.
        [gt_extras]: dict (for each extra) of lists (for each timestep) of
            1D NDArrays (for each det).

    gt_extras holds dataset specific information used for preprocessing, such
    as occlusion and truncation levels.

    Similarities are computed by the dataset rather than by the metric,
    because almost all metrics are independent of how the similarity is
    calculated, while datasets are not (e.g. segmentation masks vs 2D boxes
    vs 3D boxes). They are computed before preprocessing because both
    preprocessing and evaluation often need them, and they cover all gt and
    tracker classes at once so that metrics such as class confusion matrices
    remain possible.
    """
    # The ground truth is loaded first, then the tracker output.
    gt_part = self._load_raw_file(tracker, seq, is_gt=True)
    tracker_part = self._load_raw_file(tracker, seq, is_gt=False)

    # Merge both dicts; on a key clash the ground-truth entry wins.
    raw_data = dict(tracker_part)
    raw_data.update(gt_part)

    # One similarity result per timestep.
    raw_data["similarity_scores"] = [
        self._calculate_similarities(gt_dets_t, tracker_dets_t)
        for gt_dets_t, tracker_dets_t in zip(
            raw_data["gt_dets"], raw_data["tracker_dets"]
        )
    ]
    return raw_data
|
||||
|
||||
@staticmethod
|
||||
def _load_simple_text_file(
|
||||
file,
|
||||
time_col=0,
|
||||
id_col=None,
|
||||
remove_negative_ids=False,
|
||||
valid_filter=None,
|
||||
crowd_ignore_filter=None,
|
||||
convert_filter=None,
|
||||
is_zipped=False,
|
||||
zip_file=None,
|
||||
force_delimiters=None,
|
||||
):
|
||||
"""Function that loads data which is in a commonly used text file format.
|
||||
Assumes each det is given by one row of a text file.
|
||||
There is no limit to the number or meaning of each column,
|
||||
however one column needs to give the timestep of each det (time_col) which is default col 0.
|
||||
|
||||
The file dialect (deliminator, num cols, etc) is determined automatically.
|
||||
This function automatically separates dets by timestep,
|
||||
and is much faster than alternatives such as np.loadtext or pandas.
|
||||
|
||||
If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
|
||||
These are not excluded from ignore data.
|
||||
|
||||
valid_filter can be used to only include certain classes.
|
||||
It is a dict with ints as keys, and lists as values,
|
||||
such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
|
||||
If None, all classes are included.
|
||||
|
||||
crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
|
||||
|
||||
convert_filter can be used to convert value read to another format.
|
||||
This is used most commonly to convert classes given as string to a class id.
|
||||
This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
|
||||
|
||||
Optionally, input files could be a zip of multiple text files for storage efficiency.
|
||||
|
||||
Returns read_data and ignore_data.
|
||||
Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
|
||||
Note that all data is returned as strings, and must be converted to float/int later if needed.
|
||||
Note that timesteps will not be present in the returned dict keys if there are no dets for them
|
||||
"""
|
||||
|
||||
if remove_negative_ids and id_col is None:
|
||||
raise TrackEvalException(
|
||||
"remove_negative_ids is True, but id_col is not given."
|
||||
)
|
||||
if crowd_ignore_filter is None:
|
||||
crowd_ignore_filter = {}
|
||||
if convert_filter is None:
|
||||
convert_filter = {}
|
||||
try:
|
||||
if is_zipped: # Either open file directly or within a zip.
|
||||
if zip_file is None:
|
||||
raise TrackEvalException(
|
||||
"is_zipped set to True, but no zip_file is given."
|
||||
)
|
||||
archive = zipfile.ZipFile(os.path.join(zip_file), "r")
|
||||
fp = io.TextIOWrapper(archive.open(file, "r"))
|
||||
else:
|
||||
fp = open(file)
|
||||
read_data = {}
|
||||
crowd_ignore_data = {}
|
||||
fp.seek(0, os.SEEK_END)
|
||||
# check if file is empty
|
||||
if fp.tell():
|
||||
fp.seek(0)
|
||||
dialect = csv.Sniffer().sniff(
|
||||
fp.readline(), delimiters=force_delimiters
|
||||
) # Auto determine structure.
|
||||
dialect.skipinitialspace = (
|
||||
True # Deal with extra spaces between columns
|
||||
)
|
||||
fp.seek(0)
|
||||
reader = csv.reader(fp, dialect)
|
||||
for row in reader:
|
||||
try:
|
||||
# Deal with extra trailing spaces at the end of rows
|
||||
if row[-1] in "":
|
||||
row = row[:-1]
|
||||
timestep = str(int(float(row[time_col])))
|
||||
# Read ignore regions separately.
|
||||
is_ignored = False
|
||||
for ignore_key, ignore_value in crowd_ignore_filter.items():
|
||||
if row[ignore_key].lower() in ignore_value:
|
||||
# Convert values in one column (e.g. string to id)
|
||||
for (
|
||||
convert_key,
|
||||
convert_value,
|
||||
) in convert_filter.items():
|
||||
row[convert_key] = convert_value[
|
||||
row[convert_key].lower()
|
||||
]
|
||||
# Save data separated by timestep.
|
||||
if timestep in crowd_ignore_data.keys():
|
||||
crowd_ignore_data[timestep].append(row)
|
||||
else:
|
||||
crowd_ignore_data[timestep] = [row]
|
||||
is_ignored = True
|
||||
if (
|
||||
is_ignored
|
||||
): # if det is an ignore region, it cannot be a normal det.
|
||||
continue
|
||||
# Exclude some dets if not valid.
|
||||
if valid_filter is not None:
|
||||
for key, value in valid_filter.items():
|
||||
if row[key].lower() not in value:
|
||||
continue
|
||||
if remove_negative_ids:
|
||||
if int(float(row[id_col])) < 0:
|
||||
continue
|
||||
# Convert values in one column (e.g. string to id)
|
||||
for convert_key, convert_value in convert_filter.items():
|
||||
row[convert_key] = convert_value[row[convert_key].lower()]
|
||||
# Save data separated by timestep.
|
||||
if timestep in read_data.keys():
|
||||
read_data[timestep].append(row)
|
||||
else:
|
||||
read_data[timestep] = [row]
|
||||
except Exception:
|
||||
exc_str_init = (
|
||||
"In file %s the following line cannot be read correctly: \n"
|
||||
% os.path.basename(file)
|
||||
)
|
||||
exc_str = " ".join([exc_str_init] + row)
|
||||
raise TrackEvalException(exc_str)
|
||||
fp.close()
|
||||
except Exception:
|
||||
print("Error loading file: %s, printing traceback." % file)
|
||||
traceback.print_exc()
|
||||
raise TrackEvalException(
|
||||
"File %s cannot be read because it is either not present or invalidly formatted"
|
||||
% os.path.basename(file)
|
||||
)
|
||||
return read_data, crowd_ignore_data
|
||||
|
||||
@staticmethod
def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
    """Calculate the IoU (intersection over union) between two sets of segmentation masks.

    If do_ioa is set, the intersection over the area of masks1 is calculated
    instead, which is commonly used to determine whether detections lie
    within a crowd ignore region.

    :param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                   else pycocotools rle encoded format)
    :param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                   else pycocotools rle encoded format)
    :param is_encoded: whether the input is already in pycocotools rle encoded format;
                       otherwise the encoding is performed here
    :param do_ioa: whether to perform IoA computation
    :return: the IoU/IoA scores
    """
    # Only loaded when run to reduce minimum requirements
    from pycocotools import mask as mask_utils

    if not is_encoded:

        def _to_rle(masks):
            # pycocotools encodes Fortran-ordered (height, width, num_masks) arrays.
            return mask_utils.encode(
                np.array(np.transpose(masks, (1, 2, 0)), order="F")
            )

        masks1 = _to_rle(masks1)
        masks2 = _to_rle(masks2)

    # One flag per mask in masks2, selecting IoA instead of IoU.
    per_mask2_flags = [do_ioa] * len(masks2)
    ious = mask_utils.iou(masks1, masks2, per_mask2_flags)

    num_masks1, num_masks2 = len(masks1), len(masks2)
    if num_masks1 == 0 or num_masks2 == 0:
        # Give the empty result a well-defined 2D shape.
        ious = np.asarray(ious).reshape(num_masks1, num_masks2)

    tolerance = np.finfo("float").eps
    assert (ious >= 0 - tolerance).all()
    assert (ious <= 1 + tolerance).all()

    return ious
|
||||
|
||||
@staticmethod
|
||||
def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
|
||||
"""Calculates the IOU (intersection over union) between two arrays of boxes.
|
||||
Allows variable box formats ('xywh' and 'x0y0x1y1').
|
||||
If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
|
||||
used to determine if detections are within crowd ignore region.
|
||||
"""
|
||||
if box_format in "xywh":
|
||||
# layout: (x0, y0, w, h)
|
||||
bboxes1 = deepcopy(bboxes1)
|
||||
bboxes2 = deepcopy(bboxes2)
|
||||
|
||||
bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
|
||||
bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
|
||||
bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
|
||||
bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
|
||||
elif box_format not in "x0y0x1y1":
|
||||
raise (TrackEvalException("box_format %s is not implemented" % box_format))
|
||||
|
||||
# layout: (x0, y0, x1, y1)
|
||||
min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
|
||||
max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
|
||||
intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
|
||||
min_[..., 3] - max_[..., 1], 0
|
||||
)
|
||||
area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
|
||||
bboxes1[..., 3] - bboxes1[..., 1]
|
||||
)
|
||||
|
||||
if do_ioa:
|
||||
ioas = np.zeros_like(intersection)
|
||||
valid_mask = area1 > 0 + np.finfo("float").eps
|
||||
ioas[valid_mask, :] = (
|
||||
intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
|
||||
)
|
||||
|
||||
return ioas
|
||||
else:
|
||||
area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
|
||||
bboxes2[..., 3] - bboxes2[..., 1]
|
||||
)
|
||||
union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
|
||||
intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
|
||||
intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
|
||||
intersection[union <= 0 + np.finfo("float").eps] = 0
|
||||
union[union <= 0 + np.finfo("float").eps] = 1
|
||||
ious = intersection / union
|
||||
return ious
|
||||
|
||||
@staticmethod
|
||||
def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
|
||||
"""Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
|
||||
measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
|
||||
The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
|
||||
threshold corresponds to a 1m distance threshold for TPs.
|
||||
"""
|
||||
dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
|
||||
sim = np.maximum(0, 1 - dist / zero_distance)
|
||||
return sim
|
||||
|
||||
@staticmethod
|
||||
def _check_unique_ids(data, after_preproc=False):
|
||||
"""Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
|
||||
gt_ids = data["gt_ids"]
|
||||
tracker_ids = data["tracker_ids"]
|
||||
for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
|
||||
if len(tracker_ids_t) > 0:
|
||||
unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
|
||||
if np.max(counts) != 1:
|
||||
duplicate_ids = unique_ids[counts > 1]
|
||||
exc_str_init = (
|
||||
"Tracker predicts the same ID more than once in a single timestep "
|
||||
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
|
||||
)
|
||||
exc_str = (
|
||||
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
|
||||
)
|
||||
if after_preproc:
|
||||
exc_str_init += (
|
||||
"\n Note that this error occurred after preprocessing (but not before), "
|
||||
"so ids may not be as in file, and something seems wrong with preproc."
|
||||
)
|
||||
raise TrackEvalException(exc_str)
|
||||
if len(gt_ids_t) > 0:
|
||||
unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
|
||||
if np.max(counts) != 1:
|
||||
duplicate_ids = unique_ids[counts > 1]
|
||||
exc_str_init = (
|
||||
"Ground-truth has the same ID more than once in a single timestep "
|
||||
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
|
||||
)
|
||||
exc_str = (
|
||||
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
|
||||
)
|
||||
if after_preproc:
|
||||
exc_str_init += (
|
||||
"\n Note that this error occurred after preprocessing (but not before), "
|
||||
"so ids may not be as in file, and something seems wrong with preproc."
|
||||
)
|
||||
raise TrackEvalException(exc_str)
|
||||
891
sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py
Normal file
891
sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py
Normal file
@@ -0,0 +1,891 @@
|
||||
# flake8: noqa
|
||||
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from .. import _timing, utils
|
||||
from ..utils import TrackEvalException
|
||||
from ._base_dataset import _BaseDataset
|
||||
|
||||
|
||||
class TAO_OW(_BaseDataset):
|
||||
"""Dataset class for TAO tracking"""
|
||||
|
||||
@staticmethod
def get_default_dataset_config():
    """Return the config values that are used for every option the caller does not set."""
    code_path = utils.get_code_path()
    gt_folder = os.path.join(code_path, "data/gt/tao/tao_training")
    trackers_folder = os.path.join(code_path, "data/trackers/tao/tao_training")
    return {
        # Location of GT data
        "GT_FOLDER": gt_folder,
        # Trackers location
        "TRACKERS_FOLDER": trackers_folder,
        # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "OUTPUT_FOLDER": None,
        # Filenames of trackers to eval (if None, all in folder)
        "TRACKERS_TO_EVAL": None,
        # Classes to eval (if None, all classes)
        "CLASSES_TO_EVAL": None,
        # Valid: 'training', 'val'
        "SPLIT_TO_EVAL": "training",
        # Whether to print current config
        "PRINT_CONFIG": True,
        # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "TRACKER_SUB_FOLDER": "data",
        # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",
        # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "TRACKER_DISPLAY_NAMES": None,
        # Number of maximal allowed detections per image (0 for unlimited)
        "MAX_DETECTIONS": 300,
        # "all" keeps the whole ground truth; any other value makes __init__
        # split and filter it.
        "SUBSET": "all",
    }
|
||||
|
||||
def __init__(self, config=None):
    """Initialise the dataset and load the ground truth and all tracker results.

    :param config: dict of config values; options that are not given are
        filled in from get_default_dataset_config().
    :raises TrackEvalException: if the GT folder or a tracker folder does not
        contain exactly one json file, or if the tracker display names do not
        match the trackers to evaluate.
    """
    super().__init__()
    # Fill non-given config values with defaults
    cfg = utils.init_config(
        config, self.get_default_dataset_config(), self.get_name()
    )
    self.config = cfg
    self.gt_fol = cfg["GT_FOLDER"]
    self.tracker_fol = cfg["TRACKERS_FOLDER"]
    self.should_classes_combine = True
    self.use_super_categories = False

    self.tracker_sub_fol = cfg["TRACKER_SUB_FOLDER"]
    configured_output_fol = cfg["OUTPUT_FOLDER"]
    # Results are written next to the tracker data unless configured otherwise.
    self.output_fol = (
        self.tracker_fol if configured_output_fol is None else configured_output_fol
    )
    self.output_sub_fol = cfg["OUTPUT_SUB_FOLDER"]

    # Load the single ground-truth json file.
    gt_json_names = [
        name for name in os.listdir(self.gt_fol) if name.endswith(".json")
    ]
    if len(gt_json_names) != 1:
        raise TrackEvalException(
            self.gt_fol + " does not contain exactly one json file."
        )
    gt_path = os.path.join(self.gt_fol, gt_json_names[0])
    with open(gt_path) as gt_file:
        self.gt_data = json.load(gt_file)

    self.subset = cfg["SUBSET"]
    if self.subset != "all":
        # Split GT data into `known`, `unknown` or `distractor`
        self._split_known_unknown_distractor()
        self.gt_data = self._filter_gt_data(self.gt_data)

    # merge categories marked with a merged tag in TAO dataset
    self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])

    # Get sequences to eval and sequence information
    self.seq_list = []
    self.seq_name_to_seq_id = {}
    for vid in self.gt_data["videos"]:
        seq_name = vid["name"].replace("/", "-")
        self.seq_list.append(seq_name)
        self.seq_name_to_seq_id[seq_name] = vid["id"]

    # compute mappings from videos to annotation data
    gt_tracks, gt_images = self._compute_vid_mappings(self.gt_data["annotations"])
    self.videos_to_gt_tracks, self.videos_to_gt_images = gt_tracks, gt_images

    # compute sequence lengths (number of images per video)
    self.seq_lengths = dict.fromkeys(
        (vid["id"] for vid in self.gt_data["videos"]), 0
    )
    for img in self.gt_data["images"]:
        self.seq_lengths[img["video_id"]] += 1
    self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings()

    self.seq_to_classes = {}
    for vid in self.gt_data["videos"]:
        vid_id = vid["id"]
        pos_cat_ids = list(
            set(track["category_id"] for track in self.videos_to_gt_tracks[vid_id])
        )
        self.seq_to_classes[vid_id] = {
            "pos_cat_ids": pos_cat_ids,
            "neg_cat_ids": vid["neg_category_ids"],
            "not_exhaustively_labeled_cat_ids": vid["not_exhaustive_category_ids"],
        }

    # Get classes to eval
    seen_cats = set()
    for seq_name in self.seq_list:
        vid_id = self.seq_name_to_seq_id[seq_name]
        seen_cats.update(self.seq_to_classes[vid_id]["pos_cat_ids"])
    # only classes with ground truth are evaluated in TAO
    self.valid_classes = [
        cat["name"] for cat in self.gt_data["categories"] if cat["id"] in seen_cats
    ]

    # The evaluation is class-agnostic: whatever CLASSES_TO_EVAL requests,
    # the single class "object" (id 1) is evaluated.
    classes_requested = self.config["CLASSES_TO_EVAL"]
    self.class_list = ["object"]  # class-agnostic
    if classes_requested and not all(self.class_list):
        raise TrackEvalException(
            "Attempted to evaluate an invalid class. Only classes "
            + ", ".join(self.valid_classes)
            + " are valid (classes present in ground truth data)."
        )
    self.class_name_to_class_id = {"object": 1}  # class-agnostic

    # Get trackers to eval
    trackers_to_eval = self.config["TRACKERS_TO_EVAL"]
    if trackers_to_eval is None:
        self.tracker_list = os.listdir(self.tracker_fol)
    else:
        self.tracker_list = trackers_to_eval

    display_names = self.config["TRACKER_DISPLAY_NAMES"]
    if display_names is None:
        self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
    elif self.config["TRACKERS_TO_EVAL"] is not None and len(display_names) == len(
        self.tracker_list
    ):
        self.tracker_to_disp = dict(zip(self.tracker_list, display_names))
    else:
        raise TrackEvalException(
            "List of tracker files and tracker display names do not match."
        )

    self.tracker_data = {tracker: dict() for tracker in self.tracker_list}

    for tracker in self.tracker_list:
        tracker_dir = os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
        tracker_json_names = [
            name for name in os.listdir(tracker_dir) if name.endswith(".json")
        ]
        if len(tracker_json_names) != 1:
            raise TrackEvalException(
                tracker_dir + " does not contain exactly one json file."
            )
        with open(os.path.join(tracker_dir, tracker_json_names[0])) as tracker_file:
            curr_data = json.load(tracker_file)

        # limit detections if MAX_DETECTIONS > 0
        if self.config["MAX_DETECTIONS"]:
            curr_data = self._limit_dets_per_image(curr_data)

        # fill missing video ids
        self._fill_video_ids_inplace(curr_data)

        # make track ids unique over whole evaluation set
        self._make_track_ids_unique(curr_data)

        # merge categories marked with a merged tag in TAO dataset
        self._merge_categories(curr_data)

        # get tracker sequence information
        tracker_tracks, tracker_images = self._compute_vid_mappings(curr_data)
        self.tracker_data[tracker]["vids_to_tracks"] = tracker_tracks
        self.tracker_data[tracker]["vids_to_images"] = tracker_images
|
||||
|
||||
def get_display_name(self, tracker):
    """Return the display name registered for a tracker in tracker_to_disp."""
    display_name = self.tracker_to_disp[tracker]
    return display_name
|
||||
|
||||
def _load_raw_file(self, tracker, seq, is_gt):
|
||||
"""Load a file (gt or tracker) in the TAO format
|
||||
|
||||
If is_gt, this returns a dict which contains the fields:
|
||||
[gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
|
||||
[gt_dets]: list (for each timestep) of lists of detections.
|
||||
[classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
|
||||
keys and corresponding segmentations as values) for each track
|
||||
[classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values
|
||||
as keys and lists (for each track) as values
|
||||
|
||||
if not is_gt, this returns a dict which contains the fields:
|
||||
[tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
|
||||
[tracker_dets]: list (for each timestep) of lists of detections.
|
||||
[classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
|
||||
keys and corresponding segmentations as values) for each track
|
||||
[classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values
|
||||
as keys and lists as values
|
||||
[classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
|
||||
"""
|
||||
seq_id = self.seq_name_to_seq_id[seq]
|
||||
# File location
|
||||
if is_gt:
|
||||
imgs = self.videos_to_gt_images[seq_id]
|
||||
else:
|
||||
imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
|
||||
|
||||
# Convert data to required format
|
||||
num_timesteps = self.seq_lengths[seq_id]
|
||||
img_to_timestep = self.seq_to_images_to_timestep[seq_id]
|
||||
data_keys = ["ids", "classes", "dets"]
|
||||
if not is_gt:
|
||||
data_keys += ["tracker_confidences"]
|
||||
raw_data = {key: [None] * num_timesteps for key in data_keys}
|
||||
for img in imgs:
|
||||
# some tracker data contains images without any ground truth information, these are ignored
|
||||
try:
|
||||
t = img_to_timestep[img["id"]]
|
||||
except KeyError:
|
||||
continue
|
||||
annotations = img["annotations"]
|
||||
raw_data["dets"][t] = np.atleast_2d(
|
||||
[ann["bbox"] for ann in annotations]
|
||||
).astype(float)
|
||||
raw_data["ids"][t] = np.atleast_1d(
|
||||
[ann["track_id"] for ann in annotations]
|
||||
).astype(int)
|
||||
raw_data["classes"][t] = np.atleast_1d([1 for _ in annotations]).astype(
|
||||
int
|
||||
) # class-agnostic
|
||||
if not is_gt:
|
||||
raw_data["tracker_confidences"][t] = np.atleast_1d(
|
||||
[ann["score"] for ann in annotations]
|
||||
).astype(float)
|
||||
|
||||
for t, d in enumerate(raw_data["dets"]):
|
||||
if d is None:
|
||||
raw_data["dets"][t] = np.empty((0, 4)).astype(float)
|
||||
raw_data["ids"][t] = np.empty(0).astype(int)
|
||||
raw_data["classes"][t] = np.empty(0).astype(int)
|
||||
if not is_gt:
|
||||
raw_data["tracker_confidences"][t] = np.empty(0)
|
||||
|
||||
if is_gt:
|
||||
key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
|
||||
else:
|
||||
key_map = {
|
||||
"ids": "tracker_ids",
|
||||
"classes": "tracker_classes",
|
||||
"dets": "tracker_dets",
|
||||
}
|
||||
for k, v in key_map.items():
|
||||
raw_data[v] = raw_data.pop(k)
|
||||
|
||||
# all_classes = [self.class_name_to_class_id[cls] for cls in self.class_list]
|
||||
all_classes = [1] # class-agnostic
|
||||
|
||||
if is_gt:
|
||||
classes_to_consider = all_classes
|
||||
all_tracks = self.videos_to_gt_tracks[seq_id]
|
||||
else:
|
||||
# classes_to_consider = self.seq_to_classes[seq_id]['pos_cat_ids'] \
|
||||
# + self.seq_to_classes[seq_id]['neg_cat_ids']
|
||||
classes_to_consider = all_classes # class-agnostic
|
||||
all_tracks = self.tracker_data[tracker]["vids_to_tracks"][seq_id]
|
||||
|
||||
# classes_to_tracks = {cls: [track for track in all_tracks if track['category_id'] == cls]
|
||||
# if cls in classes_to_consider else [] for cls in all_classes}
|
||||
classes_to_tracks = {
|
||||
cls: [track for track in all_tracks] if cls in classes_to_consider else []
|
||||
for cls in all_classes
|
||||
} # class-agnostic
|
||||
|
||||
# mapping from classes to track information
|
||||
raw_data["classes_to_tracks"] = {
|
||||
cls: [
|
||||
{
|
||||
det["image_id"]: np.atleast_1d(det["bbox"])
|
||||
for det in track["annotations"]
|
||||
}
|
||||
for track in tracks
|
||||
]
|
||||
for cls, tracks in classes_to_tracks.items()
|
||||
}
|
||||
raw_data["classes_to_track_ids"] = {
|
||||
cls: [track["id"] for track in tracks]
|
||||
for cls, tracks in classes_to_tracks.items()
|
||||
}
|
||||
raw_data["classes_to_track_areas"] = {
|
||||
cls: [track["area"] for track in tracks]
|
||||
for cls, tracks in classes_to_tracks.items()
|
||||
}
|
||||
raw_data["classes_to_track_lengths"] = {
|
||||
cls: [len(track["annotations"]) for track in tracks]
|
||||
for cls, tracks in classes_to_tracks.items()
|
||||
}
|
||||
|
||||
if not is_gt:
|
||||
raw_data["classes_to_dt_track_scores"] = {
|
||||
cls: np.array(
|
||||
[
|
||||
np.mean([float(x["score"]) for x in track["annotations"]])
|
||||
for track in tracks
|
||||
]
|
||||
)
|
||||
for cls, tracks in classes_to_tracks.items()
|
||||
}
|
||||
|
||||
if is_gt:
|
||||
key_map = {
|
||||
"classes_to_tracks": "classes_to_gt_tracks",
|
||||
"classes_to_track_ids": "classes_to_gt_track_ids",
|
||||
"classes_to_track_lengths": "classes_to_gt_track_lengths",
|
||||
"classes_to_track_areas": "classes_to_gt_track_areas",
|
||||
}
|
||||
else:
|
||||
key_map = {
|
||||
"classes_to_tracks": "classes_to_dt_tracks",
|
||||
"classes_to_track_ids": "classes_to_dt_track_ids",
|
||||
"classes_to_track_lengths": "classes_to_dt_track_lengths",
|
||||
"classes_to_track_areas": "classes_to_dt_track_areas",
|
||||
}
|
||||
for k, v in key_map.items():
|
||||
raw_data[v] = raw_data.pop(k)
|
||||
|
||||
raw_data["num_timesteps"] = num_timesteps
|
||||
raw_data["neg_cat_ids"] = self.seq_to_classes[seq_id]["neg_cat_ids"]
|
||||
raw_data["not_exhaustively_labeled_cls"] = self.seq_to_classes[seq_id][
|
||||
"not_exhaustively_labeled_cat_ids"
|
||||
]
|
||||
raw_data["seq"] = seq
|
||||
return raw_data
|
||||
|
||||
@_timing.time
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess data for a single sequence for a single class ready for evaluation.
    Inputs:
         - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
         - cls is the class to be evaluated.
    Outputs:
         - data is a dict containing all of the information that metrics need to perform evaluation.
            It contains the following fields:
                [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
                [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                [similarity_scores]: list (for each timestep) of 2D NDArrays.
            In addition the per-class track fields [gt_tracks, gt_track_ids, gt_track_lengths, gt_track_areas,
            dt_tracks, dt_track_ids, dt_track_lengths, dt_track_areas, dt_track_scores] are copied from raw_data,
            together with [seq, not_exhaustively_labeled, iou_type].
    Notes:
        General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
            1) Extract only detections relevant for the class to be evaluated (including distractor detections).
            2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
                distractor class, or otherwise marked as to be removed.
            3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
                other criteria (e.g. are too small).
            4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
        After the above preprocessing steps, this function also calculates the number of gt and tracker detections
            and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
            unique within each timestep.
    TAO:
        In TAO, the 4 preproc steps are as follow:
            1) All classes present in the ground truth data are evaluated separately.
            2) No matched tracker detections are removed.
            3) Unmatched tracker detections are removed if there is not ground truth data and the class does not
                belong to the categories marked as negative for this sequence. Additionally, unmatched tracker
                detections for classes which are marked as not exhaustively labeled are removed.
            4) No gt detections are removed.
        Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
        and the tracks from the tracker data are sorted according to the tracker confidence.
    """
    cls_id = self.class_name_to_class_id[cls]
    is_not_exhaustively_labeled = cls_id in raw_data["not_exhaustively_labeled_cls"]
    is_neg_category = cls_id in raw_data["neg_cat_ids"]

    data_keys = [
        "gt_ids",
        "tracker_ids",
        "gt_dets",
        "tracker_dets",
        "tracker_confidences",
        "similarity_scores",
    ]
    # one slot per timestep for every per-timestep field
    data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
    unique_gt_ids = []
    unique_tracker_ids = []
    num_gt_dets = 0
    num_tracker_dets = 0
    for t in range(raw_data["num_timesteps"]):
        # Only extract relevant dets for this class for preproc and eval (cls)
        gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
        gt_class_mask = gt_class_mask.astype(bool)
        gt_ids = raw_data["gt_ids"][t][gt_class_mask]
        gt_dets = raw_data["gt_dets"][t][gt_class_mask]

        tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
        tracker_class_mask = tracker_class_mask.astype(bool)
        tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
        tracker_dets = raw_data["tracker_dets"][t][tracker_class_mask]
        tracker_confidences = raw_data["tracker_confidences"][t][tracker_class_mask]
        # keep only the rows (gt) and columns (tracker) that belong to this class
        similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
            :, tracker_class_mask
        ]

        # Match tracker and gt dets (with hungarian algorithm).
        # Every tracker det starts out unmatched; matched columns are removed below.
        unmatched_indices = np.arange(tracker_ids.shape[0])
        if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0:
            matching_scores = similarity_scores.copy()
            # similarities below 0.5 are zeroed so they cannot count as a match
            matching_scores[matching_scores < 0.5 - np.finfo("float").eps] = 0
            # scores are negated because the assignment minimises total cost
            # NOTE(review): presumably scipy.optimize.linear_sum_assignment - confirm the file-level import
            match_rows, match_cols = linear_sum_assignment(-matching_scores)
            # discard assigned pairs whose (thresholded) similarity is zero
            actually_matched_mask = (
                matching_scores[match_rows, match_cols] > 0 + np.finfo("float").eps
            )
            match_cols = match_cols[actually_matched_mask]
            unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0)

        if gt_ids.shape[0] == 0 and not is_neg_category:
            # no gt for this class in this timestep and the class is not a negative category:
            # unmatched tracker dets are dropped
            to_remove_tracker = unmatched_indices
        elif is_not_exhaustively_labeled:
            # class is not exhaustively labeled: unmatched tracker dets are dropped
            to_remove_tracker = unmatched_indices
        else:
            # keep every tracker det
            to_remove_tracker = np.array([], dtype=int)

        # remove all unwanted unmatched tracker detections
        data["tracker_ids"][t] = np.delete(tracker_ids, to_remove_tracker, axis=0)
        data["tracker_dets"][t] = np.delete(tracker_dets, to_remove_tracker, axis=0)
        data["tracker_confidences"][t] = np.delete(
            tracker_confidences, to_remove_tracker, axis=0
        )
        similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1)

        data["gt_ids"][t] = gt_ids
        data["gt_dets"][t] = gt_dets
        data["similarity_scores"][t] = similarity_scores

        unique_gt_ids += list(np.unique(data["gt_ids"][t]))
        unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
        num_tracker_dets += len(data["tracker_ids"][t])
        num_gt_dets += len(data["gt_ids"][t])

    # Re-label IDs such that there are no empty IDs
    if len(unique_gt_ids) > 0:
        unique_gt_ids = np.unique(unique_gt_ids)
        # lookup table indexed by original id; ids that never occur stay NaN
        gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
        gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["gt_ids"][t]) > 0:
                data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
    if len(unique_tracker_ids) > 0:
        unique_tracker_ids = np.unique(unique_tracker_ids)
        # same relabelling scheme as for the gt ids
        tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
        tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["tracker_ids"][t]) > 0:
                data["tracker_ids"][t] = tracker_id_map[
                    data["tracker_ids"][t]
                ].astype(int)

    # Record overview statistics.
    data["num_tracker_dets"] = num_tracker_dets
    data["num_gt_dets"] = num_gt_dets
    data["num_tracker_ids"] = len(unique_tracker_ids)
    data["num_gt_ids"] = len(unique_gt_ids)
    data["num_timesteps"] = raw_data["num_timesteps"]
    data["seq"] = raw_data["seq"]

    # get track representations
    data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
    data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
    data["gt_track_lengths"] = raw_data["classes_to_gt_track_lengths"][cls_id]
    data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
    data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
    data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
    data["dt_track_lengths"] = raw_data["classes_to_dt_track_lengths"][cls_id]
    data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
    data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
    data["not_exhaustively_labeled"] = is_not_exhaustively_labeled
    data["iou_type"] = "bbox"

    # sort tracker data tracks by tracker confidence scores
    if data["dt_tracks"]:
        # scores are negated for a descending order; mergesort is stable, so
        # tracks with equal scores keep their relative order
        idx = np.argsort(
            [-score for score in data["dt_track_scores"]], kind="mergesort"
        )
        data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
        data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
        data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
        data["dt_track_lengths"] = [data["dt_track_lengths"][i] for i in idx]
        data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
    # Ensure that ids are unique per timestep.
    self._check_unique_ids(data)

    return data
|
||||
|
||||
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
    """Return the similarity of one timestep's gt and tracker detections.

    Delegates directly to ``self._calculate_box_ious``.
    """
    return self._calculate_box_ious(gt_dets_t, tracker_dets_t)
|
||||
|
||||
def _merge_categories(self, annotations):
    """
    Re-labels annotations whose category was merged into another one. Adapted from https://github.com/TAO-Dataset
    :param annotations: the annotations in which the classes should be merged (updated in place)
    :return: None
    """
    # id of every absorbed category -> id of the category that lists it under "merged"
    merged_into = {
        absorbed["id"]: category["id"]
        for category in self.gt_data["categories"]
        if "merged" in category
        for absorbed in category["merged"]
    }

    for ann in annotations:
        current_id = ann["category_id"]
        if current_id in merged_into:
            ann["category_id"] = merged_into[current_id]
|
||||
|
||||
def _compute_vid_mappings(self, annotations):
    """
    Computes mappings from videos to corresponding tracks and images.

    Every annotation additionally gets an "area" entry (bbox[2] * bbox[3]) written in place.
    Each track is a dict with the keys "id", "category_id" (taken from the first annotation seen
    for the track), "video_id", "annotations" (sorted by the frame index of their image) and
    "area" (mean annotation area). Each image entry is a dict with the keys "id" and "annotations".
    Videos listed in the ground truth that have no annotations map to empty lists.
    :param annotations: the annotations for which the mapping should be generated
    :return: the video-to-track-mapping, the video-to-image-mapping
    """
    vids_to_tracks = {}
    vids_to_imgs = {}
    vid_ids = [vid["id"] for vid in self.gt_data["videos"]]

    # compute a mapping from image IDs to images
    images = {image["id"]: image for image in self.gt_data["images"]}

    # (video id, track id) -> track and (video id, image id) -> image entry, so that each
    # annotation is placed with one dict lookup instead of a scan over the existing entries
    track_lookup = {}
    img_lookup = {}

    for ann in annotations:
        ann["area"] = ann["bbox"][2] * ann["bbox"][3]

        vid = ann["video_id"]
        vid_tracks = vids_to_tracks.setdefault(vid, [])
        vid_imgs = vids_to_imgs.setdefault(vid, [])

        # Fill in vids_to_tracks
        tid = ann["track_id"]
        track = track_lookup.get((vid, tid))
        if track is None:
            track = {
                "id": tid,
                "category_id": ann["category_id"],
                "video_id": vid,
                "annotations": [],
            }
            track_lookup[vid, tid] = track
            vid_tracks.append(track)
        track["annotations"].append(ann)

        # Fill in vids_to_imgs
        img_id = ann["image_id"]
        img = img_lookup.get((vid, img_id))
        if img is None:
            img = {"id": img_id, "annotations": []}
            img_lookup[vid, img_id] = img
            vid_imgs.append(img)
        img["annotations"].append(ann)

    # sort annotations by frame index and compute track area
    for tracks in vids_to_tracks.values():
        for track in tracks:
            track["annotations"] = sorted(
                track["annotations"],
                key=lambda x: images[x["image_id"]]["frame_index"],
            )
            # Compute average area
            track["area"] = sum(x["area"] for x in track["annotations"]) / len(
                track["annotations"]
            )

    # Ensure all videos are present
    for vid_id in vid_ids:
        vids_to_tracks.setdefault(vid_id, [])
        vids_to_imgs.setdefault(vid_id, [])

    return vids_to_tracks, vids_to_imgs
|
||||
|
||||
def _compute_image_to_timestep_mappings(self):
    """
    Builds, for every video, a lookup from image ID to the position of that image in the sequence.
    Positions are assigned by ordering the video's ground truth images by their frame index.
    :return: the image-to-timestep-mapping
    """
    image_by_id = {image["id"]: image for image in self.gt_data["images"]}

    def frame_index_of(img_id):
        return image_by_id[img_id]["frame_index"]

    mapping = {video["id"]: None for video in self.gt_data["videos"]}
    for vid in mapping:
        ordered_ids = sorted(
            (img["id"] for img in self.videos_to_gt_images[vid]), key=frame_index_of
        )
        mapping[vid] = {img_id: step for step, img_id in enumerate(ordered_ids)}

    return mapping
|
||||
|
||||
def _limit_dets_per_image(self, annotations):
    """
    Keeps at most config['MAX_DETECTIONS'] detections per image, preferring the highest scores.
    Adapted from https://github.com/TAO-Dataset/
    :param annotations: the annotations in which the detections should be limited
    :return: the annotations with limited detections, grouped by image
    """
    limit = self.config["MAX_DETECTIONS"]

    per_image = defaultdict(list)
    for ann in annotations:
        per_image[ann["image_id"]].append(ann)

    kept = []
    for dets in per_image.values():
        if len(dets) > limit:
            # too many detections for this image: keep only the top-scoring ones
            dets = sorted(dets, key=lambda det: det["score"], reverse=True)[:limit]
        kept.extend(dets)
    return kept
|
||||
|
||||
def _fill_video_ids_inplace(self, annotations):
    """
    Adds the "video_id" entry to annotations that lack it, looked up via the annotation's image.
    Adapted from https://github.com/TAO-Dataset/
    :param annotations: the annotations for which the videos IDs should be filled inplace
    :return: None
    """
    lacking = [ann for ann in annotations if "video_id" not in ann]
    if not lacking:
        return

    # the image -> video lookup is only built when at least one annotation needs it
    video_of_image = {
        image["id"]: image["video_id"] for image in self.gt_data["images"]
    }
    for ann in lacking:
        ann["video_id"] = video_of_image[ann["image_id"]]
|
||||
|
||||
@staticmethod
def _make_track_ids_unique(annotations):
    """
    Makes the track IDs unique over the whole annotation set. Adapted from https://github.com/TAO-Dataset/

    A track ID that occurs in more than one video is replaced, in place, by a fresh ID per
    (track ID, video ID) pair; this includes the occurrences in the video where the ID was
    seen first. Fresh IDs start right above the largest track ID in the annotation set.
    :param annotations: the annotation set (updated in place)
    :return: the number of updated IDs
    """
    track_id_videos = {}
    track_ids_to_update = set()
    max_track_id = 0
    for ann in annotations:
        t = ann["track_id"]
        # remember the first video each track ID was seen in
        first_video = track_id_videos.setdefault(t, ann["video_id"])

        if ann["video_id"] != first_video:
            # Track id is assigned to multiple videos
            track_ids_to_update.add(t)
        max_track_id = max(max_track_id, t)

    if track_ids_to_update:
        next_id = itertools.count(max_track_id + 1)
        # hands out one new ID the first time a (track ID, video ID) pair is looked up
        new_track_ids = defaultdict(lambda: next(next_id))
        for ann in annotations:
            t = ann["track_id"]
            v = ann["video_id"]
            if t in track_ids_to_update:
                ann["track_id"] = new_track_ids[t, v]
    return len(track_ids_to_update)
|
||||
|
||||
def _split_known_unknown_distractor(self):
    """Set ``self.knowns``, ``self.distractors`` and ``self.unknowns`` as sets of category ids.

    ``unknowns`` is every id in 1..1999 that is neither a known nor a distractor.
    """
    # 2000 is larger than the max category id in TAO-OW.
    all_ids = set(range(1, 2000))

    # `knowns` includes 78 TAO_category_ids that corresponds to 78 COCO classes.
    # (The other 2 COCO classes do not have corresponding classes in TAO).
    self.knowns = {
        4, 13, 34, 35, 36, 41, 45, 58, 60, 78,
        79, 81, 91, 95, 99, 118, 126, 133, 139, 154,
        174, 185, 211, 221, 229, 235, 237, 276, 299, 347,
        371, 382, 392, 428, 429, 452, 475, 480, 502, 544,
        579, 621, 625, 642, 699, 714, 717, 729, 747, 779,
        805, 829, 852, 896, 926, 937, 961, 979, 980, 982,
        993, 1001, 1018, 1038, 1057, 1091, 1097, 1099, 1115, 1117,
        1122, 1132, 1135, 1144, 1155, 1162, 1215, 1229,
    }

    # `distractors` is defined as in the paper "Opening up Open-World Tracking"
    self.distractors = {
        20, 63, 108, 180, 188, 204, 212, 247, 303, 403,
        407, 415, 490, 504, 507, 513, 529, 567, 569, 588,
        672, 691, 702, 708, 711, 720, 736, 737, 798, 813,
        815, 827, 831, 851, 877, 883, 912, 971, 976, 1130,
        1133, 1134, 1169, 1184, 1220,
    }

    self.unknowns = all_ids - (self.knowns | self.distractors)
|
||||
|
||||
def _filter_gt_data(self, raw_gt_data):
    """
    Restrict the ground truth to the categories of the configured subset.

    Annotations, tracks and categories are kept only when their category id belongs to the
    set selected by ``self.subset``; videos, images, info and licenses are passed through.
    Args:
        raw_gt_data: directly loaded from json.

    Returns:
        filtered gt_data

    Raises:
        Exception: if ``self.subset`` is not one of "known", "distractor" or "unknown".
    """
    subset_attributes = (
        ("known", "knowns"),
        ("distractor", "distractors"),
        ("unknown", "unknowns"),
    )
    for subset_name, attribute in subset_attributes:
        if self.subset == subset_name:
            valid_cat_ids = getattr(self, attribute)
            break
    else:
        raise Exception("The parameter `SUBSET` is incorrect")

    # Video-level filtering is currently switched off, so no video is ever excluded.
    unwanted_vid = set()

    def is_wanted(entry):
        return (entry["video_id"] not in unwanted_vid) and (
            entry["category_id"] in valid_cat_ids
        )

    return {
        "videos": raw_gt_data["videos"],
        "annotations": [ann for ann in raw_gt_data["annotations"] if is_wanted(ann)],
        "tracks": [track for track in raw_gt_data["tracks"] if is_wanted(track)],
        "images": [
            image
            for image in raw_gt_data["images"]
            if image["video_id"] not in unwanted_vid
        ],
        "categories": [
            cat for cat in raw_gt_data["categories"] if cat["id"] in valid_cat_ids
        ],
        "info": raw_gt_data["info"],
        "licenses": raw_gt_data["licenses"],
    }
|
||||
524
sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py
Normal file
524
sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py
Normal file
@@ -0,0 +1,524 @@
|
||||
# flake8: noqa
|
||||
|
||||
# note: this file has been modified from its original version in TrackEval in
|
||||
# https://github.com/JonathonLuiten/TrackEval/blob/master/trackeval/datasets/youtube_vis.py
|
||||
# to support the following:
|
||||
# 1) bbox evaluation (via `IOU_TYPE`)
|
||||
# 2) passing GT and prediction data as Python objects (via `GT_JSON_OBJECT` and `TRACKER_JSON_OBJECT`)
|
||||
# 3) specifying a custom dataset name (via `DATASET_NAME`)
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing, utils
|
||||
from ..utils import TrackEvalException
|
||||
from ._base_dataset import _BaseDataset
|
||||
|
||||
|
||||
class YouTubeVIS(_BaseDataset):
|
||||
"""Dataset class for YouTubeVIS tracking"""
|
||||
|
||||
@staticmethod
def get_default_dataset_config():
    """Return a fresh dict holding the default value of every dataset config key."""
    code_path = utils.get_code_path()
    return {
        # Location of GT data
        "GT_FOLDER": os.path.join(code_path, "data/gt/youtube_vis/"),
        # Trackers location
        "TRACKERS_FOLDER": os.path.join(code_path, "data/trackers/youtube_vis/"),
        # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "OUTPUT_FOLDER": None,
        # Filenames of trackers to eval (if None, all in folder)
        "TRACKERS_TO_EVAL": None,
        # Classes to eval (if None, all classes)
        "CLASSES_TO_EVAL": None,
        # Valid: 'train', 'val', 'train_sub_split'
        "SPLIT_TO_EVAL": "train_sub_split",
        # Whether to print current config
        "PRINT_CONFIG": True,
        # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",
        # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "TRACKER_SUB_FOLDER": "data",
        # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "TRACKER_DISPLAY_NAMES": None,
        # Added for video phrase AP evaluation -- allow directly specifying the GT JSON data and
        # Tracker (result) JSON data as Python objects, without reading from files.
        "GT_JSON_OBJECT": None,
        "TRACKER_JSON_OBJECT": None,
        "IOU_TYPE": "segm",
        "DATASET_NAME": "video",
    }
|
||||
|
||||
def __init__(self, config=None):
    """Initialise dataset, checking that all required files are present

    Ground truth is taken from config["GT_JSON_OBJECT"] when given, otherwise from the single
    json file inside the GT folder. Tracker results are taken from config["TRACKER_JSON_OBJECT"]
    when given (registered under the tracker name "tracker"), otherwise from the single json
    file inside each tracker's sub folder.

    :param config: dict of config overrides; missing keys are filled from
        get_default_dataset_config()
    :raises TrackEvalException: if the GT folder is missing, a folder does not contain exactly
        one json file, an invalid class is requested, or the tracker display names do not
        match the tracker list
    """
    super().__init__()
    # Fill non-given config values with defaults
    self.config = utils.init_config(config, self.get_default_dataset_config())
    # NOTE: the folders are built by plain string concatenation, so GT_FOLDER and
    # TRACKERS_FOLDER need a trailing path separator for the split folder to be a sub folder.
    self.gt_fol = (
        self.config["GT_FOLDER"] + "youtube_vis_" + self.config["SPLIT_TO_EVAL"]
    )
    self.tracker_fol = (
        self.config["TRACKERS_FOLDER"]
        + "youtube_vis_"
        + self.config["SPLIT_TO_EVAL"]
    )
    self.use_super_categories = False
    self.should_classes_combine = True
    assert self.config["IOU_TYPE"] in ["segm", "bbox"]
    self.iou_type = self.config["IOU_TYPE"]
    print("=" * 100)
    print(f"Evaluate annotation type *{self.iou_type}*")
    self.dataset_name = self.config["DATASET_NAME"]

    # results go next to the tracker data unless an output folder is configured
    self.output_fol = self.config["OUTPUT_FOLDER"]
    if self.output_fol is None:
        self.output_fol = self.tracker_fol
    self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
    self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]

    if self.config["GT_JSON_OBJECT"] is not None:
        # allow directly specifying the GT JSON data without reading from files
        gt_json = self.config["GT_JSON_OBJECT"]
        assert isinstance(gt_json, dict)
        assert "videos" in gt_json
        assert "categories" in gt_json
        assert "annotations" in gt_json
        self.gt_data = gt_json
    else:
        if not os.path.exists(self.gt_fol):
            print("GT folder not found: " + self.gt_fol)
            raise TrackEvalException(
                "GT folder not found: " + os.path.basename(self.gt_fol)
            )
        gt_dir_files = [
            file for file in os.listdir(self.gt_fol) if file.endswith(".json")
        ]
        if len(gt_dir_files) != 1:
            raise TrackEvalException(
                self.gt_fol + " does not contain exactly one json file."
            )

        with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
            self.gt_data = json.load(f)

    # Get classes to eval
    self.valid_classes = [cls["name"] for cls in self.gt_data["categories"]]
    cls_name_to_cls_id_map = {
        cls["name"]: cls["id"] for cls in self.gt_data["categories"]
    }

    if self.config["CLASSES_TO_EVAL"]:
        # requested names are lower-cased; names that are not valid become None,
        # which makes the all() check below fail
        self.class_list = [
            cls.lower() if cls.lower() in self.valid_classes else None
            for cls in self.config["CLASSES_TO_EVAL"]
        ]
        if not all(self.class_list):
            raise TrackEvalException(
                "Attempted to evaluate an invalid class. Only classes "
                + ", ".join(self.valid_classes)
                + " are valid."
            )
    else:
        self.class_list = [cls["name"] for cls in self.gt_data["categories"]]
    self.class_name_to_class_id = {
        k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list
    }

    # Get sequences to eval and check gt files exist
    # a sequence is named after the first path component of the video's first file name
    self.seq_list = [
        vid["file_names"][0].split("/")[0] for vid in self.gt_data["videos"]
    ]
    self.seq_name_to_seq_id = {
        vid["file_names"][0].split("/")[0]: vid["id"]
        for vid in self.gt_data["videos"]
    }
    # number of timesteps of a sequence = number of file names of the video
    self.seq_lengths = {
        vid["id"]: len(vid["file_names"]) for vid in self.gt_data["videos"]
    }

    # encode masks and compute track areas
    self._prepare_gt_annotations()

    # Get trackers to eval
    if self.config["TRACKER_JSON_OBJECT"] is not None:
        # allow directly specifying the tracker JSON data without reading from files
        tracker_json = self.config["TRACKER_JSON_OBJECT"]
        assert isinstance(tracker_json, list)
        self.tracker_list = ["tracker"]
    elif self.config["TRACKERS_TO_EVAL"] is None:
        self.tracker_list = os.listdir(self.tracker_fol)
    else:
        self.tracker_list = self.config["TRACKERS_TO_EVAL"]

    # display names are only accepted together with an explicit TRACKERS_TO_EVAL list
    # of the same length; any other combination raises below
    if self.config["TRACKER_DISPLAY_NAMES"] is None:
        self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
    elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
        len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
    ):
        self.tracker_to_disp = dict(
            zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
        )
    else:
        raise TrackEvalException(
            "List of tracker files and tracker display names do not match."
        )

    # counter for globally unique track IDs
    self.global_tid_counter = 0

    # tracker name -> loaded tracker results
    self.tracker_data = dict()
    if self.config["TRACKER_JSON_OBJECT"] is not None:
        # allow directly specifying the tracker JSON data without reading from files
        tracker = self.tracker_list[0]
        self.tracker_data[tracker] = tracker_json
    else:
        for tracker in self.tracker_list:
            tracker_dir_path = os.path.join(
                self.tracker_fol, tracker, self.tracker_sub_fol
            )
            tr_dir_files = [
                file
                for file in os.listdir(tracker_dir_path)
                if file.endswith(".json")
            ]
            if len(tr_dir_files) != 1:
                raise TrackEvalException(
                    tracker_dir_path + " does not contain exactly one json file."
                )

            with open(os.path.join(tracker_dir_path, tr_dir_files[0])) as f:
                curr_data = json.load(f)

            self.tracker_data[tracker] = curr_data
|
||||
|
||||
def get_display_name(self, tracker):
    """Return the display name registered for `tracker` in ``self.tracker_to_disp``."""
    display_names = self.tracker_to_disp
    return display_names[tracker]
|
||||
|
||||
def _load_raw_file(self, tracker, seq, is_gt):
    """Load the data (gt or tracker) of one sequence in the YouTubeVIS format

    Whether segmentations or bounding boxes are read is decided by ``self.iou_type``.

    If is_gt, this returns a dict which contains the fields:
    [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets]: list (for each timestep) of lists of detections.
    [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
                            keys and corresponding detections as values) for each track
    [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_iscrowd]: dictionary with class values
                            as keys and lists (for each track) as values

    if not is_gt, this returns a dict which contains the fields:
    [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
    [tracker_dets]: list (for each timestep) of lists of detections.
    [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
                            keys and corresponding detections as values) for each track
    [classes_to_dt_track_ids, classes_to_dt_track_areas]: dictionary with class values as keys and lists as values
    [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values

    Both variants also contain [num_timesteps, seq].
    """
    # select sequence tracks
    seq_id = self.seq_name_to_seq_id[seq]
    if is_gt:
        tracks = [
            ann for ann in self.gt_data["annotations"] if ann["video_id"] == seq_id
        ]
    else:
        tracks = self._get_tracker_seq_tracks(tracker, seq_id)

    num_timesteps = self.seq_lengths[seq_id]
    result_key = "segmentations" if self.iou_type == "segm" else "bboxes"

    # Per-timestep fields: only tracks with a (truthy) detection at timestep t contribute.
    ids, classes, dets, confidences = [], [], [], []
    for t in range(num_timesteps):
        present = [track for track in tracks if track[result_key][t]]
        dets.append([track[result_key][t] for track in present])
        ids.append(np.atleast_1d([track["id"] for track in present]).astype(int))
        classes.append(
            np.atleast_1d([track["category_id"] for track in present]).astype(int)
        )
        if not is_gt:
            confidences.append(
                np.atleast_1d([track["score"] for track in present]).astype(float)
            )

    if is_gt:
        raw_data = {"gt_ids": ids, "gt_classes": classes, "gt_dets": dets}
    else:
        raw_data = {
            "tracker_confidences": confidences,
            "tracker_ids": ids,
            "tracker_classes": classes,
            "tracker_dets": dets,
        }

    all_cls_ids = {self.class_name_to_class_id[cls] for cls in self.class_list}
    classes_to_tracks = {
        cls: [track for track in tracks if track["category_id"] == cls]
        for cls in all_cls_ids
    }

    # mapping from classes to track representations and track information
    track_reprs = {
        cls: [dict(enumerate(track[result_key])) for track in cls_tracks]
        for cls, cls_tracks in classes_to_tracks.items()
    }
    track_ids = {
        cls: [track["id"] for track in cls_tracks]
        for cls, cls_tracks in classes_to_tracks.items()
    }
    track_areas = {
        cls: [track["area"] for track in cls_tracks]
        for cls, cls_tracks in classes_to_tracks.items()
    }

    if is_gt:
        raw_data["classes_to_gt_track_iscrowd"] = {
            cls: [track["iscrowd"] for track in cls_tracks]
            for cls, cls_tracks in classes_to_tracks.items()
        }
        raw_data["classes_to_gt_tracks"] = track_reprs
        raw_data["classes_to_gt_track_ids"] = track_ids
        raw_data["classes_to_gt_track_areas"] = track_areas
    else:
        raw_data["classes_to_dt_track_scores"] = {
            cls: np.array([track["score"] for track in cls_tracks])
            for cls, cls_tracks in classes_to_tracks.items()
        }
        raw_data["classes_to_dt_tracks"] = track_reprs
        raw_data["classes_to_dt_track_ids"] = track_ids
        raw_data["classes_to_dt_track_areas"] = track_areas

    raw_data["num_timesteps"] = num_timesteps
    raw_data["seq"] = seq
    return raw_data
|
||||
|
||||
@_timing.time
|
||||
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess data for a single sequence for a single class ready for evaluation.
    Inputs:
        - raw_data is a dict containing the data for the sequence (per-timestep lists plus the
            "classes_to_*" track dictionaries read below).
        - cls is the name of the class to be evaluated; it is mapped to a class id through
            self.class_name_to_class_id.
    Outputs:
        - data is a dict containing the information that metrics need to perform evaluation.
            It contains the following fields:
                [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                [gt_ids, tracker_ids]: list (for each timestep) of 1D NDArrays (for each det).
                [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                [similarity_scores]: list (for each timestep) of 2D NDArrays.
                [gt_tracks, gt_track_ids, gt_track_areas, gt_track_iscrowd]: per-track gt information.
                [dt_tracks, dt_track_ids, dt_track_areas, dt_track_scores]: per-track tracker
                    information, ordered by descending score.
                [seq, iou_type]: the sequence identifier and the literal string "mask".
    Notes:
        For every timestep only the detections whose class equals the evaluated class are kept; no
        other gt or tracker detections are removed. Afterwards gt and tracker ids are relabelled so
        that they are contiguous (0..N-1) and self._check_unique_ids(data) is called to verify that
        ids are unique within each timestep.
    """
    class_id = self.class_name_to_class_id[cls]
    n_steps = raw_data["num_timesteps"]

    per_step_fields = (
        "gt_ids",
        "tracker_ids",
        "gt_dets",
        "tracker_dets",
        "similarity_scores",
    )
    data = {field: [None] * n_steps for field in per_step_fields}
    seen_gt_ids = []
    seen_tracker_ids = []
    gt_det_total = 0
    tracker_det_total = 0

    for step in range(n_steps):
        # Boolean selector for the gt detections of the evaluated class.
        keep_gt = np.atleast_1d(raw_data["gt_classes"][step] == class_id).astype(bool)
        step_gt_ids = raw_data["gt_ids"][step][keep_gt]
        step_gt_dets = [
            raw_data["gt_dets"][step][pos]
            for pos, keep in enumerate(keep_gt)
            if keep
        ]

        # Same selection for the tracker detections.
        keep_tracker = np.atleast_1d(
            raw_data["tracker_classes"][step] == class_id
        ).astype(bool)
        step_tracker_ids = raw_data["tracker_ids"][step][keep_tracker]
        step_tracker_dets = [
            raw_data["tracker_dets"][step][pos]
            for pos, keep in enumerate(keep_tracker)
            if keep
        ]

        # Rows are gt detections, columns are tracker detections.
        class_rows = raw_data["similarity_scores"][step][keep_gt, :]
        step_similarity = class_rows[:, keep_tracker]

        data["tracker_ids"][step] = step_tracker_ids
        data["tracker_dets"][step] = step_tracker_dets
        data["gt_ids"][step] = step_gt_ids
        data["gt_dets"][step] = step_gt_dets
        data["similarity_scores"][step] = step_similarity

        seen_gt_ids.extend(np.unique(step_gt_ids))
        seen_tracker_ids.extend(np.unique(step_tracker_ids))
        tracker_det_total += len(step_tracker_ids)
        gt_det_total += len(step_gt_ids)

    def _compact_ids(field, seen_ids):
        # Re-label the ids stored under data[field] such that there are no empty IDs.
        # Returns the sorted unique ids (or the untouched empty list when nothing was seen).
        if len(seen_ids) == 0:
            return seen_ids
        distinct = np.unique(seen_ids)
        # NaN marks ids that never occur; only seen ids get a contiguous index.
        lookup = np.full(np.max(distinct) + 1, np.nan)
        lookup[distinct] = np.arange(len(distinct))
        for step, ids in enumerate(data[field]):
            if len(ids) > 0:
                data[field][step] = lookup[ids].astype(int)
        return distinct

    seen_gt_ids = _compact_ids("gt_ids", seen_gt_ids)
    seen_tracker_ids = _compact_ids("tracker_ids", seen_tracker_ids)

    # Ensure that ids are unique per timestep.
    self._check_unique_ids(data)

    # Record overview statistics.
    data["num_tracker_dets"] = tracker_det_total
    data["num_gt_dets"] = gt_det_total
    data["num_tracker_ids"] = len(seen_tracker_ids)
    data["num_gt_ids"] = len(seen_gt_ids)
    data["num_timesteps"] = n_steps
    data["seq"] = raw_data["seq"]

    # Track representations for the evaluated class.
    for prefix, suffixes in (
        ("gt", ("tracks", "track_ids", "track_areas", "track_iscrowd")),
        ("dt", ("tracks", "track_ids", "track_areas", "track_scores")),
    ):
        for suffix in suffixes:
            data[prefix + "_" + suffix] = raw_data[
                "classes_to_" + prefix + "_" + suffix
            ][class_id]
    # NOTE(review): always "mask", independent of self.iou_type - confirm this is what
    # downstream consumers expect when evaluating boxes.
    data["iou_type"] = "mask"

    # Sort tracker tracks by descending confidence; mergesort keeps ties in input order.
    if data["dt_tracks"]:
        order = np.argsort(
            [-score for score in data["dt_track_scores"]], kind="mergesort"
        )
        for field in ("dt_track_scores", "dt_tracks", "dt_track_ids", "dt_track_areas"):
            data[field] = [data[field][pos] for pos in order]

    return data
|
||||
|
||||
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
    """
    Computes the similarity between the gt and tracker detections of one timestep.
    :param gt_dets_t: the gt detections (encoded masks for "segm", xywh boxes otherwise)
    :param tracker_dets_t: the tracker detections in the same representation
    :return: whatever the selected IoU helper returns for the two detection sets
    """
    if self.iou_type == "segm":
        # Masks are handed over unchanged and flagged as already encoded.
        return self._calculate_mask_ious(
            gt_dets_t, tracker_dets_t, is_encoded=True, do_ioa=False
        )

    # Any other iou_type is treated as boxes: force an (N, 4) float32 layout so that an
    # empty detection list still yields a well-formed (0, 4) array.
    gt_boxes = np.array(gt_dets_t, dtype=np.float32).reshape(-1, 4)
    tracker_boxes = np.array(tracker_dets_t, dtype=np.float32).reshape(-1, 4)
    return self._calculate_box_ious(
        gt_boxes, tracker_boxes, box_format="xywh", do_ioa=False
    )
|
||||
|
||||
def _prepare_gt_annotations(self):
    """
    Prepares the GT annotations in self.gt_data["annotations"] in place.
    For "segm" evaluation, segmentations whose "counts" entry is a list are converted with
    pycocotools' frPyObjects, and the track area is the mean of the truthy per-frame areas.
    For any other iou_type, the track area is the mean of the truthy per-frame areas, or, when
    there are none, the mean of width * height over all bboxes that are not None.
    A track without any usable area gets an area of 0.
    :return: None
    """
    if self.iou_type != "segm":
        for ann in self.gt_data["annotations"]:
            frame_areas = [area for area in ann.get("areas", []) if area]
            if not frame_areas:
                # For bbox eval, compute areas from bboxes if not already available
                frame_areas = [
                    box[2] * box[3]
                    for box in ann.get("bboxes", [])
                    if box is not None
                ]
            ann["area"] = np.array(frame_areas).mean() if frame_areas else 0
        return

    # only loaded when needed to reduce minimum requirements
    from pycocotools import mask as mask_utils

    for ann in self.gt_data["annotations"]:
        height = ann["height"]
        width = ann["width"]
        segmentations = ann["segmentations"]
        for frame, seg in enumerate(segmentations):
            if seg is None:
                continue
            if isinstance(seg["counts"], list):
                segmentations[frame] = mask_utils.frPyObjects(seg, height, width)
        frame_areas = [area for area in ann["areas"] if area]
        ann["area"] = np.array(frame_areas).mean() if len(frame_areas) != 0 else 0
|
||||
|
||||
def _get_tracker_seq_tracks(self, tracker, seq_id):
    """
    Prepares tracker data for a given sequence. Extracts all annotations for given sequence ID, computes
    average track area and assigns a track ID.
    The selected annotation dicts are modified in place: "areas" is created when missing, and "area"
    and "id" are always (re)assigned. self.global_tid_counter is incremented once per track.
    :param tracker: the given tracker (key into self.tracker_data)
    :param seq_id: the sequence ID (compared against each annotation's "video_id")
    :return: the extracted tracks
    """
    if self.iou_type == "segm":
        # only loaded when needed to reduce minimum requirements
        from pycocotools import mask as mask_utils

        det_key = "segmentations"
        det_area = mask_utils.area
    else:
        det_key = "bboxes"

        def det_area(bbox):
            # Boxes are xywh, so the area is width * height.
            return bbox[2] * bbox[3]

    tracks = [
        ann for ann in self.tracker_data[tracker] if ann["video_id"] == seq_id
    ]
    for track in tracks:
        if "areas" not in track:
            # The list has to be created before it is filled; appending to the missing key
            # raised KeyError for every track that did not ship precomputed areas.
            # Frames without a detection keep a None placeholder so indices stay aligned.
            track["areas"] = [
                det_area(det) if det else None for det in track[det_key]
            ]
        # Falsy entries (None placeholders and zero areas) do not take part in the average.
        areas = [a for a in track["areas"] if a]
        track["area"] = np.array(areas).mean() if areas else 0
        track["id"] = self.global_tid_counter
        self.global_tid_counter += 1
    return tracks
|
||||
|
||||
def get_name(self):
    """Return the name of this dataset, as stored in self.dataset_name."""
    return self.dataset_name
|
||||
Reference in New Issue
Block a user