Initial commit

fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
facebook-github-bot
2025-11-18 23:07:42 -08:00
commit a13e358df4
504 changed files with 122758 additions and 0 deletions

1
sam3/eval/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

703
sam3/eval/cgf1_eval.py Normal file
View File

@@ -0,0 +1,703 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import contextlib
import copy
import json
import os
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import pycocotools.mask as maskUtils
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from scipy.optimize import linear_sum_assignment
from tqdm import tqdm
@dataclass
class Metric:
    """Descriptor for one scalar metric reported by the cgF1 evaluator."""

    # Identifier of the metric, e.g. "cgF1" or "IL_precision".
    name: str
    # whether the metric is computed at the image level or the box level
    image_level: bool
    # iou threshold (None is used for image level metrics or to indicate averaging over all thresholds in [0.5:0.95])
    iou_threshold: Union[float, None]
# Box-level metric names; each is reported three times: averaged over all IoU
# thresholds (iou_threshold=None), at IoU=0.5, and at IoU=0.75.
_BOX_LEVEL_NAMES = (
    "cgF1",
    "precision",
    "recall",
    "F1",
    "positive_macro_F1",
    "positive_micro_F1",
    "positive_micro_precision",
)
# Image-level metric names; reported once (no IoU threshold applies).
_IMAGE_LEVEL_NAMES = ("IL_precision", "IL_recall", "IL_F1", "IL_FPR", "IL_MCC")

# Ordered list of all metrics the evaluator reports. The order here must match
# the order of the stats array produced by CGF1Eval.summarize().
CGF1_METRICS = (
    [Metric(name=n, image_level=False, iou_threshold=None) for n in _BOX_LEVEL_NAMES]
    + [Metric(name=n, image_level=True, iou_threshold=None) for n in _IMAGE_LEVEL_NAMES]
    + [Metric(name=n, image_level=False, iou_threshold=0.5) for n in _BOX_LEVEL_NAMES]
    + [Metric(name=n, image_level=False, iou_threshold=0.75) for n in _BOX_LEVEL_NAMES]
)
class COCOCustom(COCO):
    """COCO class from pycocotools with tiny modifications for speed"""

    def createIndex(self):
        # Build the lookup tables (anns, imgs, cats, imgToAnns, catToImgs)
        # from self.dataset. Identical to pycocotools except where marked.
        # create index
        print("creating index...")
        anns, cats, imgs = {}, {}, {}
        imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
        if "annotations" in self.dataset:
            for ann in self.dataset["annotations"]:
                imgToAnns[ann["image_id"]].append(ann)
                anns[ann["id"]] = ann
        if "images" in self.dataset:
            # MODIFICATION: do not reload imgs if they are already there
            # (loadRes below shares self.imgs with the result object, so this
            # avoids re-indexing the image list on every loadRes call)
            if self.imgs:
                imgs = self.imgs
            else:
                for img in self.dataset["images"]:
                    imgs[img["id"]] = img
            # END MODIFICATION
        if "categories" in self.dataset:
            for cat in self.dataset["categories"]:
                cats[cat["id"]] = cat
        if "annotations" in self.dataset and "categories" in self.dataset:
            for ann in self.dataset["annotations"]:
                catToImgs[ann["category_id"]].append(ann["image_id"])
        print("index created!")
        # create class members
        self.anns = anns
        self.imgToAnns = imgToAnns
        self.catToImgs = catToImgs
        self.imgs = imgs
        self.cats = cats

    def loadRes(self, resFile):
        """
        Load result file and return a result api object.
        :param resFile (str) : file name of result file
        :return: res (obj) : result api object
        """
        res = COCOCustom()
        res.dataset["info"] = copy.deepcopy(self.dataset.get("info", {}))
        # MODIFICATION: no copy
        # res.dataset['images'] = [img for img in self.dataset['images']]
        res.dataset["images"] = self.dataset["images"]
        # END MODIFICATION
        print("Loading and preparing results...")
        tic = time.time()
        if type(resFile) == str:
            with open(resFile) as f:
                anns = json.load(f)
        elif type(resFile) == np.ndarray:
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert type(anns) == list, "results in not an array of objects"
        annsImgIds = [ann["image_id"] for ann in anns]
        # MODIFICATION: faster and cached subset check
        # (getImgIds() builds a list each call; cache it as a set once)
        if not hasattr(self, "img_id_set"):
            self.img_id_set = set(self.getImgIds())
        assert set(annsImgIds).issubset(
            self.img_id_set
        ), "Results do not correspond to current coco set"
        # END MODIFICATION
        if "caption" in anns[0]:
            # Caption results: keep only images that actually have predictions.
            imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
                [ann["image_id"] for ann in anns]
            )
            res.dataset["images"] = [
                img for img in res.dataset["images"] if img["id"] in imgIds
            ]
            for id, ann in enumerate(anns):
                ann["id"] = id + 1
        elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
            # Box results: synthesize segmentation polygons and areas from boxes.
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                bb = ann["bbox"]
                x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
                if not "segmentation" in ann:
                    ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
                ann["area"] = bb[2] * bb[3]
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "segmentation" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                # now only support compressed RLE format as segmentation results
                ann["area"] = maskUtils.area(ann["segmentation"])
                if not "bbox" in ann:
                    ann["bbox"] = maskUtils.toBbox(ann["segmentation"])
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "keypoints" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                s = ann["keypoints"]
                x = s[0::3]
                y = s[1::3]
                x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
                ann["area"] = (x1 - x0) * (y1 - y0)
                ann["id"] = id + 1
                ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
        print("DONE (t={:0.2f}s)".format(time.time() - tic))
        res.dataset["annotations"] = anns
        # MODIFICATION: inherit images
        # (shares the already-built image index so createIndex skips rebuilding it)
        res.imgs = self.imgs
        # END MODIFICATION
        res.createIndex()
        return res
class CGF1Eval(COCOeval):
    """
    This evaluator is based upon COCO evaluation, but evaluates the model in a more realistic setting
    for downstream applications.
    See SAM3 paper for the details on the CGF1 metric.
    Do not use this evaluator directly. Prefer the CGF1Evaluator wrapper.
    Notes:
    - This evaluator does not support per-category evaluation (in the way defined by pyCocotools)
    - In open vocabulary settings, we have different noun-phrases for each image. What we call an "image_id" here is actually an (image, noun-phrase) pair. So in every "image_id" there is only one category, implied by the noun-phrase. Thus we can ignore the usual coco "category" field of the predictions
    """

    def __init__(
        self,
        coco_gt=None,
        coco_dt=None,
        iouType="segm",
        threshold=0.5,
    ):
        """
        Args:
            coco_gt (COCO): ground truth COCO API
            coco_dt (COCO): detections COCO API
            iouType (str): type of IoU to evaluate ("segm" or "bbox")
            threshold (float): score threshold; only predictions with
                score >= threshold are kept during evaluation
        """
        super().__init__(coco_gt, coco_dt, iouType)
        self.threshold = threshold
        # Per-category evaluation is not supported (see class docstring).
        self.params.useCats = False
        # Single "all" area range; keep effectively every detection.
        self.params.areaRng = [[0**2, 1e5**2]]
        self.params.areaRngLbl = ["all"]
        self.params.maxDets = [1000000]

    def computeIoU(self, imgId, catId):
        # Same as the original COCOeval.computeIoU, but without sorting
        # (matching is done with the Hungarian algorithm in evaluateImg,
        # so the score-based sort and maxDets truncation are unnecessary).
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []
        if p.iouType == "segm":
            g = [g["segmentation"] for g in gt]
            d = [d["segmentation"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bbox"] for g in gt]
            d = [d["bbox"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")
        # compute iou between each dt and gt region
        iscrowd = [int(o["iscrowd"]) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious

    def evaluateImg(self, imgId, catId, aRng, maxDet):
        """
        perform evaluation for single category and image
        :return: dict (single image results)
        """
        p = self.params
        assert not p.useCats, "This evaluator does not support per-category evaluation."
        assert catId == -1
        # Gather GTs/DTs across the dummy category; drop ignored GTs and
        # low-scoring DTs (hard score threshold instead of score sweeping).
        all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
        keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
        gt = [g for g in all_gts if not g["ignore"]]
        all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
        dt = [d for d in all_dts if d["score"] >= self.threshold]
        if len(gt) == 0 and len(dt) == 0:
            # This is a "true negative" case, where there are no GTs and no predictions
            # The box-level metrics are ill-defined, so we don't add them to this dict
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 1,
                "IL_FP": 0,
                "IL_FN": 0,
                "num_dt": len(dt),
            }
        if len(gt) > 0 and len(dt) == 0:
            # This is a "false negative" case, where there are GTs but no predictions
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 0,
                "IL_FP": 0,
                "IL_FN": 1,
                "TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
                "local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }
        # Load pre-computed ious
        ious = self.ious[(imgId, catId)]
        # compute matching (one-to-one, maximizing total IoU via Hungarian
        # assignment, instead of the greedy score-ordered COCO matching)
        if len(ious) == 0:
            ious = np.zeros((len(dt), len(gt)))
        else:
            # self.ious was computed on ALL gts/dts; subset to kept ones.
            ious = ious[keep_dt, :][:, keep_gt]
        assert ious.shape == (len(dt), len(gt))
        matched_dt, matched_gt = linear_sum_assignment(-ious)
        match_scores = ious[matched_dt, matched_gt]
        TPs, FPs, FNs = [], [], []
        IL_perfect = []
        for thresh in p.iouThrs:
            # A matched pair counts as TP only if its IoU clears the threshold.
            TP = (match_scores >= thresh).sum()
            FP = len(dt) - TP
            FN = len(gt) - TP
            assert (
                FP >= 0 and FN >= 0
            ), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
            TPs.append(TP)
            FPs.append(FP)
            FNs.append(FN)
            if FP == FN and FP == 0:
                IL_perfect.append(1)
            else:
                IL_perfect.append(0)
        TPs = np.array(TPs, dtype=np.int64)
        FPs = np.array(FPs, dtype=np.int64)
        FNs = np.array(FNs, dtype=np.int64)
        IL_perfect = np.array(IL_perfect, dtype=np.int64)
        # compute precision recall and F1 (epsilon avoids division by zero)
        precision = TPs / (TPs + FPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        result = {
            "image_id": imgId,
            "TPs": TPs,
            "FPs": FPs,
            "FNs": FNs,
            "local_F1s": F1,
            # Image-level confusion entries: "positive" = at least one GT.
            "IL_TP": (len(gt) > 0) and (len(dt) > 0),
            "IL_FP": (len(gt) == 0) and (len(dt) > 0),
            "IL_TN": (len(gt) == 0) and (len(dt) == 0),
            "IL_FN": (len(gt) > 0) and (len(dt) == 0),
            "num_dt": len(dt),
        }
        if len(gt) > 0 and len(dt) > 0:
            # Only defined when both GTs and predictions exist ("positive" images).
            result["local_positive_F1s"] = F1
        return result

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        """
        if self.evalImgs is None or len(self.evalImgs) == 0:
            print("Please run evaluate() first")
        # allows input customized parameters
        if p is None:
            p = self.params
        setImgIds = set(p.imgIds)
        # TPs, FPs, FNs (one entry per IoU threshold)
        TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        # FPs counted only on "positive" images (those with at least one GT).
        pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)
        # Image level metrics
        IL_TPs = 0
        IL_FPs = 0
        IL_TNs = 0
        IL_FNs = 0
        valid_img_count = 0
        valid_F1_count = 0
        evaledImgIds = set()
        for res in self.evalImgs:
            if res["image_id"] not in setImgIds:
                continue
            evaledImgIds.add(res["image_id"])
            IL_TPs += res["IL_TP"]
            IL_FPs += res["IL_FP"]
            IL_TNs += res["IL_TN"]
            IL_FNs += res["IL_FN"]
            if "TPs" not in res:
                # true-negative images carry no box-level counts
                continue
            TPs += res["TPs"]
            FPs += res["FPs"]
            FNs += res["FNs"]
            valid_img_count += 1
            if "local_positive_F1s" in res:
                local_F1s += res["local_positive_F1s"]
                pmFPs += res["FPs"]
            if res["num_dt"] > 0:
                valid_F1_count += 1
        assert len(setImgIds - evaledImgIds) == 0, (
            f"{len(setImgIds - evaledImgIds)} images not evaluated. "
            f"Here are the IDs of the first 3: {list(setImgIds - evaledImgIds)[:3]}"
        )
        # compute precision recall and F1
        precision = TPs / (TPs + FPs + 1e-4)
        positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        positive_micro_F1 = (
            2
            * positive_micro_precision
            * recall
            / (positive_micro_precision + recall + 1e-4)
        )
        IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
        IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
        IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
        IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
        # Matthews correlation coefficient on the image-level confusion matrix.
        IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
            (
                float(IL_TPs + IL_FPs)
                * float(IL_TPs + IL_FNs)
                * float(IL_TNs + IL_FPs)
                * float(IL_TNs + IL_FNs)
            )
            ** 0.5
            + 1e-6
        )
        self.eval = {
            "params": p,
            "TPs": TPs,
            "FPs": FPs,
            "positive_micro_FPs": pmFPs,
            "FNs": FNs,
            "precision": precision,
            "positive_micro_precision": positive_micro_precision,
            "recall": recall,
            "F1": F1,
            "positive_micro_F1": positive_micro_F1,
            # NOTE(review): the numerator sums F1 only over images with both GTs
            # and predictions, while the denominator also counts images with
            # predictions but no "local_positive_F1s" — confirm this is intended.
            "positive_macro_F1": local_F1s / valid_F1_count,
            "IL_recall": IL_rec,
            "IL_precision": IL_prec,
            "IL_F1": IL_F1,
            "IL_FPR": IL_FPR,
            "IL_MCC": IL_MCC,
        }
        # cgF1 = box-level positive micro F1 modulated by image-level MCC.
        self.eval["cgF1"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]

    def summarize(self):
        """
        Compute and display summary metrics for evaluation results.
        """
        if not self.eval:
            raise Exception("Please run accumulate() first")

        def _summarize(iouThr=None, metric=""):
            # Print and return a box-level metric, averaged over all IoU
            # thresholds when iouThr is None, else picked at that threshold.
            p = self.params
            iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
            titleStr = "Average " + metric
            iouStr = (
                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
                if iouThr is None
                else "{:0.2f}".format(iouThr)
            )
            s = self.eval[metric]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, iouStr, mean_s))
            return mean_s

        def _summarize_single(metric=""):
            # Print and return a scalar (image-level) metric.
            titleStr = "Average " + metric
            iStr = " {:<35} = {:0.3f}"
            s = self.eval[metric]
            print(iStr.format(titleStr, s))
            return s

        def _summarizeDets():
            # Emit stats in the exact order of CGF1_METRICS.
            stats = []
            for metric in CGF1_METRICS:
                if metric.image_level:
                    stats.append(_summarize_single(metric=metric.name))
                else:
                    stats.append(
                        _summarize(iouThr=metric.iou_threshold, metric=metric.name)
                    )
            return np.asarray(stats)

        summarize = _summarizeDets
        self.stats = summarize()
def _evaluate(self):
    """Run per-image evaluation on ``self`` (a CGF1Eval instance).

    Mirrors COCOeval.evaluate() but returns the results instead of only
    storing them, and reshapes them into [numCats, numAreas, numImgs].

    Returns:
        (imgIds, evalImgs) where imgIds is the deduplicated, sorted image id
        list and evalImgs is an object ndarray of per-image result dicts.
    """
    params = self.params
    # Deduplicate/sort image ids and force category-agnostic evaluation.
    params.imgIds = list(np.unique(params.imgIds))
    params.useCats = False
    params.maxDets = sorted(params.maxDets)
    self.params = params
    self._prepare()
    # Only the dummy category -1 is evaluated (useCats is False).
    cat_ids = [-1]
    if params.iouType not in ("segm", "bbox"):
        raise RuntimeError(f"Unsupported iou {params.iouType}")
    iou_fn = self.computeIoU
    self.ious = {}
    for img_id in params.imgIds:
        for cat_id in cat_ids:
            self.ious[(img_id, cat_id)] = iou_fn(img_id, cat_id)
    top_k = params.maxDets[-1]
    per_image = []
    for cat_id in cat_ids:
        for area_rng in params.areaRng:
            for img_id in params.imgIds:
                per_image.append(self.evaluateImg(img_id, cat_id, area_rng, top_k))
    # this is NOT in the pycocotools code, but could be done outside
    per_image = np.asarray(per_image).reshape(
        len(cat_ids), len(params.areaRng), len(params.imgIds)
    )
    return params.imgIds, per_image
class CGF1Evaluator:
    """
    Wrapper class for cgF1 evaluation.
    This supports the oracle setting (when several ground-truths are available per image)
    """

    def __init__(
        self,
        gt_path: Union[str, List[str]],
        iou_type="segm",
        verbose=False,
    ):
        """
        Args:
            gt_path (str or list of str): path(s) to ground truth COCO json file(s).
                Several paths enable the "oracle" setting: each image is scored
                against every GT and the best scoring is kept.
            iou_type (str): type of IoU to evaluate ("segm" or "bbox")
            verbose (bool): if True, print progress information
        """
        self.gt_paths = gt_path if isinstance(gt_path, list) else [gt_path]
        self.iou_type = iou_type
        self.coco_gts = [COCOCustom(gt) for gt in self.gt_paths]
        self.verbose = verbose
        # One CGF1Eval per ground truth (oracle setting).
        self.coco_evals = []
        for i, coco_gt in enumerate(self.coco_gts):
            self.coco_evals.append(
                CGF1Eval(
                    coco_gt=coco_gt,
                    iouType=iou_type,
                )
            )
            self.coco_evals[i].useCats = False
        exclude_img_ids = set()
        # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
        for coco_gt in self.coco_gts[1:]:
            exclude_img_ids = exclude_img_ids.union(
                {
                    img["id"]
                    for img in coco_gt.dataset["images"]
                    if not img["is_instance_exhaustive"]
                }
            )
        # we only eval on instance exhaustive queries
        self.eval_img_ids = [
            img["id"]
            for img in self.coco_gts[0].dataset["images"]
            if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
        ]

    def evaluate(self, pred_file: str):
        """
        Evaluate the detections using cgF1 metric.
        Args:
            pred_file: path to the predictions COCO json file
        Returns:
            dict mapping "cgF1_eval_{iou_type}_{metric}" to a float value,
            one entry per metric in CGF1_METRICS.
        """
        assert len(self.coco_gts) > 0, "No ground truth provided for evaluation."
        assert len(self.coco_gts) == len(
            self.coco_evals
        ), "Mismatch in number of ground truths and evaluators."
        if self.verbose:
            print(f"Loading predictions from {pred_file}")
        with open(pred_file, "r") as f:
            preds = json.load(f)
        if self.verbose:
            print(f"Loaded {len(preds)} predictions")
        # Group predictions by image for per-image evaluation.
        img2preds = defaultdict(list)
        for pred in preds:
            img2preds[pred["image_id"]].append(pred)
        all_eval_imgs = []
        for img_id in tqdm(self.eval_img_ids, disable=not self.verbose):
            results = img2preds[img_id]
            all_scorings = []
            # Score this image against every available ground truth.
            for cur_coco_gt, coco_eval in zip(self.coco_gts, self.coco_evals):
                # suppress pycocotools prints
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        coco_dt = (
                            cur_coco_gt.loadRes(results) if results else COCOCustom()
                        )
                        coco_eval.cocoDt = coco_dt
                        coco_eval.params.imgIds = [img_id]
                        coco_eval.params.useCats = False
                        # module-level helper defined above (not a method)
                        img_ids, eval_imgs = _evaluate(coco_eval)
                        all_scorings.append(eval_imgs)
            selected = self._select_best_scoring(all_scorings)
            all_eval_imgs.append(selected)
        # After this point, we have selected the best scoring per image among several ground truths
        # we can now accumulate and summarize, using only the first coco_eval
        self.coco_evals[0].evalImgs = list(
            np.concatenate(all_eval_imgs, axis=2).flatten()
        )
        self.coco_evals[0].params.imgIds = self.eval_img_ids
        self.coco_evals[0]._paramsEval = copy.deepcopy(self.coco_evals[0].params)
        if self.verbose:
            print(f"Accumulating results")
        self.coco_evals[0].accumulate()
        print("cgF1 metric, IoU type={}".format(self.iou_type))
        self.coco_evals[0].summarize()
        print()
        # Name the stats after CGF1_METRICS (same order as summarize()).
        out = {}
        for i, value in enumerate(self.coco_evals[0].stats):
            name = CGF1_METRICS[i].name
            if CGF1_METRICS[i].iou_threshold is not None:
                name = f"{name}@{CGF1_METRICS[i].iou_threshold}"
            out[f"cgF1_eval_{self.iou_type}_{name}"] = float(value)
        return out

    @staticmethod
    def _select_best_scoring(scorings):
        # This function is used for "oracle" type evaluation.
        # It accepts the evaluation results with respect to several ground truths, and picks the best
        if len(scorings) == 1:
            return scorings[0]
        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"
        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"
        selected_imgs = []
        for img_id in range(scorings[0].shape[-1]):
            best = scorings[0][:, :, img_id]
            for scoring in scorings[1:]:
                current = scoring[:, :, img_id]
                if "local_F1s" in best[0, 0] and "local_F1s" in current[0, 0]:
                    # we were able to compute a F1 score for this particular image in both evaluations
                    # best["local_F1s"] contains the results at various IoU thresholds. We simply take the average for comparison
                    best_score = best[0, 0]["local_F1s"].mean()
                    current_score = current[0, 0]["local_F1s"].mean()
                    if current_score > best_score:
                        best = current
                else:
                    # If we're here, it means that in some evaluation we were not able to get a valid local F1
                    # This happens when both the predictions and targets are empty. In that case, we can assume it's a perfect prediction
                    if "local_F1s" not in current[0, 0]:
                        best = current
            selected_imgs.append(best)
        result = np.stack(selected_imgs, axis=-1)
        assert result.shape == scorings[0].shape
        return result

916
sam3/eval/coco_eval.py Normal file
View File

@@ -0,0 +1,916 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
COCO evaluator that works in distributed mode.
Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
The difference is that there is less copy-pasting from pycocotools
in the end of the file, as python3 can suppress prints with contextlib
"""
import contextlib
import copy
import json
import logging
import os
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Any, List, Optional
import numpy as np
import pycocotools.mask as mask_utils
import torch
from iopath.common.file_io import g_pathmgr
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from sam3.train.masks_ops import rle_encode
from sam3.train.utils.distributed import (
all_gather,
gather_to_rank_0_via_filesys,
get_rank,
is_main_process,
)
# Maps the integer "rarity" value attached to GT images to a readable bucket name.
RARITY_BUCKETS = {0: "frequent", 1: "common", 2: "medium", 3: "rare"}
class CocoEvaluator:
    def __init__(
        self,
        coco_gt,
        iou_types: List[str],
        useCats: bool,
        dump_dir: Optional[str],
        postprocessor,
        average_by_rarity=False,
        metrics_dump_dir: Optional[str] = None,
        gather_pred_via_filesys=False,
        use_normalized_areas=True,
        # NOTE: mutable default; it is only read, never mutated in place.
        maxdets=[1, 10, 100],
        exhaustive_only=False,
        all_exhaustive_only=True,
    ):
        """Online coco evaluator. It will evaluate images as they are generated by the model, then accumulate/summarize at the end
        Args:
        - coco_gt: COCO api object containing the gt (or a path string, or a list
          of either for oracle-style evaluation against several ground truths)
        - iou_types: can be either "bbox" or "segm"
        - useCats: If true, categories will be used for evaluation
        - dump_dir: if non null, then the predictions will be dumped in that directory
        - postprocessor: Module to convert the model's output into the coco format
        - average_by_rarity: if true then we expect the images information in the gt dataset
          to have a "rarity" field. Then the AP will be computed on all rarity buckets
          individually, then averaged
        - metrics_dump_dir: if non null, directory where metrics can be dumped
        - gather_pred_via_filesys: if true, we use the filesystem for collective gathers
        - use_normalized_areas: if true, the areas of the objects in the GT are assumed to be
          normalized by the area of the image. In that case, the size buckets are adjusted
        - maxdets: maximal number of detections to be evaluated on each image.
        - exhaustive_only: If true, we restrict eval only to exhaustive annotations
        - all_exhaustive_only: If true, datapoints are restricted only to those with all exhaustive annotations
        """
        # coco_gt = copy.deepcopy(coco_gt)
        self.coco_gts = [coco_gt] if not isinstance(coco_gt, list) else coco_gt
        assert len(maxdets) == 3, f"expecting 3 detection threshold, got {len(maxdets)}"
        self.use_normalized_areas = use_normalized_areas
        self.iou_types = iou_types
        self.useCats = useCats
        self.maxdets = maxdets
        self.dump = None
        self.dump_dir = dump_dir
        if self.dump_dir is not None:
            self.dump = []
            if is_main_process():
                if not os.path.exists(self.dump_dir):
                    os.makedirs(self.dump_dir, exist_ok=True)
                    logging.info(f"Create the folder: {dump_dir}")
        # Heavy setup (loading GT json, building COCOeval objects) is deferred
        # to _lazy_init so constructing the evaluator stays cheap.
        self.initialized = False
        # Whether to gather predictions through filesystem (instead of torch
        # collective ops; requiring a shared filesystem across all ranks)
        self.gather_pred_via_filesys = gather_pred_via_filesys
        self.use_self_evaluate = True  # CPP version is disabled
        self.postprocessor = postprocessor
        self.average_by_rarity = average_by_rarity
        self.exhaustive_only = exhaustive_only
        self.all_exhaustive_only = all_exhaustive_only
        self.metrics_dump_dir = metrics_dump_dir
        if self.metrics_dump_dir is not None:
            if is_main_process():
                if not os.path.exists(self.metrics_dump_dir):
                    os.makedirs(self.metrics_dump_dir, exist_ok=True)
                    logging.info(f"Create the folder: {metrics_dump_dir}")
    def _lazy_init(self, coco_cls=COCO):
        """Load GT files, build COCOeval objects and eval-image filters.

        Idempotent: runs once, subsequent calls return immediately.
        """
        if self.initialized:
            return
        self.initialized = True
        # GT entries may be path strings; load them through coco_cls.
        self.coco_gts = [
            coco_cls(g_pathmgr.get_local_path(gt)) if isinstance(gt, str) else gt
            for gt in self.coco_gts
        ]
        self.reset()
        self.eval_img_ids = None
        if self.exhaustive_only:
            exclude_img_ids = set()
            # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
            if self.all_exhaustive_only:
                for coco_gt in self.coco_gts[1:]:
                    exclude_img_ids = exclude_img_ids.union(
                        {
                            img["id"]
                            for img in coco_gt.dataset["images"]
                            if not img["is_instance_exhaustive"]
                        }
                    )
            # we only eval on instance exhaustive queries
            self.eval_img_ids = [
                img["id"]
                for img in self.coco_gts[0].dataset["images"]
                if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
            ]
        self.rarity_buckets = None
        if self.average_by_rarity:
            # Group eligible image ids by their "rarity" field for per-bucket AP.
            self.rarity_buckets = defaultdict(list)
            eval_img_ids_set = (
                set(self.eval_img_ids) if self.eval_img_ids is not None else None
            )
            for img in self.coco_gts[0].dataset["images"]:
                if self.eval_img_ids is not None and img["id"] not in eval_img_ids_set:
                    continue
                self.rarity_buckets[img["rarity"]].append(img["id"])
            print("Rarity buckets sizes:")
            for k, v in self.rarity_buckets.items():
                print(f"{k}: {len(v)}")
    def set_sync_device(self, device: torch.device) -> None:
        # Remember the device used for synchronization/collective ops.
        self._sync_device = device
    def _evaluate(self, *args, **kwargs):
        # Thin indirection over the module-level ``evaluate`` helper
        # (defined elsewhere in this file), overridable by subclasses.
        return evaluate(*args, **kwargs)
    def _loadRes(self, *args, **kwargs):
        # Thin indirection over the module-level ``loadRes`` helper
        # (defined elsewhere in this file), overridable by subclasses.
        return loadRes(*args, **kwargs)
    def update(self, *args, **kwargs):
        """Evaluate one batch of model outputs and buffer the per-image results.

        Arguments are forwarded to the postprocessor, which returns a
        {image_id: prediction} mapping in coco format.
        """
        self._lazy_init()
        predictions = self.postprocessor.process_results(*args, **kwargs)
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)
        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)
            self._dump(results)
            assert len(self.coco_gts) == len(self.coco_evals)
            all_scorings = []
            # Score against every ground truth (oracle setting).
            for cur_coco_gt, cur_coco_eval in zip(self.coco_gts, self.coco_evals):
                # suppress pycocotools prints
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        coco_dt = (
                            self._loadRes(cur_coco_gt, results) if results else COCO()
                        )
                        coco_eval = cur_coco_eval[iou_type]
                        coco_eval.cocoDt = coco_dt
                        coco_eval.params.imgIds = list(img_ids)
                        coco_eval.params.useCats = self.useCats
                        coco_eval.params.maxDets = self.maxdets
                        # NOTE: rebinds ``img_ids`` with the (deduplicated)
                        # ids returned by the evaluation helper.
                        img_ids, eval_imgs = self._evaluate(coco_eval, self.use_self_evaluate)
                        all_scorings.append(eval_imgs)
            selected = self.select_best_scoring(all_scorings)
            self.eval_imgs[iou_type].append(selected)
def select_best_scoring(self, scorings):
# This function is used for "oracle" type evaluation.
# It accepts the evaluation results with respect to several ground truths, and picks the best
if len(scorings) == 1:
return scorings[0]
# Currently we don't support Oracle Phrase AP.
# To implement it, we likely need to modify the cpp code since the eval_image type is opaque
raise RuntimeError("Not implemented")
def _dump(self, results):
if self.dump is not None:
dumped_results = copy.deepcopy(results)
for r in dumped_results:
if "bbox" not in self.iou_types and "bbox" in r:
del r["bbox"]
elif "bbox" in r:
r["bbox"] = [round(coord, 5) for coord in r["bbox"]]
r["score"] = round(r["score"], 5)
self.dump.extend(dumped_results)
    def synchronize_between_processes(self):
        """Merge per-rank eval results into rank 0 and dump local predictions.

        Concatenates this rank's buffered eval images along the image axis,
        then hands them to ``create_common_coco_eval`` (module-level helper)
        which gathers across ranks.
        """
        self._lazy_init()
        logging.info("Coco evaluator: Synchronizing between processes")
        for iou_type in self.iou_types:
            if len(self.eval_imgs[iou_type]) > 0:
                # axis 2 is the image axis of the [numCats, numAreas, numImgs] arrays
                self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            else:
                num_areas = len(self.coco_evals[0][iou_type].params.areaRng)
                # assuming 1 class
                assert not self.useCats
                self.eval_imgs[iou_type] = np.empty((1, num_areas, 0))
            create_common_coco_eval(
                self.coco_evals[0][iou_type],
                self.img_ids,
                self.eval_imgs[iou_type],
                use_self_evaluate=self.use_self_evaluate,
                gather_pred_via_filesys=self.gather_pred_via_filesys,
                metrics_dump_dir=self.metrics_dump_dir,
            )
        if self.dump is not None:
            # Each rank writes its own prediction file (no cross-rank gather).
            dumped_file = Path(self.dump_dir) / f"coco_predictions_{get_rank()}.json"
            logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
            with g_pathmgr.open(str(dumped_file), "w") as f:
                json.dump(self.dump, f)
            # if self.gather_pred_via_filesys:
            #     dump = gather_to_rank_0_via_filesys(self.dump)
            # else:
            #     dump = all_gather(self.dump, force_cpu=True)
            # self.dump = sum(dump, [])
    def accumulate(self, imgIds=None):
        """Accumulate the per-image results, optionally restricted to ``imgIds``.

        Only runs on the main process. When ``imgIds`` is given, the evaluator's
        image list and eval images are temporarily masked down to that subset,
        then restored after accumulation.
        """
        self._lazy_init()
        logging.info(
            f"Coco evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
        )
        if not is_main_process():
            return
        if imgIds is None:
            for coco_eval in self.coco_evals[0].values():
                # module-level accumulate helper (defined elsewhere in this file)
                accumulate(coco_eval, use_self_eval=self.use_self_evaluate)
        if imgIds is not None:
            imgIds = set(imgIds)
            for coco_eval in self.coco_evals[0].values():
                p = coco_eval.params
                # Boolean mask over the evaluator's full image list.
                id_mask = np.array([(i in imgIds) for i in p.imgIds], dtype=bool)
                old_img_ids = p.imgIds
                coco_eval.params.imgIds = np.asarray(p.imgIds)[id_mask]
                old_img_evals = coco_eval.evalImgs
                catIds = p.catIds if p.useCats else [-1]
                # evalImgs is flat in [cat, area, img] order; reshape so the
                # image axis can be masked, then flatten back.
                coco_eval.evalImgs = list(
                    np.asarray(coco_eval.evalImgs)
                    .reshape(len(catIds), len(p.areaRng), len(old_img_ids))[
                        ..., id_mask
                    ]
                    .flatten()
                )
                accumulate(coco_eval, use_self_eval=self.use_self_evaluate)
                # Restore the unmasked state so accumulate can be called again.
                coco_eval.evalImgs = old_img_evals
                coco_eval.params.imgIds = old_img_ids
    def summarize(self):
        """Accumulate (if needed) and print summary metrics; return them as a dict.

        Only the main process returns a populated dict; other ranks return {}.
        When rarity buckets are enabled, metrics are computed per bucket and
        also averaged across buckets.
        """
        self._lazy_init()
        logging.info("Coco evaluator: Summarizing")
        if not is_main_process():
            return {}
        outs = {}
        if self.rarity_buckets is None:
            self.accumulate(self.eval_img_ids)
            for iou_type, coco_eval in self.coco_evals[0].items():
                print("IoU metric: {}".format(iou_type))
                # module-level summarize helper (defined elsewhere in this file);
                # it fills coco_eval.stats with (keys, values)
                summarize(coco_eval)
            if "bbox" in self.coco_evals[0]:
                for key, value in zip(*self.coco_evals[0]["bbox"].stats):
                    outs[f"coco_eval_bbox_{key}"] = value
            if "segm" in self.coco_evals[0]:
                for key, value in zip(*self.coco_evals[0]["segm"].stats):
                    outs[f"coco_eval_masks_{key}"] = value
        else:
            # Per-rarity-bucket evaluation: accumulate/summarize once per bucket,
            # record bucket-prefixed metrics, and average the stat vectors.
            total_stats = {}
            all_keys = {}
            for bucket, img_list in self.rarity_buckets.items():
                self.accumulate(imgIds=img_list)
                bucket_name = RARITY_BUCKETS[bucket]
                for iou_type, coco_eval in self.coco_evals[0].items():
                    print(f"IoU metric: {iou_type}. Rarity bucket: {bucket_name}")
                    summarize(coco_eval)
                if "bbox" in self.coco_evals[0]:
                    if "bbox" not in total_stats:
                        total_stats["bbox"] = np.zeros_like(
                            self.coco_evals[0]["bbox"].stats[1]
                        )
                        all_keys["bbox"] = self.coco_evals[0]["bbox"].stats[0]
                    total_stats["bbox"] += self.coco_evals[0]["bbox"].stats[1]
                    for key, value in zip(*self.coco_evals[0]["bbox"].stats):
                        outs[f"coco_eval_bbox_{bucket_name}_{key}"] = value
                if "segm" in self.coco_evals[0]:
                    if "segm" not in total_stats:
                        total_stats["segm"] = np.zeros_like(
                            self.coco_evals[0]["segm"].stats[1]
                        )
                        all_keys["segm"] = self.coco_evals[0]["segm"].stats[0]
                    total_stats["segm"] += self.coco_evals[0]["segm"].stats[1]
                    for key, value in zip(*self.coco_evals[0]["segm"].stats):
                        outs[f"coco_eval_masks_{bucket_name}_{key}"] = value
            # Unprefixed metrics are the bucket averages.
            if "bbox" in total_stats:
                total_stats["bbox"] /= len(self.rarity_buckets)
                for key, value in zip(all_keys["bbox"], total_stats["bbox"]):
                    outs[f"coco_eval_bbox_{key}"] = value
            if "segm" in total_stats:
                total_stats["segm"] /= len(self.rarity_buckets)
                for key, value in zip(all_keys["segm"], total_stats["segm"]):
                    outs[f"coco_eval_masks_{key}"] = value
        # if self.dump is not None:
        #     assert self.dump_dir is not None
        #     logging.info("Coco evaluator: Dumping the global result file to disk")
        #     with g_pathmgr.open(str(Path(self.dump_dir) / "coco_eval.json"), "w") as f:
        #         json.dump(self.dump, f)
        return outs
    def compute_synced(self):
        """Synchronize results across ranks, then summarize and return metrics."""
        self._lazy_init()
        self.synchronize_between_processes()
        return self.summarize()
    def compute(self):
        # Placeholder metric dict; real results come from compute_synced().
        self._lazy_init()
        return {"": 0.0}
def reset(self, cocoeval_cls=COCOeval):
    """Re-create fresh COCOeval objects for every GT set and IoU type, and clear state.

    Args:
        cocoeval_cls: COCOeval-compatible class used to build the evaluators.
    """
    # Custom, image-relative area ranges used when use_normalized_areas is set.
    normalized_rngs = [
        [0, 1e5],
        [0, 0.001],
        [0.001, 0.01],
        [0.01, 0.1],
        [0.1, 0.5],
        [0.5, 0.95],
        [0.95, 1e5],
    ]
    normalized_lbls = [
        "all",
        "tiny",
        "small",
        "medium",
        "large",
        "huge",
        "whole_image",
    ]
    self.coco_evals = []
    for coco_gt in self.coco_gts:
        per_type = {}
        for iou_type in self.iou_types:
            evaluator = cocoeval_cls(coco_gt, iouType=iou_type)
            evaluator.params.useCats = self.useCats
            evaluator.params.maxDets = self.maxdets
            if self.use_normalized_areas:
                # Copy the lists so evaluators never share mutable params.
                evaluator.params.areaRng = [list(rng) for rng in normalized_rngs]
                evaluator.params.areaRngLbl = list(normalized_lbls)
            per_type[iou_type] = evaluator
        self.coco_evals.append(per_type)
    self.img_ids = []
    self.eval_imgs = {iou_type: [] for iou_type in self.iou_types}
    if self.dump is not None:
        self.dump = []
def write(self, stats):
    """Write the results in the stats dict"""
    # FIX: the docstring above was originally placed *after* the first
    # statement, where it is a no-op string expression rather than a
    # docstring; it has been moved to the top of the function.
    self._lazy_init()
    # NOTE(review): `.stats` is assumed to be an ndarray (as produced by
    # upstream COCOeval.summarize); the patched summarize in this file sets a
    # (keys, values) tuple instead, on which `.tolist()` would fail — confirm
    # which summarize is in effect before calling write().
    if "bbox" in self.coco_evals[0]:
        stats["coco_eval_bbox"] = self.coco_evals[0]["bbox"].stats.tolist()
    if "segm" in self.coco_evals[0]:
        stats["coco_eval_masks"] = self.coco_evals[0]["segm"].stats.tolist()
    return stats
def prepare(self, predictions, iou_type):
    """Dispatch `predictions` to the converter matching `iou_type`.

    Raises:
        ValueError: if `iou_type` is not one of "bbox", "segm", "keypoints".
    """
    self._lazy_init()
    method_by_type = {
        "bbox": "prepare_for_coco_detection",
        "segm": "prepare_for_coco_segmentation",
        "keypoints": "prepare_for_coco_keypoint",
    }
    try:
        method_name = method_by_type[iou_type]
    except KeyError:
        raise ValueError("Unknown iou type {}".format(iou_type)) from None
    return getattr(self, method_name)(predictions)
def prepare_for_coco_detection(self, predictions):
    """Convert per-image detection predictions into COCO-format result dicts.

    Args:
        predictions: mapping of image id -> dict with "boxes" (xyxy tensor),
            "scores" and "labels" tensors. Empty dicts are skipped.

    Returns:
        Flat list of {"image_id", "category_id", "bbox", "score"} dicts,
        with boxes in xywh format.
    """
    self._lazy_init()
    results = []
    for image_id, pred in predictions.items():
        if len(pred) == 0:
            continue
        # xyxy -> xywh, inlined here on the tensor before listification.
        xmin, ymin, xmax, ymax = pred["boxes"].unbind(-1)
        xywh = torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=-1).tolist()
        scores = pred["scores"].tolist()
        labels = pred["labels"].tolist()
        for idx, box in enumerate(xywh):
            results.append(
                {
                    "image_id": image_id,
                    "category_id": labels[idx],
                    "bbox": box,
                    "score": scores[idx],
                }
            )
    return results
@torch.no_grad()
def prepare_for_coco_segmentation(self, predictions):
    """Convert per-image segmentation predictions into COCO-format result dicts.

    Args:
        predictions: mapping of image id -> prediction dict with "scores" and
            "labels" tensors and either "masks_rle" (pre-encoded RLE dicts) or
            "masks" (dense scores, thresholded at 0.5 here). May also carry
            "boundaries"/"dilated_boundaries", which are forwarded per item.

    Returns:
        Flat list of COCO-style result dicts; "area" is normalized by the
        image size (h * w), i.e. a fraction in [0, 1].
    """
    self._lazy_init()
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        boundaries, dilated_boundaries = None, None
        if "boundaries" in prediction:
            boundaries = prediction["boundaries"]
            dilated_boundaries = prediction["dilated_boundaries"]
            assert dilated_boundaries is not None
            assert len(scores) == len(boundaries)
        if "masks_rle" in prediction:
            # Masks already RLE-encoded: normalized area comes from the RLE.
            rles = prediction["masks_rle"]
            areas = []
            for rle in rles:
                cur_area = mask_utils.area(rle)
                h, w = rle["size"]
                areas.append(cur_area / (h * w))
        else:
            # Dense masks: binarize at 0.5, compute areas, then RLE-encode.
            masks = prediction["masks"]
            masks = masks > 0.5
            h, w = masks.shape[-2:]
            areas = masks.flatten(1).sum(1) / (h * w)
            areas = areas.tolist()
            rles = rle_encode(masks.squeeze(1))
            # memory clean: dense masks can be large, free them eagerly
            # (note: this mutates the caller's prediction dict)
            del masks
            del prediction["masks"]
        assert len(areas) == len(rles) == len(scores)
        for k, rle in enumerate(rles):
            payload = {
                "image_id": original_id,
                "category_id": labels[k],
                "segmentation": rle,
                "score": scores[k],
                "area": areas[k],
            }
            if boundaries is not None:
                payload["boundary"] = boundaries[k]
                payload["dilated_boundary"] = dilated_boundaries[k]
            coco_results.append(payload)
    return coco_results
def prepare_for_coco_keypoint(self, predictions):
    """Convert per-image keypoint predictions into COCO-format result dicts.

    Args:
        predictions: mapping of image id -> dict with "scores", "labels" and
            "keypoints" tensors. Empty dicts are skipped.

    Returns:
        Flat list of {"image_id", "category_id", "keypoints", "score"} dicts,
        with keypoints flattened to a single list per instance.
    """
    self._lazy_init()
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        # FIX: the original also converted prediction["boxes"] to xywh here,
        # but the result was never used; that dead computation was removed.
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        keypoints = prediction["keypoints"]
        keypoints = keypoints.flatten(start_dim=1).tolist()
        coco_results.extend(
            [
                {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "keypoints": keypoint,
                    "score": scores[k],
                }
                for k, keypoint in enumerate(keypoints)
            ]
        )
    return coco_results
def convert_to_xywh(boxes):
    """Convert boxes from (xmin, ymin, xmax, ymax) to (x, y, width, height)."""
    top_left = boxes[..., :2]
    sizes = boxes[..., 2:] - top_left
    return torch.cat((top_left, sizes), dim=-1)
def merge(img_ids, eval_imgs, gather_pred_via_filesys=False):
    """Gather per-rank COCO eval results and merge them on the main process.

    Args:
        img_ids: image ids evaluated by this rank.
        eval_imgs: this rank's per-image evaluation array (images on axis 2).
        gather_pred_via_filesys: if True, gather through a shared filesystem
            instead of torch collective ops.

    Returns:
        (merged_img_ids, merged_eval_imgs) on the main process — deduplicated
        and sorted by image id; (None, None) on every other rank.
    """
    if gather_pred_via_filesys:
        # only gather the predictions to rank 0 (other ranks will receive empty
        # lists for `all_img_ids` and `all_eval_imgs`, which should be OK as
        # merging and evaluation are only done on rank 0)
        all_img_ids = gather_to_rank_0_via_filesys(img_ids)
        all_eval_imgs = gather_to_rank_0_via_filesys(eval_imgs)
    else:
        all_img_ids = all_gather(img_ids, force_cpu=True)
        all_eval_imgs = all_gather(eval_imgs, force_cpu=True)
    if not is_main_process():
        return None, None
    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)
    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)
    merged_img_ids = np.array(merged_img_ids)
    # per-rank arrays are stacked per-image along axis 2
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]
    return merged_img_ids, merged_eval_imgs
def create_common_coco_eval(
    coco_eval,
    img_ids,
    eval_imgs,
    use_self_evaluate,
    gather_pred_via_filesys=False,
    metrics_dump_dir=None,
):
    """Merge distributed eval results into `coco_eval` on the main process.

    Gathers per-rank (img_ids, eval_imgs), optionally dumps per-image metrics
    to `metrics_dump_dir`, evaluates empty (dummy) detections for any GT
    images that no rank predicted, and installs the merged results onto
    `coco_eval` so that `accumulate()` can run. No-op on non-main ranks.
    """
    img_ids, eval_imgs = merge(img_ids, eval_imgs, gather_pred_via_filesys)
    if not is_main_process():
        return
    if metrics_dump_dir is not None:
        dumped_file = (
            Path(metrics_dump_dir) / f"coco_eval_img_metrics_{get_rank()}.json"
        )
        logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(eval_imgs.squeeze(), f, default=lambda x: x.tolist())
    img_ids = list(img_ids)
    # If some images were not predicted, we need to create dummy detections for them
    missing_img_ids = set(coco_eval.cocoGt.getImgIds()) - set(img_ids)
    if len(missing_img_ids) > 0:
        print(f"WARNING: {len(missing_img_ids)} images were not predicted!")
        # An empty COCO() acts as a detection set with zero predictions.
        coco_eval.cocoDt = COCO()
        coco_eval.params.imgIds = list(missing_img_ids)
        new_img_ids, new_eval_imgs = evaluate(coco_eval, use_self_evaluate)
        img_ids.extend(new_img_ids)
        eval_imgs = np.concatenate((eval_imgs, new_eval_imgs), axis=2)
    eval_imgs = list(eval_imgs.flatten())
    # Every GT image must now be covered exactly once.
    assert len(img_ids) == len(coco_eval.cocoGt.getImgIds())
    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################
# Copy of COCO prepare, but doesn't convert anntoRLE
def segmentation_prepare(self):
    """
    Prepare ._gts and ._dts for evaluation based on params.
    Copy of COCOeval._prepare without the annotation->RLE mask conversion.
    :return: None
    """
    p = self.params
    if p.useCats:
        gts = self.cocoGt.loadAnns(
            self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
        dts = self.cocoDt.loadAnns(
            self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
    else:
        gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
        dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
    for gt in gts:
        # NOTE(review): the first assignment is immediately overwritten by the
        # next line — the same quirk exists in upstream pycocotools. "ignore"
        # effectively means iscrowd (or zero keypoints for keypoint eval).
        gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
        gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
        if p.iouType == "keypoints":
            gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
    self._gts = defaultdict(list)  # gt for evaluation
    self._dts = defaultdict(list)  # dt for evaluation
    for gt in gts:
        self._gts[gt["image_id"], gt["category_id"]].append(gt)
    for dt in dts:
        self._dts[dt["image_id"], dt["category_id"]].append(dt)
    self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
    self.eval = {}  # accumulated evaluation results
def evaluate(self, use_self_evaluate):
    """
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs

    When `use_self_evaluate` is True, returns (imgIds, evalImgs) with evalImgs
    reshaped to (num_cats, num_area_ranges, num_images). When False, the
    function currently returns None (the C++ evaluation path is disabled —
    see the commented-out code that follows this function).
    :return: None
    """
    # tic = time.time()
    # print('Running per image evaluation...', use_self_evaluate)
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = "segm" if p.useSegm == 1 else "bbox"
        print(
            "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType)
        )
    # print('Evaluate annotation type *{}*'.format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p
    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]
    if p.iouType == "segm" or p.iouType == "bbox":
        computeIoU = self.computeIoU
    elif p.iouType == "keypoints":
        computeIoU = self.computeOks
    # Precompute IoUs for every (image, category) pair.
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds
    }
    maxDet = p.maxDets[-1]
    if use_self_evaluate:
        evalImgs = [
            self.evaluateImg(imgId, catId, areaRng, maxDet)
            for catId in catIds
            for areaRng in p.areaRng
            for imgId in p.imgIds
        ]
        # this is NOT in the pycocotools code, but could be done outside
        evalImgs = np.asarray(evalImgs).reshape(
            len(catIds), len(p.areaRng), len(p.imgIds)
        )
        return p.imgIds, evalImgs
# <<<< Beginning of code differences with original COCO API
# def convert_instances_to_cpp(instances, is_det=False):
# # Convert annotations for a list of instances in an image to a format that's fast
# # to access in C++
# instances_cpp = []
# for instance in instances:
# instance_cpp = _CPP.InstanceAnnotation(
# int(instance["id"]),
# instance["score"] if is_det else instance.get("score", 0.0),
# instance["area"],
# bool(instance.get("iscrowd", 0)),
# bool(instance.get("ignore", 0)),
# )
# instances_cpp.append(instance_cpp)
# return instances_cpp
# # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
# ground_truth_instances = [
# [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
# for imgId in p.imgIds
# ]
# detected_instances = [
# [
# convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
# for catId in p.catIds
# ]
# for imgId in p.imgIds
# ]
# ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
# if not p.useCats:
# # For each image, flatten per-category lists into a single list
# ground_truth_instances = [
# [[o for c in i for o in c]] for i in ground_truth_instances
# ]
# detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
# # Call C++ implementation of self.evaluateImgs()
# _evalImgs_cpp = _CPP.COCOevalEvaluateImages(
# p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
# )
# self._paramsEval = copy.deepcopy(self.params)
# evalImgs = np.asarray(_evalImgs_cpp).reshape(
# len(catIds), len(p.areaRng), len(p.imgIds)
# )
# return p.imgIds, evalImgs
#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
#################################################################
# From pycocotools, but disabled mask->box conversion which is
# pointless
#################################################################
def loadRes(self, resFile):
    """
    Load result file and return a result api object.
    Copy of COCO.loadRes with the pointless mask->bbox conversion disabled.
    :param resFile (str) : file name of result file
    :return: res (obj) : result api object
    """
    res = COCO()
    res.dataset["images"] = [img for img in self.dataset["images"]]
    # Results can come as a JSON file path, a numpy array, or an already
    # loaded list of annotation dicts.
    if type(resFile) == str:
        anns = json.load(open(resFile))
    elif type(resFile) == np.ndarray:
        anns = self.loadNumpyAnnotations(resFile)
    else:
        anns = resFile
    assert type(anns) == list, "results in not an array of objects"
    annsImgIds = [ann["image_id"] for ann in anns]
    assert set(annsImgIds) == (
        set(annsImgIds) & set(self.getImgIds())
    ), "Results do not correspond to current coco set"
    if "caption" in anns[0]:
        # Caption results: keep only images present in both GT and results.
        imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
            [ann["image_id"] for ann in anns]
        )
        res.dataset["images"] = [
            img for img in res.dataset["images"] if img["id"] in imgIds
        ]
        for id, ann in enumerate(anns):
            ann["id"] = id + 1
    elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
        # Box results: synthesize a polygon segmentation and area from the box.
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            bb = ann["bbox"]
            x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
            if "segmentation" not in ann:
                ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
            ann["area"] = bb[2] * bb[3]
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "segmentation" in anns[0]:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # now only support compressed RLE format as segmentation results
            # ann["area"] = mask_util.area(ann["segmentation"])
            # The following lines are disabled because they are pointless
            # if not 'bbox' in ann:
            #     ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "keypoints" in anns[0]:
        # Keypoint results: derive a bbox/area from the keypoint extents.
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            s = ann["keypoints"]
            x = s[0::3]
            y = s[1::3]
            x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
            ann["area"] = (x1 - x0) * (y1 - y0)
            ann["id"] = id + 1
            ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
    res.dataset["annotations"] = anns
    res.createIndex()
    return res
#################################################################
# end of straight copy from pycocotools
#################################################################
#################################################################
# From pycocotools, but added handling of custom area rngs, and returns stat keys
#################################################################
def summarize(self):
    """
    Compute and display summary metrics for evaluation results.
    Note this functin can *only* be applied on the default parameter setting.
    Unlike upstream COCOeval, also returns metric names alongside values and
    handles custom area ranges (see reset()'s normalized area ranges).
    """

    def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
        # Prints one summary line and returns the mean precision/recall over
        # the selected IoU threshold(s), area-range slice and maxDets slice.
        p = self.params
        iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
        titleStr = "Average Precision" if ap == 1 else "Average Recall"
        typeStr = "(AP)" if ap == 1 else "(AR)"
        iouStr = (
            "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
            if iouThr is None
            else "{:0.2f}".format(iouThr)
        )
        aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
        mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
        if ap == 1:
            # dimension of precision: [TxRxKxAxM]
            s = self.eval["precision"]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, :, aind, mind]
        else:
            # dimension of recall: [TxKxAxM]
            s = self.eval["recall"]
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, aind, mind]
        # -1 marks empty cells; average only over populated ones.
        if len(s[s > -1]) == 0:
            mean_s = -1
        else:
            mean_s = np.mean(s[s > -1])
        print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
        return mean_s

    def _summarizeDets():
        # Builds (keys, stats): AP overall / at 0.5 / at 0.75, AP per
        # area-range label, then AR at each maxDets setting and per area.
        nb_results = 6 + (len(self.params.areaRng) - 1) * 2
        assert len(self.params.areaRng) == len(self.params.areaRngLbl)
        stats = np.zeros((nb_results,))
        keys = ["AP", "AP_50", "AP_75"]
        stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
        stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
        stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
        cur_id = 3
        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(1, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AP_{area}")
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[0])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[1])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[2])
        cur_id += 1
        # NOTE(review): these three values are AR at the three maxDets
        # settings, though the key names suggest IoU thresholds — verify
        # downstream consumers interpret "AR_50"/"AR_75" accordingly.
        keys += ["AR", "AR_50", "AR_75"]
        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(0, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AR_{area}")
        assert len(stats) == len(keys)
        return keys, stats

    if not self.eval:
        raise Exception("Please run accumulate() first")
    # NOTE(review): unlike upstream COCOeval (stats is an ndarray), here
    # self.stats is a (keys, values) tuple — callers must unpack accordingly.
    self.stats = _summarizeDets()
#################################################################
# end of straight copy from pycocotools
#################################################################
#################################################################
# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py
# with slight adjustments
#################################################################
def accumulate(self, use_self_eval=False):
    """
    Accumulate per image evaluation results and store the result in self.eval. Does not
    support changing parameter settings from those used by self.evaluate()
    """
    if use_self_eval:
        # NOTE(review): delegates to the instance's own `accumulate`. If this
        # function is ever installed on the object as its `accumulate` method,
        # this call recurses infinitely — verify it always resolves to the
        # original COCOeval.accumulate.
        self.accumulate()
        return
    # The C++ accumulation path is disabled (see the commented-out code that
    # follows), so when use_self_eval is False this is currently a no-op.
# CPP code is disabled
# self.eval = _CPP.COCOevalAccumulate(self.params, self.evalImgs)
# # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
# self.eval["recall"] = np.array(self.eval["recall"]).reshape(
# self.eval["counts"][:1] + self.eval["counts"][2:]
# )
# # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
# # num_area_ranges X num_max_detections
# self.eval["precision"] = np.array(self.eval["precision"]).reshape(
# self.eval["counts"]
# )
# self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])

View File

@@ -0,0 +1,181 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
This evaluator is meant for regular COCO mAP evaluation, for example on the COCO val set.
For Category mAP, we need the model to make predictions for all the categories on every single image.
In general, since the number of classes can be big, and the API model makes predictions individually for each pair (image, class),
we may need to split the inference process for a given image in several chunks.
"""
import logging
from collections import defaultdict
import torch
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from sam3.train.utils.distributed import is_main_process
try:
from tidecv import datasets, TIDE
HAS_TIDE = True
except ImportError:
HAS_TIDE = False
print("WARNING: TIDE not installed. Detailed analysis will not be available.")
# the COCO detection metrics (https://github.com/cocodataset/cocoapi/blob/8c9bcc3cf640524c4c20a9c40e89cb6a2f2fa0e9/PythonAPI/pycocotools/cocoeval.py#L460-L471)
COCO_METRICS = [
"AP",
"AP_50",
"AP_75",
"AP_small",
"AP_medium",
"AP_large",
"AR_maxDets@1",
"AR_maxDets@10",
"AR_maxDets@100",
"AR_small",
"AR_medium",
"AR_large",
]
def convert_to_xywh(boxes):
    """Convert bounding boxes from xyxy format to xywh format."""
    xmin, ymin, xmax, ymax = boxes.unbind(-1)
    width = xmax - xmin
    height = ymax - ymin
    return torch.stack((xmin, ymin, width, height), dim=-1)
class HeapElement:
    """Utility wrapper ordering heap entries by their "score" field."""

    def __init__(self, val):
        self.val = val

    def __lt__(self, other):
        own_score = self.val["score"]
        other_score = other.val["score"]
        return own_score < other_score
class COCOevalCustom(COCOeval):
    """
    This is a slightly modified version of the original COCO API with added support for positive split evaluation.
    """

    def __init__(
        self, cocoGt=None, cocoDt=None, iouType="segm", dt_only_positive=False
    ):
        # dt_only_positive=True keeps only detections whose category also has
        # at least one ground-truth instance in the same image.
        super().__init__(cocoGt, cocoDt, iouType)
        self.dt_only_positive = dt_only_positive

    def _prepare(self):
        """
        Prepare ._gts and ._dts for evaluation based on params.
        Identical to COCOeval._prepare except for the positive-split
        detection filtering marked below.
        :return: None
        """

        def _toMask(anns, coco):
            # modify ann['segmentation'] by reference
            for ann in anns:
                rle = coco.annToRLE(ann)
                ann["segmentation"] = rle

        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
        # convert ground truth to mask if iouType == 'segm'
        if p.iouType == "segm":
            _toMask(gts, self.cocoGt)
            _toMask(dts, self.cocoDt)
        # set ignore flag
        for gt in gts:
            # NOTE(review): the first assignment is immediately overwritten,
            # matching the same quirk in upstream pycocotools.
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        _gts_cat_ids = defaultdict(set)  # gt for evaluation on positive split
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
            _gts_cat_ids[gt["image_id"]].add(gt["category_id"])
        #### BEGIN MODIFICATION ####
        # Positive split: drop detections for categories absent from the
        # image's ground truth.
        for dt in dts:
            if (
                self.dt_only_positive
                and dt["category_id"] not in _gts_cat_ids[dt["image_id"]]
            ):
                continue
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        #### END MODIFICATION ####
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results
class CocoEvaluatorOfflineWithPredFileEvaluators:
    """Offline COCO evaluator that scores a dumped COCO-format prediction file.

    Loads ground truth from `gt_path`, runs COCOevalCustom on the prediction
    file, and optionally runs TIDE error analysis when the `tidecv` package
    is available.
    """

    def __init__(
        self,
        gt_path,
        tide: bool = True,
        iou_type: str = "bbox",
        positive_split=False,
    ):
        # positive_split: evaluate detections only for categories that have
        # ground truth in the same image (COCOevalCustom.dt_only_positive).
        self.gt_path = gt_path
        self.tide_enabled = HAS_TIDE and tide
        self.positive_split = positive_split
        self.iou_type = iou_type

    def evaluate(self, dumped_file):
        """Score `dumped_file` and return a flat metrics dict.

        Only the main process evaluates; other ranks return an empty dict.
        """
        if not is_main_process():
            return {}
        logging.info("OfflineCoco evaluator: Loading groundtruth")
        self.gt = COCO(self.gt_path)
        # Creating the result file
        logging.info("Coco evaluator: Creating the result file")
        cocoDt = self.gt.loadRes(str(dumped_file))
        # Run the evaluation
        logging.info("Coco evaluator: Running evaluation")
        coco_eval = COCOevalCustom(
            self.gt, cocoDt, iouType=self.iou_type, dt_only_positive=self.positive_split
        )
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        outs = {}
        # Map the 12 standard COCOeval stats to their metric names.
        for i, value in enumerate(coco_eval.stats):
            outs[f"coco_eval_{self.iou_type}_{COCO_METRICS[i]}"] = value
        if self.tide_enabled:
            logging.info("Coco evaluator: Loading TIDE")
            self.tide_gt = datasets.COCO(self.gt_path)
            self.tide = TIDE(mode="mask" if self.iou_type == "segm" else "bbox")
            # Run TIDE
            logging.info("Coco evaluator: Running TIDE")
            self.tide.evaluate(
                self.tide_gt, datasets.COCOResult(str(dumped_file)), name="coco_eval"
            )
            self.tide.summarize()
            for k, v in self.tide.get_main_errors()["coco_eval"].items():
                outs[f"coco_eval_{self.iou_type}_TIDE_{k}"] = v
            for k, v in self.tide.get_special_errors()["coco_eval"].items():
                outs[f"coco_eval_{self.iou_type}_TIDE_{k}"] = v
        return outs

230
sam3/eval/coco_reindex.py Normal file
View File

@@ -0,0 +1,230 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
Self-contained COCO JSON re-indexing function that creates temporary files.
"""
import json
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
def reindex_coco_to_temp(input_json_path: str) -> Optional[str]:
    """
    Convert 0-indexed COCO JSON file to 1-indexed and save to temporary location.

    Args:
        input_json_path: Path to the input COCO JSON file

    Returns:
        Path to a JSON file in a fresh temporary directory. If the input was
        already 1-indexed, the file is an unmodified copy; otherwise all
        0-indexed image, category and annotation ids (and the references
        between them) are shifted by +1.

    Raises:
        FileNotFoundError: If input file doesn't exist
        json.JSONDecodeError: If input file is not valid JSON
        ValueError: If input file is not a valid COCO format
    """

    def is_coco_json(data: Dict[str, Any]) -> bool:
        """Check if data appears to be a COCO format file."""
        if not isinstance(data, dict):
            return False
        # A COCO file should have at least one of these keys
        coco_keys = {"images", "annotations", "categories"}
        return any(key in data for key in coco_keys)

    def check_zero_indexed(data: Dict[str, Any]) -> Tuple[bool, bool, bool]:
        """
        Check if annotations, images, or categories start from index 0.

        Returns:
            Tuple of (annotations_zero_indexed, images_zero_indexed, categories_zero_indexed)
        """
        annotations = data.get("annotations", [])
        annotations_zero = bool(annotations) and any(
            ann.get("id", -1) == 0 for ann in annotations
        )
        images = data.get("images", [])
        images_zero = bool(images) and any(img.get("id", -1) == 0 for img in images)
        categories = data.get("categories", [])
        categories_zero = bool(categories) and any(
            cat.get("id", -1) == 0 for cat in categories
        )
        return annotations_zero, images_zero, categories_zero

    def reindex_coco_data(data: Dict[str, Any]) -> Dict[str, Any]:
        """Convert 0-indexed COCO data to 1-indexed (mutates nested dicts in place)."""
        modified_data = data.copy()
        annotations_zero, images_zero, categories_zero = check_zero_indexed(data)
        # ID mappings keep annotation references consistent with the shifted ids.
        image_id_mapping = {}
        category_id_mapping = {}
        # Process images first (since annotations reference image IDs)
        if images_zero and "images" in modified_data:
            for img in modified_data["images"]:
                new_id = img["id"] + 1
                image_id_mapping[img["id"]] = new_id
                img["id"] = new_id
        # Process categories (since annotations reference category IDs)
        if categories_zero and "categories" in modified_data:
            for cat in modified_data["categories"]:
                new_id = cat["id"] + 1
                category_id_mapping[cat["id"]] = new_id
                cat["id"] = new_id
        # Process annotations, remapping their references where needed.
        if "annotations" in modified_data:
            for ann in modified_data["annotations"]:
                if annotations_zero:
                    ann["id"] = ann["id"] + 1
                if images_zero and ann.get("image_id") is not None:
                    old_image_id = ann["image_id"]
                    if old_image_id in image_id_mapping:
                        ann["image_id"] = image_id_mapping[old_image_id]
                if categories_zero and ann.get("category_id") is not None:
                    old_category_id = ann["category_id"]
                    if old_category_id in category_id_mapping:
                        ann["category_id"] = category_id_mapping[old_category_id]
        return modified_data

    def write_to_temp(payload: Dict[str, Any]) -> str:
        """Write payload into a fresh temp dir as <stem>_1_indexed<suffix>."""
        input_path = Path(input_json_path)
        temp_dir = tempfile.mkdtemp()
        temp_filename = f"{input_path.stem}_1_indexed{input_path.suffix}"
        temp_path = os.path.join(temp_dir, temp_filename)
        with open(temp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        return temp_path

    # Validate input path
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input file not found: {input_json_path}")

    # Load and validate JSON data
    try:
        with open(input_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # FIX: json.JSONDecodeError requires (msg, doc, pos); the previous
        # code passed a single formatted string, which made the `raise`
        # itself fail with TypeError. Re-raise with the path in the message.
        raise json.JSONDecodeError(
            f"Invalid JSON in {input_json_path}: {e.msg}", e.doc, e.pos
        ) from None

    # Validate COCO format
    if not is_coco_json(data):
        raise ValueError(
            f"File does not appear to be in COCO format: {input_json_path}"
        )

    # Check if reindexing is needed
    annotations_zero, images_zero, categories_zero = check_zero_indexed(data)
    if not (annotations_zero or images_zero or categories_zero):
        # No conversion needed - just copy to temp location
        return write_to_temp(data)

    # Perform reindexing and write the result
    return write_to_temp(reindex_coco_data(data))
# Example usage and test function
def test_reindex_function():
"""Test the reindex function with a sample COCO file."""
# Create a test COCO file
test_data = {
"info": {"description": "Test COCO dataset", "version": "1.0", "year": 2023},
"images": [
{"id": 0, "width": 640, "height": 480, "file_name": "test1.jpg"},
{"id": 1, "width": 640, "height": 480, "file_name": "test2.jpg"},
],
"categories": [
{"id": 0, "name": "person", "supercategory": "person"},
{"id": 1, "name": "car", "supercategory": "vehicle"},
],
"annotations": [
{
"id": 0,
"image_id": 0,
"category_id": 0,
"bbox": [100, 100, 50, 75],
"area": 3750,
"iscrowd": 0,
},
{
"id": 1,
"image_id": 1,
"category_id": 1,
"bbox": [200, 150, 120, 80],
"area": 9600,
"iscrowd": 0,
},
],
}
# Create temporary test file
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(test_data, f, indent=2)
test_file_path = f.name
try:
# Test the function
result_path = reindex_coco_to_temp(test_file_path)
print(f"Original file: {test_file_path}")
print(f"Converted file: {result_path}")
# Load and display the result
with open(result_path, "r") as f:
result_data = json.load(f)
print("\nConverted data sample:")
print(f"First image ID: {result_data['images'][0]['id']}")
print(f"First category ID: {result_data['categories'][0]['id']}")
print(f"First annotation ID: {result_data['annotations'][0]['id']}")
print(f"First annotation image_id: {result_data['annotations'][0]['image_id']}")
print(
f"First annotation category_id: {result_data['annotations'][0]['category_id']}"
)
# Clean up
os.unlink(result_path)
os.rmdir(os.path.dirname(result_path))
finally:
# Clean up test file
os.unlink(test_file_path)
if __name__ == "__main__":
test_reindex_function()

352
sam3/eval/coco_writer.py Normal file
View File

@@ -0,0 +1,352 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
COCO prediction dumper for distributed training.
Handles collection and dumping of COCO-format predictions from models.
Supports distributed processing with multiple GPUs/processes.
"""
import copy
import gc
import heapq
import json
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Optional
import pycocotools.mask as mask_utils
import torch
from iopath.common.file_io import g_pathmgr
from sam3.eval.coco_eval_offline import convert_to_xywh
from sam3.train.masks_ops import rle_encode
from sam3.train.utils.distributed import (
all_gather,
gather_to_rank_0_via_filesys,
get_rank,
is_main_process,
)
### Helper functions and classes
class HeapElement:
"""Utility class to make a heap with a custom comparator based on score."""
def __init__(self, val):
self.val = val
def __lt__(self, other):
return self.val["score"] < other.val["score"]
class PredictionDumper:
"""
Handles collection and dumping of COCO-format predictions from a model.
This class processes model outputs through a postprocessor, converts them to COCO format,
and saves them to disk. It supports distributed processing with multiple GPUs/processes.
"""
def __init__(
    self,
    dump_dir: str,
    postprocessor,
    maxdets: int,
    iou_type: str,
    gather_pred_via_filesys: bool = False,
    merge_predictions: bool = False,
    pred_file_evaluators: Optional[Any] = None,
):
    """
    Initialize the PredictionDumper.

    Args:
        dump_dir: Directory to dump predictions.
        postprocessor: Module to convert the model's output into COCO format.
        maxdets: Maximum number of detections per image.
        iou_type: IoU type to evaluate. Can include "bbox", "segm"
        gather_pred_via_filesys: If True, use the filesystem for collective gathers across
            processes (requires a shared filesystem). Otherwise, use torch collective ops.
        merge_predictions: If True, merge predictions from all processes and dump to a single file.
        pred_file_evaluators: optional evaluators to run on the merged
            prediction file; require merge_predictions=True and a dump_dir.
    """
    self.iou_type = iou_type
    self.maxdets = maxdets
    self.dump_dir = dump_dir
    self.postprocessor = postprocessor
    self.gather_pred_via_filesys = gather_pred_via_filesys
    self.merge_predictions = merge_predictions
    self.pred_file_evaluators = pred_file_evaluators
    if self.pred_file_evaluators is not None:
        # File-based evaluators consume the single merged prediction file,
        # so merging (and a directory to write it to) is mandatory.
        assert (
            merge_predictions
        ), "merge_predictions must be True if pred_file_evaluators are provided"
        assert self.dump_dir is not None, "dump_dir must be provided"
    if is_main_process():
        # Only rank 0 creates the directory (assumed shared across ranks).
        os.makedirs(self.dump_dir, exist_ok=True)
        logging.info(f"Created prediction dump directory: {self.dump_dir}")
    # Initialize state
    self.reset()
def update(self, *args, **kwargs):
    """Post-process one batch of model outputs and append them to the dump.

    Args:
        *args, **kwargs: Forwarded to postprocessor.process_results().
    """
    processed = self.postprocessor.process_results(*args, **kwargs)
    coco_formatted = self.prepare(processed, self.iou_type)
    self._dump(coco_formatted)
def _dump(self, results):
"""
Add results to the dump list with precision rounding.
Args:
results: List of prediction dictionaries in COCO format.
"""
dumped_results = copy.deepcopy(results)
for r in dumped_results:
if "bbox" in r:
r["bbox"] = [round(coord, 5) for coord in r["bbox"]]
r["score"] = round(r["score"], 5)
self.dump.extend(dumped_results)
def synchronize_between_processes(self):
    """
    Synchronize predictions across all processes and save to disk.

    If gather_pred_via_filesys is True, uses filesystem for gathering.
    Otherwise, uses torch distributed collective operations.

    When ``merge_predictions`` is False, every rank writes its own JSON file
    (suffixed with the rank id). When it is True, predictions are gathered
    and merged and only the main process writes a single JSON file.

    Returns:
        Path of the JSON file this rank is responsible for (on non-main
        ranks in merged mode, the path is returned even though only the
        main process writes it).
    """
    logging.info("Prediction Dumper: Synchronizing between processes")
    if not self.merge_predictions:
        dumped_file = (
            Path(self.dump_dir)
            / f"coco_predictions_{self.iou_type}_{get_rank()}.json"
        )
        logging.info(
            f"Prediction Dumper: Dumping local predictions to {dumped_file}"
        )
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(self.dump, f)
    else:
        self.dump = self.gather_and_merge_predictions()
        dumped_file = Path(self.dump_dir) / f"coco_predictions_{self.iou_type}.json"
        if is_main_process():
            logging.info(
                f"Prediction Dumper: Dumping merged predictions to {dumped_file}"
            )
            with g_pathmgr.open(str(dumped_file), "w") as f:
                json.dump(self.dump, f)
    # Release accumulated predictions once they are persisted.
    self.reset()
    return dumped_file
def gather_and_merge_predictions(self):
    """
    Gather predictions from all processes and merge them, keeping top predictions per image.

    This method collects predictions from all processes, then keeps only the top maxdets
    predictions per image based on score. It also deduplicates predictions by
    (image_id, category_id): a pair already seen in a *previous* rank's dump is
    skipped, while duplicates within the same rank's dump are all kept.

    Returns:
        List of merged prediction dictionaries.
    """
    logging.info("Prediction Dumper: Gathering predictions from all processes")
    gc.collect()
    if self.gather_pred_via_filesys:
        dump = gather_to_rank_0_via_filesys(self.dump)
    else:
        dump = all_gather(self.dump, force_cpu=True)
    # Combine predictions, keeping only top maxdets per image
    preds_by_image = defaultdict(list)
    seen_img_cat = set()
    for cur_dump in dump:
        cur_seen_img_cat = set()
        for p in cur_dump:
            image_id = p["image_id"]
            cat_id = p["category_id"]
            # Skip if we've already seen this image/category pair in a previous dump
            if (image_id, cat_id) in seen_img_cat:
                continue
            cur_seen_img_cat.add((image_id, cat_id))
            # Use a min-heap to keep top predictions
            # (HeapElement presumably orders by score — TODO confirm)
            if len(preds_by_image[image_id]) < self.maxdets:
                heapq.heappush(preds_by_image[image_id], HeapElement(p))
            else:
                heapq.heappushpop(preds_by_image[image_id], HeapElement(p))
        # Only mark pairs as seen after finishing this rank's dump, so that
        # duplicates within a single rank are not dropped.
        seen_img_cat.update(cur_seen_img_cat)
    # Flatten the heap elements back to a list
    merged_dump = sum(
        [[h.val for h in cur_preds] for cur_preds in preds_by_image.values()], []
    )
    return merged_dump
def compute_synced(self):
    """
    Synchronize predictions across processes, then run any file evaluators.

    Returns:
        Dict of metrics produced by ``pred_file_evaluators`` on the dumped
        prediction file. Non-main processes — and runs that produce no
        metrics — return the placeholder ``{"": 0.0}``.
    """
    dumped_file = self.synchronize_between_processes()
    if not is_main_process():
        return {"": 0.0}
    metrics = {}
    if self.pred_file_evaluators is not None:
        for evaluator in self.pred_file_evaluators:
            metrics.update(evaluator.evaluate(dumped_file))
    return metrics if metrics else {"": 0.0}
def compute(self):
    """
    Compute metrics without cross-process synchronization.

    This dumper does not compute anything locally (see ``compute_synced``).

    Returns:
        Placeholder metric dictionary.
    """
    return {"": 0.0}
def reset(self):
    """Clear accumulated predictions so a new evaluation round can begin."""
    self.dump = []
def prepare(self, predictions, iou_type):
"""
Route predictions to the appropriate preparation method based on iou_type.
Args:
predictions: Dictionary mapping image IDs to prediction dictionaries.
iou_type: Type of evaluation ("bbox", "segm").
Returns:
List of COCO-format prediction dictionaries.
"""
if iou_type == "bbox":
return self.prepare_for_coco_detection(predictions)
elif iou_type == "segm":
return self.prepare_for_coco_segmentation(predictions)
else:
raise ValueError(f"Unknown iou type: {iou_type}")
def prepare_for_coco_detection(self, predictions):
    """
    Convert per-image detections to COCO detection-result format.

    Args:
        predictions: Dictionary mapping image IDs to prediction dictionaries
            containing "boxes", "scores", and "labels".

    Returns:
        List of COCO-format detection dictionaries (xywh boxes).
    """
    coco_results = []
    for image_id, pred in predictions.items():
        if len(pred) == 0:
            continue
        xywh_boxes = convert_to_xywh(pred["boxes"]).tolist()
        scores = pred["scores"].tolist()
        labels = pred["labels"].tolist()
        for idx, box in enumerate(xywh_boxes):
            coco_results.append(
                {
                    "image_id": image_id,
                    "category_id": labels[idx],
                    "bbox": box,
                    "score": scores[idx],
                }
            )
    return coco_results
@torch.no_grad()
def prepare_for_coco_segmentation(self, predictions):
    """
    Convert predictions to COCO segmentation format.

    Args:
        predictions: Dictionary mapping image IDs to prediction dictionaries
            containing "masks" or "masks_rle", "scores", and "labels".
            Optionally includes "boundaries" and "dilated_boundaries".

    Returns:
        List of COCO-format segmentation dictionaries with RLE-encoded masks.
        "area" is normalized by the image size (fraction of pixels).
    """
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        boxes = None
        if "boxes" in prediction:
            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            assert len(boxes) == len(scores)
        if "masks_rle" in prediction:
            # Masks already RLE-encoded: derive normalized areas from the RLEs.
            rles = prediction["masks_rle"]
            areas = []
            for rle in rles:
                cur_area = mask_utils.area(rle)
                h, w = rle["size"]
                areas.append(cur_area / (h * w))
        else:
            # Dense masks: binarize at 0.5, compute normalized areas, encode.
            masks = prediction["masks"]
            masks = masks > 0.5
            h, w = masks.shape[-2:]
            areas = masks.flatten(1).sum(1) / (h * w)
            areas = areas.tolist()
            rles = rle_encode(masks.squeeze(1))
            # Memory cleanup — dense masks can dominate memory use.
            del masks
            del prediction["masks"]
        assert len(areas) == len(rles) == len(scores)
        for k, rle in enumerate(rles):
            payload = {
                "image_id": original_id,
                "category_id": labels[k],
                "segmentation": rle,
                "score": scores[k],
                "area": areas[k],
            }
            if boxes is not None:
                payload["bbox"] = boxes[k]
            coco_results.append(payload)
    return coco_results

View File

@@ -0,0 +1,211 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import json
import os
from collections import defaultdict
from tqdm import tqdm
def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS dataset to COCO-style video instance segmentation format.

    Args:
        ann_json (str): Path to YouTube VIS annotation JSON file.
        save_path (str, optional): Path to save the converted COCO-style JSON.
            If None, the converted dict is returned without writing to disk.

    Returns:
        dict: The converted COCO-style annotation dictionary.
    """
    # Initialize COCO structure
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": [],
        "licenses": [],
    }
    # Load original annotations (context manager: don't leak the file handle)
    with open(ann_json) as f:
        official_anns = json.load(f)
    VIS["categories"] = official_anns["categories"]  # Direct copy categories
    # Initialize 1-based COCO id counters
    records = dict(img_id=1, ann_id=1)
    # Create video-to-annotations mapping
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)
    # Each YouTube-VIS annotation is one object track
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]
    # Process videos
    for video_info in tqdm(official_anns["videos"]):
        # Create video entry
        video = {
            "id": video_info["id"],
            "name": os.path.dirname(video_info["file_names"][0]),
            "width": video_info["width"],
            "height": video_info["height"],
            "length": video_info["length"],
            "neg_category_ids": [],
            "not_exhaustive_category_ids": [],
        }
        VIS["videos"].append(video)
        # Process frames
        num_frames = len(video_info["file_names"])
        for frame_idx in range(num_frames):
            # Create image entry (one per frame)
            image = {
                "id": records["img_id"],
                "video_id": video_info["id"],
                "file_name": video_info["file_names"][frame_idx],
                "width": video_info["width"],
                "height": video_info["height"],
                "frame_index": frame_idx,
                "frame_id": frame_idx,
            }
            VIS["images"].append(image)
            # Process annotations for this frame
            if video_info["id"] in vid_to_anns:
                for ann in vid_to_anns[video_info["id"]]:
                    bbox = ann["bboxes"][frame_idx]
                    if bbox is None:
                        # Object is not visible in this frame
                        continue
                    # Create annotation entry
                    annotation = {
                        "id": records["ann_id"],
                        "video_id": video_info["id"],
                        "image_id": records["img_id"],
                        "track_id": ann["id"],
                        "category_id": ann["category_id"],
                        "bbox": bbox,
                        "area": ann["areas"][frame_idx],
                        "segmentation": ann["segmentations"][frame_idx],
                        "iscrowd": ann["iscrowd"],
                    }
                    VIS["annotations"].append(annotation)
                    records["ann_id"] += 1
            records["img_id"] += 1
    # Print summary
    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")
    if save_path is None:
        return VIS
    # Save output. Note: os.makedirs("") raises FileNotFoundError, so only
    # create the directory when save_path actually has a directory component.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(VIS, f)
    return VIS
def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """
    Convert YouTubeVIS predictions to COCO format with video_id preservation.

    Args:
        youtubevis_pred_path: Path to YouTubeVIS prediction JSON
        converted_dataset_path: Path to converted COCO dataset JSON
        output_path: Path to save COCO format predictions
    """
    # Load YouTubeVIS predictions
    with open(youtubevis_pred_path) as f:
        predictions = json.load(f)
    # Load the converted dataset so predictions can be mapped to image ids
    with open(converted_dataset_path) as f:
        dataset = json.load(f)
    # (video_id, frame_index) -> image_id
    frame_to_image_id = {
        (img["video_id"], img["frame_index"]): img["id"]
        for img in dataset["images"]
    }
    results = []
    next_track_id = 1  # each prediction becomes its own track
    for pred in tqdm(predictions):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        score = pred["score"]
        # Segmentations/areas are optional; pad with None to align with bboxes
        segmentations = pred.get("segmentations", [])
        areas = pred.get("areas", [])
        if not segmentations:
            segmentations = [None] * len(bboxes)
        if not areas:
            areas = [None] * len(bboxes)
        track_id = next_track_id
        next_track_id += 1
        for frame_idx, (bbox, segmentation, pred_area) in enumerate(
            zip(bboxes, segmentations, areas)
        ):
            # Frames where the object is absent are encoded as None or all-zero boxes
            if bbox is None or all(coord == 0 for coord in bbox):
                continue
            image_id = frame_to_image_id.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
                )
            x, y, w, h = bbox
            # Prefer the predicted area when valid, otherwise fall back to box area
            if pred_area is not None and pred_area > 0:
                area = pred_area
            else:
                area = w * h
            entry = {
                "image_id": int(image_id),
                "video_id": video_id,  # kept so video-level evaluators can group frames
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }
            if segmentation is not None:
                entry["segmentation"] = segmentation
            results.append(entry)
    # Save output
    with open(output_path, "w") as f:
        json.dump(results, f)
    print(f"Converted {len(results)} predictions to COCO format with video_id")

658
sam3/eval/demo_eval.py Normal file
View File

@@ -0,0 +1,658 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""
This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
This means that the model's predictions are thresholded and evaluated as "hard" predictions.
"""
import logging
from typing import Optional
import numpy as np
import pycocotools.mask as maskUtils
from pycocotools.cocoeval import COCOeval
from sam3.eval.coco_eval import CocoEvaluator
from sam3.train.masks_ops import compute_F_measure
from sam3.train.utils.distributed import is_main_process
from scipy.optimize import linear_sum_assignment
class DemoEval(COCOeval):
    """
    This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
    This means that the model's predictions are thresholded and evaluated as "hard" predictions.
    """

    def __init__(
        self,
        coco_gt=None,
        coco_dt=None,
        iouType="bbox",
        threshold=0.5,
        compute_JnF=False,
    ):
        """
        Args:
            coco_gt (COCO): ground truth COCO API
            coco_dt (COCO): detections COCO API
            iouType (str): type of IoU to evaluate ("bbox" or "segm")
            threshold (float): score threshold; predictions below it are dropped
            compute_JnF (bool): if True, also compute region/boundary (J&F)
                measures on matched prediction/GT pairs
        """
        super().__init__(coco_gt, coco_dt, iouType)
        self.threshold = threshold
        # Class-agnostic evaluation: all categories are pooled together.
        self.params.useCats = False
        self.params.areaRng = [[0**2, 1e5**2]]
        self.params.areaRngLbl = ["all"]
        # Effectively unbounded detections per image.
        self.params.maxDets = [100000]
        self.compute_JnF = compute_JnF

    def computeIoU(self, imgId, catId):
        # Same as the original COCOeval.computeIoU, but without sorting
        # detections by score (matching is done via assignment downstream).
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []
        if p.iouType == "segm":
            g = [g["segmentation"] for g in gt]
            d = [d["segmentation"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bbox"] for g in gt]
            d = [d["bbox"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")
        # compute iou between each dt and gt region
        iscrowd = [int(o["iscrowd"]) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious

    def evaluateImg(self, imgId, catId, aRng, maxDet):
        """
        perform evaluation for single category and image
        :return: dict (single image results)
        """
        p = self.params
        assert not p.useCats, "This evaluator does not support per-category evaluation."
        assert catId == -1
        # Keep only non-ignored GTs and above-threshold detections.
        all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
        keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
        gt = [g for g in all_gts if not g["ignore"]]
        all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
        dt = [d for d in all_dts if d["score"] >= self.threshold]
        if len(gt) == 0 and len(dt) == 0:
            # This is a "true negative" case, where there are no GTs and no predictions
            # The box-level metrics are ill-defined, so we don't add them to this dict
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 1,
                "IL_FP": 0,
                "IL_FN": 0,
                "IL_perfect_neg": np.ones((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }
        if len(gt) > 0 and len(dt) == 0:
            # This is a "false negative" case, where there are GTs but no predictions
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 0,
                "IL_FP": 0,
                "IL_FN": 1,
                "TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
                "local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "IL_perfect_pos": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }
        # Load pre-computed ious
        ious = self.ious[(imgId, catId)]
        # compute matching (one-to-one via Hungarian assignment, maximizing IoU)
        if len(ious) == 0:
            ious = np.zeros((len(dt), len(gt)))
        else:
            ious = ious[keep_dt, :][:, keep_gt]
            assert ious.shape == (len(dt), len(gt))
        matched_dt, matched_gt = linear_sum_assignment(-ious)
        match_scores = ious[matched_dt, matched_gt]
        if self.compute_JnF and len(match_scores) > 0:
            # J = mean IoU over matched pairs; F = mean boundary F-measure.
            j_score = match_scores.mean()
            f_measure = 0
            for dt_id, gt_id in zip(matched_dt, matched_gt):
                f_measure += compute_F_measure(
                    gt_boundary_rle=gt[gt_id]["boundary"],
                    gt_dilated_boundary_rle=gt[gt_id]["dilated_boundary"],
                    dt_boundary_rle=dt[dt_id]["boundary"],
                    dt_dilated_boundary_rle=dt[dt_id]["dilated_boundary"],
                )
            f_measure /= len(match_scores) + 1e-9
            JnF = (j_score + f_measure) * 0.5
        else:
            # Sentinel value meaning "not computed".
            j_score = f_measure = JnF = -1
        # Count TP/FP/FN at every IoU threshold.
        TPs, FPs, FNs = [], [], []
        IL_perfect = []
        for thresh in p.iouThrs:
            TP = (match_scores >= thresh).sum()
            FP = len(dt) - TP
            FN = len(gt) - TP
            assert (
                FP >= 0 and FN >= 0
            ), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
            TPs.append(TP)
            FPs.append(FP)
            FNs.append(FN)
            if FP == FN and FP == 0:
                IL_perfect.append(1)
            else:
                IL_perfect.append(0)
        TPs = np.array(TPs, dtype=np.int64)
        FPs = np.array(FPs, dtype=np.int64)
        FNs = np.array(FNs, dtype=np.int64)
        IL_perfect = np.array(IL_perfect, dtype=np.int64)
        # compute precision recall and F1 (small epsilons avoid div-by-zero)
        precision = TPs / (TPs + FPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        result = {
            "image_id": imgId,
            "TPs": TPs,
            "FPs": FPs,
            "FNs": FNs,
            "local_F1s": F1,
            "IL_TP": (len(gt) > 0) and (len(dt) > 0),
            "IL_FP": (len(gt) == 0) and (len(dt) > 0),
            "IL_TN": (len(gt) == 0) and (len(dt) == 0),
            "IL_FN": (len(gt) > 0) and (len(dt) == 0),
            ("IL_perfect_pos" if len(gt) > 0 else "IL_perfect_neg"): IL_perfect,
            "F": f_measure,
            "J": j_score,
            "J&F": JnF,
            "num_dt": len(dt),
        }
        if len(gt) > 0 and len(dt) > 0:
            result["local_positive_F1s"] = F1
        return result

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        """
        if not self.evalImgs:
            print("Please run evaluate() first")
        # allows input customized parameters
        if p is None:
            p = self.params
        setImgIds = set(p.imgIds)
        # TPs, FPs, FNs — one entry per IoU threshold
        TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        # FPs counted only on images with at least one GT ("positive-micro")
        pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)
        # Image level metrics
        IL_TPs = 0
        IL_FPs = 0
        IL_TNs = 0
        IL_FNs = 0
        IL_perfects_neg = np.zeros((len(p.iouThrs),), dtype=np.int64)
        IL_perfects_pos = np.zeros((len(p.iouThrs),), dtype=np.int64)
        # JnF metric
        total_J = 0
        total_F = 0
        total_JnF = 0
        valid_img_count = 0
        total_pos_count = 0
        total_neg_count = 0
        valid_J_count = 0
        valid_F1_count = 0
        valid_F1_count_w0dt = 0
        for res in self.evalImgs:
            if res["image_id"] not in setImgIds:
                continue
            IL_TPs += res["IL_TP"]
            IL_FPs += res["IL_FP"]
            IL_TNs += res["IL_TN"]
            IL_FNs += res["IL_FN"]
            if "IL_perfect_neg" in res:
                IL_perfects_neg += res["IL_perfect_neg"]
                total_neg_count += 1
            else:
                assert "IL_perfect_pos" in res
                IL_perfects_pos += res["IL_perfect_pos"]
                total_pos_count += 1
            if "TPs" not in res:
                # True-negative images carry no box-level counts.
                continue
            TPs += res["TPs"]
            FPs += res["FPs"]
            FNs += res["FNs"]
            valid_img_count += 1
            if "local_positive_F1s" in res:
                local_F1s += res["local_positive_F1s"]
                pmFPs += res["FPs"]
                valid_F1_count_w0dt += 1
                if res["num_dt"] > 0:
                    valid_F1_count += 1
            if "J" in res and res["J"] > -1e-9:
                total_J += res["J"]
                total_F += res["F"]
                total_JnF += res["J&F"]
                valid_J_count += 1
        # compute precision recall and F1
        precision = TPs / (TPs + FPs + 1e-4)
        positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        positive_micro_F1 = (
            2
            * positive_micro_precision
            * recall
            / (positive_micro_precision + recall + 1e-4)
        )
        # Image-level (binary "any object present?") metrics.
        IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
        IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
        IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
        IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
        IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
            (
                float(IL_TPs + IL_FPs)
                * float(IL_TPs + IL_FNs)
                * float(IL_TNs + IL_FPs)
                * float(IL_TNs + IL_FNs)
            )
            ** 0.5
            + 1e-6
        )
        IL_perfect_pos = IL_perfects_pos / (total_pos_count + 1e-9)
        IL_perfect_neg = IL_perfects_neg / (total_neg_count + 1e-9)
        total_J = total_J / (valid_J_count + 1e-9)
        total_F = total_F / (valid_J_count + 1e-9)
        total_JnF = total_JnF / (valid_J_count + 1e-9)
        self.eval = {
            "params": p,
            "TPs": TPs,
            "FPs": FPs,
            "positive_micro_FPs": pmFPs,
            "FNs": FNs,
            "precision": precision,
            "positive_micro_precision": positive_micro_precision,
            "recall": recall,
            "F1": F1,
            "positive_micro_F1": positive_micro_F1,
            "positive_macro_F1": local_F1s / valid_F1_count,
            "positive_w0dt_macro_F1": local_F1s / valid_F1_count_w0dt,
            "IL_recall": IL_rec,
            "IL_precision": IL_prec,
            "IL_F1": IL_F1,
            "IL_FPR": IL_FPR,
            "IL_MCC": IL_MCC,
            "IL_perfect_pos": IL_perfect_pos,
            "IL_perfect_neg": IL_perfect_neg,
            "J": total_J,
            "F": total_F,
            "J&F": total_JnF,
        }
        # CGF1 = (positive macro F1) gated by image-level MCC.
        self.eval["CGF1"] = self.eval["positive_macro_F1"] * self.eval["IL_MCC"]
        self.eval["CGF1_w0dt"] = (
            self.eval["positive_w0dt_macro_F1"] * self.eval["IL_MCC"]
        )
        self.eval["CGF1_micro"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]

    def summarize(self):
        """
        Compute and display summary metrics for evaluation results.
        Note this function can *only* be applied on the default parameter setting
        """
        if not self.eval:
            raise Exception("Please run accumulate() first")

        def _summarize(iouThr=None, metric=""):
            # Print/return the metric averaged over IoU thresholds, or at one
            # specific threshold when iouThr is given.
            p = self.params
            iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
            titleStr = "Average " + metric
            iouStr = (
                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
                if iouThr is None
                else "{:0.2f}".format(iouThr)
            )
            s = self.eval[metric]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, iouStr, mean_s))
            return mean_s

        def _summarize_single(metric=""):
            # Print/return a scalar (non-IoU-indexed) metric.
            titleStr = "Average " + metric
            iStr = " {:<35} = {:0.3f}"
            s = self.eval[metric]
            print(iStr.format(titleStr, s))
            return s

        def _summarizeDets():
            # note: the index of these metrics are also used in video Demo F1 evaluation
            # when adding new metrics, please update the index in video Demo F1 evaluation
            # in "evaluate" method of the "VideoDemoF1Evaluator" class
            stats = np.zeros((len(DEMO_METRICS),))
            stats[0] = _summarize(metric="CGF1")
            stats[1] = _summarize(metric="precision")
            stats[2] = _summarize(metric="recall")
            stats[3] = _summarize(metric="F1")
            stats[4] = _summarize(metric="positive_macro_F1")
            stats[5] = _summarize_single(metric="IL_precision")
            stats[6] = _summarize_single(metric="IL_recall")
            stats[7] = _summarize_single(metric="IL_F1")
            stats[8] = _summarize_single(metric="IL_FPR")
            stats[9] = _summarize_single(metric="IL_MCC")
            stats[10] = _summarize(metric="IL_perfect_pos")
            stats[11] = _summarize(metric="IL_perfect_neg")
            stats[12] = _summarize(iouThr=0.5, metric="CGF1")
            stats[13] = _summarize(iouThr=0.5, metric="precision")
            stats[14] = _summarize(iouThr=0.5, metric="recall")
            stats[15] = _summarize(iouThr=0.5, metric="F1")
            stats[16] = _summarize(iouThr=0.5, metric="positive_macro_F1")
            stats[17] = _summarize(iouThr=0.5, metric="IL_perfect_pos")
            stats[18] = _summarize(iouThr=0.5, metric="IL_perfect_neg")
            stats[19] = _summarize(iouThr=0.75, metric="CGF1")
            stats[20] = _summarize(iouThr=0.75, metric="precision")
            stats[21] = _summarize(iouThr=0.75, metric="recall")
            stats[22] = _summarize(iouThr=0.75, metric="F1")
            stats[23] = _summarize(iouThr=0.75, metric="positive_macro_F1")
            stats[24] = _summarize(iouThr=0.75, metric="IL_perfect_pos")
            stats[25] = _summarize(iouThr=0.75, metric="IL_perfect_neg")
            stats[26] = _summarize_single(metric="J")
            stats[27] = _summarize_single(metric="F")
            stats[28] = _summarize_single(metric="J&F")
            stats[29] = _summarize(metric="CGF1_micro")
            stats[30] = _summarize(metric="positive_micro_precision")
            stats[31] = _summarize(metric="positive_micro_F1")
            stats[32] = _summarize(iouThr=0.5, metric="CGF1_micro")
            stats[33] = _summarize(iouThr=0.5, metric="positive_micro_precision")
            stats[34] = _summarize(iouThr=0.5, metric="positive_micro_F1")
            stats[35] = _summarize(iouThr=0.75, metric="CGF1_micro")
            stats[36] = _summarize(iouThr=0.75, metric="positive_micro_precision")
            stats[37] = _summarize(iouThr=0.75, metric="positive_micro_F1")
            stats[38] = _summarize(metric="CGF1_w0dt")
            stats[39] = _summarize(metric="positive_w0dt_macro_F1")
            stats[40] = _summarize(iouThr=0.5, metric="CGF1_w0dt")
            stats[41] = _summarize(iouThr=0.5, metric="positive_w0dt_macro_F1")
            stats[42] = _summarize(iouThr=0.75, metric="CGF1_w0dt")
            stats[43] = _summarize(iouThr=0.75, metric="positive_w0dt_macro_F1")
            return stats

        summarize = _summarizeDets
        self.stats = summarize()
# Human-readable names for the entries of DemoEval.stats, in index order.
# Must stay in sync with the stats indices assigned in
# DemoEval.summarize()._summarizeDets().
DEMO_METRICS = [
    "CGF1",
    "Precision",
    "Recall",
    "F1",
    "Macro_F1",
    "IL_Precision",
    "IL_Recall",
    "IL_F1",
    "IL_FPR",
    "IL_MCC",
    "IL_perfect_pos",
    "IL_perfect_neg",
    "CGF1@0.5",
    "Precision@0.5",
    "Recall@0.5",
    "F1@0.5",
    "Macro_F1@0.5",
    "IL_perfect_pos@0.5",
    "IL_perfect_neg@0.5",
    "CGF1@0.75",
    "Precision@0.75",
    "Recall@0.75",
    "F1@0.75",
    "Macro_F1@0.75",
    "IL_perfect_pos@0.75",
    "IL_perfect_neg@0.75",
    "J",
    "F",
    "J&F",
    "CGF1_micro",
    "positive_micro_Precision",
    "positive_micro_F1",
    "CGF1_micro@0.5",
    "positive_micro_Precision@0.5",
    "positive_micro_F1@0.5",
    "CGF1_micro@0.75",
    "positive_micro_Precision@0.75",
    "positive_micro_F1@0.75",
    "CGF1_w0dt",
    "positive_w0dt_macro_F1",
    "CGF1_w0dt@0.5",
    "positive_w0dt_macro_F1@0.5",
    "CGF1_w0dt@0.75",
    "positive_w0dt_macro_F1@0.75",
]
class DemoEvaluator(CocoEvaluator):
    """CocoEvaluator subclass that runs DemoEval (thresholded "hard" predictions)
    instead of standard COCO AP evaluation."""

    def __init__(
        self,
        coco_gt,
        iou_types,
        dump_dir: Optional[str],
        postprocessor,
        threshold=0.5,
        average_by_rarity=False,
        gather_pred_via_filesys=False,
        exhaustive_only=False,
        all_exhaustive_only=True,
        compute_JnF=False,
        metrics_dump_dir: Optional[str] = None,
    ):
        """
        Args:
            coco_gt: Ground-truth COCO API (or several, for oracle evaluation).
            iou_types: IoU types to evaluate, e.g. ["bbox", "segm"].
            dump_dir: Optional directory to dump predictions.
            postprocessor: Module converting model outputs to COCO format.
            threshold: Score threshold for keeping predictions (see DemoEval).
            average_by_rarity: Currently unused here (rarity-bucket averaging is
                commented out below).
            gather_pred_via_filesys: Gather predictions via a shared filesystem
                instead of torch collectives.
            exhaustive_only / all_exhaustive_only: Filtering flags forwarded to
                CocoEvaluator.
            compute_JnF: Also compute J&F metrics (see DemoEval).
            metrics_dump_dir: Optional directory to dump computed metrics.
        """
        self.iou_types = iou_types
        self.threshold = threshold
        super().__init__(
            coco_gt=coco_gt,
            iou_types=iou_types,
            useCats=False,
            dump_dir=dump_dir,
            postprocessor=postprocessor,
            # average_by_rarity=average_by_rarity,
            gather_pred_via_filesys=gather_pred_via_filesys,
            exhaustive_only=exhaustive_only,
            all_exhaustive_only=all_exhaustive_only,
            metrics_dump_dir=metrics_dump_dir,
        )
        self.use_self_evaluate = True
        self.compute_JnF = compute_JnF

    def _lazy_init(self):
        # Defer heavy initialization until first use; re-assert self-evaluate
        # mode after the parent finishes its own lazy init.
        if self.initialized:
            return
        super()._lazy_init()
        self.use_self_evaluate = True
        self.reset()

    def select_best_scoring(self, scorings):
        # This function is used for "oracle" type evaluation.
        # It accepts the evaluation results with respect to several ground truths, and picks the best
        if len(scorings) == 1:
            return scorings[0]
        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"
        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"
        selected_imgs = []
        for img_id in range(scorings[0].shape[-1]):
            best = scorings[0][:, :, img_id]
            for scoring in scorings[1:]:
                current = scoring[:, :, img_id]
                if "local_F1s" in best[0, 0] and "local_F1s" in current[0, 0]:
                    # we were able to compute a F1 score for this particular image in both evaluations
                    # best["local_F1s"] contains the results at various IoU thresholds. We simply take the average for comparision
                    best_score = best[0, 0]["local_F1s"].mean()
                    current_score = current[0, 0]["local_F1s"].mean()
                    if current_score > best_score:
                        best = current
                else:
                    # If we're here, it means that in that in some evaluation we were not able to get a valid local F1
                    # This happens when both the predictions and targets are empty. In that case, we can assume it's a perfect prediction
                    if "local_F1s" not in current[0, 0]:
                        best = current
            selected_imgs.append(best)
        result = np.stack(selected_imgs, axis=-1)
        assert result.shape == scorings[0].shape
        return result

    def summarize(self):
        """Accumulate and print metrics; returns a flat metric dict on the
        main process (empty dict elsewhere)."""
        self._lazy_init()
        logging.info("Demo evaluator: Summarizing")
        if not is_main_process():
            return {}
        outs = {}
        # Multiple GTs means oracle evaluation; prefix metric names accordingly.
        prefix = "oracle_" if len(self.coco_evals) > 1 else ""
        # if self.rarity_buckets is None:
        self.accumulate(self.eval_img_ids)
        for iou_type, coco_eval in self.coco_evals[0].items():
            print("Demo metric, IoU type={}".format(iou_type))
            coco_eval.summarize()
        if "bbox" in self.coco_evals[0]:
            for i, value in enumerate(self.coco_evals[0]["bbox"].stats):
                outs[f"coco_eval_bbox_{prefix}{DEMO_METRICS[i]}"] = value
        if "segm" in self.coco_evals[0]:
            for i, value in enumerate(self.coco_evals[0]["segm"].stats):
                outs[f"coco_eval_masks_{prefix}{DEMO_METRICS[i]}"] = value
        # else:
        #     total_stats = {}
        #     for bucket, img_list in self.rarity_buckets.items():
        #         self.accumulate(imgIds=img_list)
        #         bucket_name = RARITY_BUCKETS[bucket]
        #         for iou_type, coco_eval in self.coco_evals[0].items():
        #             print(
        #                 "Demo metric, IoU type={}, Rarity bucket={}".format(
        #                     iou_type, bucket_name
        #                 )
        #             )
        #             coco_eval.summarize()
        #         if "bbox" in self.coco_evals[0]:
        #             if "bbox" not in total_stats:
        #                 total_stats["bbox"] = np.zeros_like(
        #                     self.coco_evals[0]["bbox"].stats
        #                 )
        #             total_stats["bbox"] += self.coco_evals[0]["bbox"].stats
        #             for i, value in enumerate(self.coco_evals[0]["bbox"].stats):
        #                 outs[
        #                     f"coco_eval_bbox_{bucket_name}_{prefix}{DEMO_METRICS[i]}"
        #                 ] = value
        #         if "segm" in self.coco_evals[0]:
        #             if "segm" not in total_stats:
        #                 total_stats["segm"] = np.zeros_like(
        #                     self.coco_evals[0]["segm"].stats
        #                 )
        #             total_stats["segm"] += self.coco_evals[0]["segm"].stats
        #             for i, value in enumerate(self.coco_evals[0]["segm"].stats):
        #                 outs[
        #                     f"coco_eval_masks_{bucket_name}_{prefix}{DEMO_METRICS[i]}"
        #                 ] = value
        #     if "bbox" in total_stats:
        #         total_stats["bbox"] /= len(self.rarity_buckets)
        #         for i, value in enumerate(total_stats["bbox"]):
        #             outs[f"coco_eval_bbox_{prefix}{DEMO_METRICS[i]}"] = value
        #     if "segm" in total_stats:
        #         total_stats["segm"] /= len(self.rarity_buckets)
        #         for i, value in enumerate(total_stats["segm"]):
        #             outs[f"coco_eval_masks_{prefix}{DEMO_METRICS[i]}"] = value
        return outs

    def accumulate(self, imgIds=None):
        """Run DemoEval.accumulate, optionally restricted to a set of image ids
        (main process only)."""
        self._lazy_init()
        logging.info(
            f"demo evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
        )
        if not is_main_process():
            return
        if imgIds is not None:
            for coco_eval in self.coco_evals[0].values():
                coco_eval.params.imgIds = list(imgIds)
        for coco_eval in self.coco_evals[0].values():
            coco_eval.accumulate()

    def reset(self):
        """Re-create per-GT, per-IoU-type DemoEval instances and clear state."""
        self.coco_evals = [{} for _ in range(len(self.coco_gts))]
        for i, coco_gt in enumerate(self.coco_gts):
            for iou_type in self.iou_types:
                self.coco_evals[i][iou_type] = DemoEval(
                    coco_gt=coco_gt,
                    iouType=iou_type,
                    threshold=self.threshold,
                    compute_JnF=self.compute_JnF,
                )
                self.coco_evals[i][iou_type].useCats = False
        self.img_ids = []
        self.eval_imgs = {k: [] for k in self.iou_types}
        if self.dump is not None:
            self.dump = []

View File

@@ -0,0 +1 @@
# flake8: noqa

View File

@@ -0,0 +1,114 @@
# flake8: noqa
"""run_youtube_vis.py
Run example:
run_youtube_vis.py --USE_PARALLEL False --METRICS HOTA --TRACKERS_TO_EVAL STEm_Seg
Command Line Arguments: Defaults, # Comments
Eval arguments:
'USE_PARALLEL': False,
'NUM_PARALLEL_CORES': 8,
'BREAK_ON_ERROR': True, # Raises exception and exits with error
'RETURN_ON_ERROR': False, # if not BREAK_ON_ERROR, then returns from function on error
'LOG_ON_ERROR': os.path.join(code_path, 'error_log.txt'), # if not None, save any errors into a log file.
'PRINT_RESULTS': True,
'PRINT_ONLY_COMBINED': False,
'PRINT_CONFIG': True,
'TIME_PROGRESS': True,
'DISPLAY_LESS_PROGRESS': True,
'OUTPUT_SUMMARY': True,
'OUTPUT_EMPTY_CLASSES': True, # If False, summary files are not output for classes with no detections
'OUTPUT_DETAILED': True,
'PLOT_CURVES': True,
Dataset arguments:
'GT_FOLDER': os.path.join(code_path, 'data/gt/youtube_vis/youtube_vis_training'), # Location of GT data
'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/youtube_vis/youtube_vis_training'),
# Trackers location
'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder)
'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes)
'SPLIT_TO_EVAL': 'training', # Valid: 'training', 'val'
'PRINT_CONFIG': True, # Whether to print current config
'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
'TRACKER_SUB_FOLDER': 'data', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
Metric arguments:
'METRICS': ['TrackMAP', 'HOTA', 'CLEAR', 'Identity']
"""
import argparse
import os
import sys
from multiprocessing import freeze_support
from . import trackeval
def run_ytvis_eval(args=None, gt_json=None, dt_json=None):
    """Run YouTube-VIS HOTA evaluation via the bundled TrackEval code.

    Args:
        args: optional list of command-line-style arguments. When None,
            argparse falls back to sys.argv (standard argparse behavior).
        gt_json: optional ground-truth annotations as an already-loaded JSON
            object; when given, no GT file needs to be read from disk.
        dt_json: optional tracker results as an already-loaded JSON object.

    Returns:
        The (output_res, output_msg) tuple produced by trackeval.Evaluator.

    Raises:
        Exception: if a boolean command-line value is not "True"/"False",
            or if no metric named in METRICS is available.
    """
    # Command line interface:
    default_eval_config = trackeval.Evaluator.get_default_eval_config()
    # print only combined since TrackMAP is undefined for per sequence breakdowns
    default_eval_config["PRINT_ONLY_COMBINED"] = True
    default_dataset_config = trackeval.datasets.YouTubeVIS.get_default_dataset_config()
    default_metrics_config = {"METRICS": ["HOTA"]}
    config = {
        **default_eval_config,
        **default_dataset_config,
        **default_metrics_config,
    }  # Merge default configs
    parser = argparse.ArgumentParser()
    for setting, default in config.items():
        # List-valued (or unset) settings accept multiple values.
        if isinstance(default, list) or default is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args(args).__dict__
    for setting, value in args.items():
        if value is None:
            continue
        default = config[setting]
        # Coerce the string(s) from argparse back to the default's type.
        # Note: bool must be checked before int, since bool is a subclass of int.
        if isinstance(default, bool):
            if value == "True":
                parsed = True
            elif value == "False":
                parsed = False
            else:
                raise Exception(
                    "Command line parameter " + setting + " must be True or False"
                )
        elif isinstance(default, int):
            parsed = int(value)
        else:
            parsed = value
        config[setting] = parsed
    # Split the merged config back into the three consumer-specific configs.
    eval_config = {k: v for k, v in config.items() if k in default_eval_config}
    dataset_config = {
        k: v for k, v in config.items() if k in default_dataset_config
    }
    metrics_config = {
        k: v for k, v in config.items() if k in default_metrics_config
    }
    # Run code
    evaluator = trackeval.Evaluator(eval_config)
    # allow directly specifying the GT JSON data and Tracker (result)
    # JSON data as Python objects, without reading from files.
    dataset_config["GT_JSON_OBJECT"] = gt_json
    dataset_config["TRACKER_JSON_OBJECT"] = dt_json
    dataset_list = [trackeval.datasets.YouTubeVIS(dataset_config)]
    # for metric in [trackeval.metrics.TrackMAP, trackeval.metrics.HOTA, trackeval.metrics.CLEAR,
    #                trackeval.metrics.Identity]:
    metrics_list = [
        metric()
        for metric in [trackeval.metrics.HOTA]
        if metric.get_name() in metrics_config["METRICS"]
    ]
    if not metrics_list:
        raise Exception("No metrics selected for evaluation")
    output_res, output_msg = evaluator.evaluate(dataset_list, metrics_list)
    return output_res, output_msg
if __name__ == "__main__":
    # `sys` is already imported at module scope; the previous local
    # `import sys` here was redundant.
    freeze_support()
    run_ytvis_eval(sys.argv[1:])

View File

@@ -0,0 +1,4 @@
# flake8: noqa
from . import datasets, metrics, utils
from .eval import Evaluator

View File

@@ -0,0 +1,68 @@
# flake8: noqa
import inspect
from functools import wraps
from time import perf_counter
# Global switches: timing is off by default and enabled by the Evaluator
# when config["TIME_PROGRESS"] is set (and USE_PARALLEL is off).
DO_TIMING = False
DISPLAY_LESS_PROGRESS = False

# Accumulated wall-clock time (seconds) per function/method name.
timer_dict = {}
# Running count of timed free-function calls, used to number printed lines.
counter = 0


def time(f):
    """Decorator that optionally times *f*, printing and accumulating results.

    When DO_TIMING is False the wrapped function is called straight through.
    When True, each call is timed with perf_counter, the duration is
    accumulated per name in ``timer_dict``, and a per-call line is printed
    (methods indented, free functions numbered). A summary of all
    accumulated times is printed once ``Evaluator.evaluate`` returns.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if not DO_TIMING:
            # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is
            # true, run functions normally without timing.
            return f(*args, **kw)
        # Run function with timing
        ts = perf_counter()
        result = f(*args, **kw)
        te = perf_counter()
        tt = te - ts
        # Get function name. Guard against zero-argument functions, for which
        # arg_names is empty (indexing arg_names[0] would raise IndexError).
        arg_names = inspect.getfullargspec(f)[0]
        is_method = bool(arg_names) and arg_names[0] == "self"
        if is_method and DISPLAY_LESS_PROGRESS:
            return result
        if is_method:
            method_name = type(args[0]).__name__ + "." + f.__name__
        else:
            method_name = f.__name__
        # Record accumulative time in each function for analysis
        if method_name in timer_dict:
            timer_dict[method_name] += tt
        else:
            timer_dict[method_name] = tt
        # If code is finished, display timing summary
        if method_name == "Evaluator.evaluate":
            print("")
            print("Timing analysis:")
            for key, value in timer_dict.items():
                print("%-70s %2.4f sec" % (key, value))
        else:
            # Get function argument values for printing special arguments of interest
            arg_titles = ["tracker", "seq", "cls"]
            arg_vals = []
            for i, a in enumerate(arg_names):
                if a in arg_titles:
                    arg_vals.append(args[i])
            arg_text = "(" + ", ".join(arg_vals) + ")"
            # Display methods and functions with different indentation.
            if is_method:
                print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
            elif arg_names and arg_names[0] == "test":
                pass
            else:
                global counter
                counter += 1
                print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))
        return result

    return wrap

View File

@@ -0,0 +1,4 @@
# flake8: noqa
from .tao_ow import TAO_OW
from .youtube_vis import YouTubeVIS

View File

@@ -0,0 +1,379 @@
# flake8: noqa
import csv
import io
import os
import traceback
import zipfile
from abc import ABC, abstractmethod
from copy import deepcopy
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseDataset(ABC):
    """Abstract base class for all TrackEval datasets.

    Subclasses implement loading of ground-truth and tracker files for one
    benchmark; this base class provides the shared helpers for text-file
    parsing, IoU/similarity computation and id-uniqueness checking.
    """

    @abstractmethod
    def __init__(self):
        self.tracker_list = None
        self.seq_list = None
        self.class_list = None
        self.output_fol = None
        self.output_sub_fol = None
        self.should_classes_combine = True
        self.use_super_categories = False

    # Functions to implement:

    @staticmethod
    @abstractmethod
    def get_default_dataset_config(): ...

    @abstractmethod
    def _load_raw_file(self, tracker, seq, is_gt): ...

    @_timing.time
    @abstractmethod
    def get_preprocessed_seq_data(self, raw_data, cls): ...

    @abstractmethod
    def _calculate_similarities(self, gt_dets_t, tracker_dets_t): ...

    # Helper functions for all datasets:

    @classmethod
    def get_class_name(cls):
        return cls.__name__

    def get_name(self):
        return self.get_class_name()

    def get_output_fol(self, tracker):
        # Per-tracker output directory: OUTPUT_FOLDER/tracker/OUTPUT_SUB_FOLDER.
        return os.path.join(self.output_fol, tracker, self.output_sub_fol)

    def get_display_name(self, tracker):
        """Can be overwritten if the trackers name (in files) is different to how it should be displayed.
        By default this method just returns the trackers name as is.
        """
        return tracker

    def get_eval_info(self):
        """Return info about the dataset needed for the Evaluator"""
        return self.tracker_list, self.seq_list, self.class_list

    @_timing.time
    def get_raw_seq_data(self, tracker, seq):
        """Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
        Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
        A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for
        the evaluation of each class.
        This returns a dict which contains the fields:
        [num_timesteps]: integer
        [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
        [similarity_scores]: list (for each timestep) of 2D NDArrays.
        [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).
        gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.
        Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
        independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
        masks vs 2D boxes vs 3D boxes).
        We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
        we don't wish to calculate this twice.
        We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
        calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
        """
        # Load raw data.
        raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
        raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
        raw_data = {**raw_tracker_data, **raw_gt_data}  # Merges dictionaries
        # Calculate similarities for each timestep.
        similarity_scores = []
        for gt_dets_t, tracker_dets_t in zip(
            raw_data["gt_dets"], raw_data["tracker_dets"]
        ):
            ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
            similarity_scores.append(ious)
        raw_data["similarity_scores"] = similarity_scores
        return raw_data

    @staticmethod
    def _load_simple_text_file(
        file,
        time_col=0,
        id_col=None,
        remove_negative_ids=False,
        valid_filter=None,
        crowd_ignore_filter=None,
        convert_filter=None,
        is_zipped=False,
        zip_file=None,
        force_delimiters=None,
    ):
        """Function that loads data which is in a commonly used text file format.
        Assumes each det is given by one row of a text file.
        There is no limit to the number or meaning of each column,
        however one column needs to give the timestep of each det (time_col) which is default col 0.
        The file dialect (deliminator, num cols, etc) is determined automatically.
        This function automatically separates dets by timestep,
        and is much faster than alternatives such as np.loadtext or pandas.
        If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
        These are not excluded from ignore data.
        valid_filter can be used to only include certain classes.
        It is a dict with ints as keys, and lists as values,
        such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
        If None, all classes are included.
        crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
        convert_filter can be used to convert value read to another format.
        This is used most commonly to convert classes given as string to a class id.
        This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
        Optionally, input files could be a zip of multiple text files for storage efficiency.
        Returns read_data and ignore_data.
        Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
        Note that all data is returned as strings, and must be converted to float/int later if needed.
        Note that timesteps will not be present in the returned dict keys if there are no dets for them
        """
        if remove_negative_ids and id_col is None:
            raise TrackEvalException(
                "remove_negative_ids is True, but id_col is not given."
            )
        if crowd_ignore_filter is None:
            crowd_ignore_filter = {}
        if convert_filter is None:
            convert_filter = {}
        try:
            if is_zipped:  # Either open file directly or within a zip.
                if zip_file is None:
                    raise TrackEvalException(
                        "is_zipped set to True, but no zip_file is given."
                    )
                archive = zipfile.ZipFile(os.path.join(zip_file), "r")
                fp = io.TextIOWrapper(archive.open(file, "r"))
            else:
                fp = open(file)
            read_data = {}
            crowd_ignore_data = {}
            fp.seek(0, os.SEEK_END)
            # check if file is empty
            if fp.tell():
                fp.seek(0)
                dialect = csv.Sniffer().sniff(
                    fp.readline(), delimiters=force_delimiters
                )  # Auto determine structure.
                dialect.skipinitialspace = (
                    True  # Deal with extra spaces between columns
                )
                fp.seek(0)
                reader = csv.reader(fp, dialect)
                for row in reader:
                    try:
                        # Deal with extra trailing spaces at the end of rows
                        if row[-1] == "":
                            row = row[:-1]
                        timestep = str(int(float(row[time_col])))
                        # Read ignore regions separately.
                        is_ignored = False
                        for ignore_key, ignore_value in crowd_ignore_filter.items():
                            if row[ignore_key].lower() in ignore_value:
                                # Convert values in one column (e.g. string to id)
                                for (
                                    convert_key,
                                    convert_value,
                                ) in convert_filter.items():
                                    row[convert_key] = convert_value[
                                        row[convert_key].lower()
                                    ]
                                # Save data separated by timestep.
                                if timestep in crowd_ignore_data.keys():
                                    crowd_ignore_data[timestep].append(row)
                                else:
                                    crowd_ignore_data[timestep] = [row]
                                is_ignored = True
                        if (
                            is_ignored
                        ):  # if det is an ignore region, it cannot be a normal det.
                            continue
                        # Exclude some dets if not valid.
                        # Bug fix: the previous `continue` only advanced the
                        # inner filter loop, so invalid rows were never
                        # actually skipped; use an explicit flag instead.
                        if valid_filter is not None:
                            is_valid = True
                            for key, value in valid_filter.items():
                                if row[key].lower() not in value:
                                    is_valid = False
                                    break
                            if not is_valid:
                                continue
                        if remove_negative_ids:
                            if int(float(row[id_col])) < 0:
                                continue
                        # Convert values in one column (e.g. string to id)
                        for convert_key, convert_value in convert_filter.items():
                            row[convert_key] = convert_value[row[convert_key].lower()]
                        # Save data separated by timestep.
                        if timestep in read_data.keys():
                            read_data[timestep].append(row)
                        else:
                            read_data[timestep] = [row]
                    except Exception:
                        exc_str_init = (
                            "In file %s the following line cannot be read correctly: \n"
                            % os.path.basename(file)
                        )
                        exc_str = " ".join([exc_str_init] + row)
                        raise TrackEvalException(exc_str)
            fp.close()
        except Exception:
            print("Error loading file: %s, printing traceback." % file)
            traceback.print_exc()
            raise TrackEvalException(
                "File %s cannot be read because it is either not present or invalidly formatted"
                % os.path.basename(file)
            )
        return read_data, crowd_ignore_data

    @staticmethod
    def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
        """Calculates the IOU (intersection over union) between two arrays of segmentation masks.
        If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
        arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
        If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly
        used to determine if detections are within crowd ignore region.
        :param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                       else pycocotools rle encoded format)
        :param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                       else pycocotools rle encoded format)
        :param is_encoded: whether the input is in pycocotools rle encoded format
        :param do_ioa: whether to perform IoA computation
        :return: the IoU/IoA scores
        """
        # Only loaded when run to reduce minimum requirements
        from pycocotools import mask as mask_utils

        # use pycocotools for run length encoding of masks
        if not is_encoded:
            masks1 = mask_utils.encode(
                np.array(np.transpose(masks1, (1, 2, 0)), order="F")
            )
            masks2 = mask_utils.encode(
                np.array(np.transpose(masks2, (1, 2, 0)), order="F")
            )
        # use pycocotools for iou computation of rle encoded masks
        ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
        if len(masks1) == 0 or len(masks2) == 0:
            # pycocotools returns [] for empty inputs; normalize the shape.
            ious = np.asarray(ious).reshape(len(masks1), len(masks2))
        assert (ious >= 0 - np.finfo("float").eps).all()
        assert (ious <= 1 + np.finfo("float").eps).all()
        return ious

    @staticmethod
    def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
        """Calculates the IOU (intersection over union) between two arrays of boxes.
        Allows variable box formats ('xywh' and 'x0y0x1y1').
        If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
        used to determine if detections are within crowd ignore region.
        """
        # Use exact equality: the previous substring test (`in "xywh"`)
        # silently accepted invalid formats such as "x" or "xy".
        if box_format == "xywh":
            # layout: (x0, y0, w, h)
            bboxes1 = deepcopy(bboxes1)
            bboxes2 = deepcopy(bboxes2)
            bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
            bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
            bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
            bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
        elif box_format != "x0y0x1y1":
            raise (TrackEvalException("box_format %s is not implemented" % box_format))
        # layout: (x0, y0, x1, y1)
        min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
            min_[..., 3] - max_[..., 1], 0
        )
        area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
            bboxes1[..., 3] - bboxes1[..., 1]
        )
        if do_ioa:
            ioas = np.zeros_like(intersection)
            valid_mask = area1 > 0 + np.finfo("float").eps
            ioas[valid_mask, :] = (
                intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
            )
            return ioas
        else:
            area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
                bboxes2[..., 3] - bboxes2[..., 1]
            )
            union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
            # Degenerate (zero-area) boxes contribute zero IoU.
            intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
            intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
            intersection[union <= 0 + np.finfo("float").eps] = 0
            union[union <= 0 + np.finfo("float").eps] = 1
            ious = intersection / union
            return ious

    @staticmethod
    def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
        """Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
        measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
        The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
        threshold corresponds to a 1m distance threshold for TPs.
        """
        dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
        sim = np.maximum(0, 1 - dist / zero_distance)
        return sim

    @staticmethod
    def _check_unique_ids(data, after_preproc=False):
        """Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
        gt_ids = data["gt_ids"]
        tracker_ids = data["tracker_ids"]
        for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
            if len(tracker_ids_t) > 0:
                unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
                if np.max(counts) != 1:
                    duplicate_ids = unique_ids[counts > 1]
                    exc_str_init = (
                        "Tracker predicts the same ID more than once in a single timestep "
                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
                    )
                    if after_preproc:
                        exc_str_init += (
                            "\n Note that this error occurred after preprocessing (but not before), "
                            "so ids may not be as in file, and something seems wrong with preproc."
                        )
                    # Build the message only after the optional note has been
                    # appended; previously it was appended too late and lost.
                    exc_str = (
                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
                    )
                    raise TrackEvalException(exc_str)
            if len(gt_ids_t) > 0:
                unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
                if np.max(counts) != 1:
                    duplicate_ids = unique_ids[counts > 1]
                    exc_str_init = (
                        "Ground-truth has the same ID more than once in a single timestep "
                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
                    )
                    if after_preproc:
                        exc_str_init += (
                            "\n Note that this error occurred after preprocessing (but not before), "
                            "so ids may not be as in file, and something seems wrong with preproc."
                        )
                    exc_str = (
                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
                    )
                    raise TrackEvalException(exc_str)

View File

@@ -0,0 +1,891 @@
# flake8: noqa
import itertools
import json
import os
from collections import defaultdict
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing, utils
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class TAO_OW(_BaseDataset):
"""Dataset class for TAO tracking"""
@staticmethod
def get_default_dataset_config():
    """Return the default configuration dict for the TAO_OW dataset."""
    code_path = utils.get_code_path()
    # Folder layout defaults mirror the repo's data/ directory structure.
    return {
        "GT_FOLDER": os.path.join(
            code_path, "data/gt/tao/tao_training"
        ),  # Location of GT data
        "TRACKERS_FOLDER": os.path.join(
            code_path, "data/trackers/tao/tao_training"
        ),  # Trackers location
        "OUTPUT_FOLDER": None,  # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "TRACKERS_TO_EVAL": None,  # Filenames of trackers to eval (if None, all in folder)
        "CLASSES_TO_EVAL": None,  # Classes to eval (if None, all classes)
        "SPLIT_TO_EVAL": "training",  # Valid: 'training', 'val'
        "PRINT_CONFIG": True,  # Whether to print current config
        "TRACKER_SUB_FOLDER": "data",  # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",  # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "TRACKER_DISPLAY_NAMES": None,  # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "MAX_DETECTIONS": 300,  # Number of maximal allowed detections per image (0 for unlimited)
        "SUBSET": "all",
    }
def __init__(self, config=None):
    """Initialise dataset, checking that all required files are present

    Loads the single GT json from GT_FOLDER, optionally filters it to a
    subset, builds per-sequence lookup tables, and loads/normalises one
    result json per tracker.
    """
    super().__init__()
    # Fill non-given config values with defaults
    self.config = utils.init_config(
        config, self.get_default_dataset_config(), self.get_name()
    )
    self.gt_fol = self.config["GT_FOLDER"]
    self.tracker_fol = self.config["TRACKERS_FOLDER"]
    self.should_classes_combine = True
    self.use_super_categories = False
    self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
    self.output_fol = self.config["OUTPUT_FOLDER"]
    if self.output_fol is None:
        self.output_fol = self.tracker_fol
    self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
    # GT_FOLDER must contain exactly one TAO-format annotation json.
    gt_dir_files = [
        file for file in os.listdir(self.gt_fol) if file.endswith(".json")
    ]
    if len(gt_dir_files) != 1:
        raise TrackEvalException(
            self.gt_fol + " does not contain exactly one json file."
        )
    with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
        self.gt_data = json.load(f)
    self.subset = self.config["SUBSET"]
    if self.subset != "all":
        # Split GT data into `known`, `unknown` or `distractor`
        self._split_known_unknown_distractor()
        self.gt_data = self._filter_gt_data(self.gt_data)
    # merge categories marked with a merged tag in TAO dataset
    self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])
    # Get sequences to eval and sequence information
    # (video names use '-' instead of '/' so they are filesystem-safe)
    self.seq_list = [
        vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
    ]
    self.seq_name_to_seq_id = {
        vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
    }
    # compute mappings from videos to annotation data
    self.videos_to_gt_tracks, self.videos_to_gt_images = self._compute_vid_mappings(
        self.gt_data["annotations"]
    )
    # compute sequence lengths (number of images per video)
    self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
    for img in self.gt_data["images"]:
        self.seq_lengths[img["video_id"]] += 1
    self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings()
    # Per-sequence positive / negative / not-exhaustively-labeled category ids.
    self.seq_to_classes = {
        vid["id"]: {
            "pos_cat_ids": list(
                {
                    track["category_id"]
                    for track in self.videos_to_gt_tracks[vid["id"]]
                }
            ),
            "neg_cat_ids": vid["neg_category_ids"],
            "not_exhaustively_labeled_cat_ids": vid["not_exhaustive_category_ids"],
        }
        for vid in self.gt_data["videos"]
    }
    # Get classes to eval
    considered_vid_ids = [self.seq_name_to_seq_id[vid] for vid in self.seq_list]
    seen_cats = set(
        [
            cat_id
            for vid_id in considered_vid_ids
            for cat_id in self.seq_to_classes[vid_id]["pos_cat_ids"]
        ]
    )
    # only classes with ground truth are evaluated in TAO
    self.valid_classes = [
        cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
    ]
    # cls_name_to_cls_id_map = {cls['name']: cls['id'] for cls in self.gt_data['categories']}
    # NOTE: this open-world variant always evaluates a single class-agnostic
    # "object" class, regardless of CLASSES_TO_EVAL.
    if self.config["CLASSES_TO_EVAL"]:
        # self.class_list = [cls.lower() if cls.lower() in self.valid_classes else None
        #                    for cls in self.config['CLASSES_TO_EVAL']]
        self.class_list = ["object"]  # class-agnostic
        if not all(self.class_list):
            raise TrackEvalException(
                "Attempted to evaluate an invalid class. Only classes "
                + ", ".join(self.valid_classes)
                + " are valid (classes present in ground truth data)."
            )
    else:
        # self.class_list = [cls for cls in self.valid_classes]
        self.class_list = ["object"]  # class-agnostic
    # self.class_name_to_class_id = {k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list}
    self.class_name_to_class_id = {"object": 1}  # class-agnostic
    # Get trackers to eval
    if self.config["TRACKERS_TO_EVAL"] is None:
        self.tracker_list = os.listdir(self.tracker_fol)
    else:
        self.tracker_list = self.config["TRACKERS_TO_EVAL"]
    if self.config["TRACKER_DISPLAY_NAMES"] is None:
        self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
    elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
        len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
    ):
        self.tracker_to_disp = dict(
            zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
        )
    else:
        raise TrackEvalException(
            "List of tracker files and tracker display names do not match."
        )
    # Load and normalise each tracker's single result json.
    self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
    for tracker in self.tracker_list:
        tr_dir_files = [
            file
            for file in os.listdir(
                os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
            )
            if file.endswith(".json")
        ]
        if len(tr_dir_files) != 1:
            raise TrackEvalException(
                os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
                + " does not contain exactly one json file."
            )
        with open(
            os.path.join(
                self.tracker_fol, tracker, self.tracker_sub_fol, tr_dir_files[0]
            )
        ) as f:
            curr_data = json.load(f)
        # limit detections if MAX_DETECTIONS > 0
        if self.config["MAX_DETECTIONS"]:
            curr_data = self._limit_dets_per_image(curr_data)
        # fill missing video ids
        self._fill_video_ids_inplace(curr_data)
        # make track ids unique over whole evaluation set
        self._make_track_ids_unique(curr_data)
        # merge categories marked with a merged tag in TAO dataset
        self._merge_categories(curr_data)
        # get tracker sequence information
        curr_videos_to_tracker_tracks, curr_videos_to_tracker_images = (
            self._compute_vid_mappings(curr_data)
        )
        self.tracker_data[tracker]["vids_to_tracks"] = curr_videos_to_tracker_tracks
        self.tracker_data[tracker]["vids_to_images"] = curr_videos_to_tracker_images
def get_display_name(self, tracker):
    """Return the configured display name for *tracker*."""
    display_names = self.tracker_to_disp
    return display_names[tracker]
def _load_raw_file(self, tracker, seq, is_gt):
    """Load a file (gt or tracker) in the TAO format
    If is_gt, this returns a dict which contains the fields:
    [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets]: list (for each timestep) of lists of detections.
    [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
                            keys and corresponding segmentations as values) for each track
    [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values
                            as keys and lists (for each track) as values
    if not is_gt, this returns a dict which contains the fields:
    [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
    [tracker_dets]: list (for each timestep) of lists of detections.
    [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
                            keys and corresponding segmentations as values) for each track
    [classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values
                            as keys and lists as values
    [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
    """
    seq_id = self.seq_name_to_seq_id[seq]
    # File location
    if is_gt:
        imgs = self.videos_to_gt_images[seq_id]
    else:
        imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
    # Convert data to required format
    num_timesteps = self.seq_lengths[seq_id]
    img_to_timestep = self.seq_to_images_to_timestep[seq_id]
    data_keys = ["ids", "classes", "dets"]
    if not is_gt:
        data_keys += ["tracker_confidences"]
    # One slot per timestep; slots left as None are filled with empties below.
    raw_data = {key: [None] * num_timesteps for key in data_keys}
    for img in imgs:
        # some tracker data contains images without any ground truth information, these are ignored
        try:
            t = img_to_timestep[img["id"]]
        except KeyError:
            continue
        annotations = img["annotations"]
        raw_data["dets"][t] = np.atleast_2d(
            [ann["bbox"] for ann in annotations]
        ).astype(float)
        raw_data["ids"][t] = np.atleast_1d(
            [ann["track_id"] for ann in annotations]
        ).astype(int)
        raw_data["classes"][t] = np.atleast_1d([1 for _ in annotations]).astype(
            int
        )  # class-agnostic
        if not is_gt:
            raw_data["tracker_confidences"][t] = np.atleast_1d(
                [ann["score"] for ann in annotations]
            ).astype(float)
    # Replace empty timesteps with correctly-shaped empty arrays.
    for t, d in enumerate(raw_data["dets"]):
        if d is None:
            raw_data["dets"][t] = np.empty((0, 4)).astype(float)
            raw_data["ids"][t] = np.empty(0).astype(int)
            raw_data["classes"][t] = np.empty(0).astype(int)
            if not is_gt:
                raw_data["tracker_confidences"][t] = np.empty(0)
    # Rename generic keys to gt_*/tracker_* depending on the data source.
    if is_gt:
        key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
    else:
        key_map = {
            "ids": "tracker_ids",
            "classes": "tracker_classes",
            "dets": "tracker_dets",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)
    # all_classes = [self.class_name_to_class_id[cls] for cls in self.class_list]
    all_classes = [1]  # class-agnostic
    if is_gt:
        classes_to_consider = all_classes
        all_tracks = self.videos_to_gt_tracks[seq_id]
    else:
        # classes_to_consider = self.seq_to_classes[seq_id]['pos_cat_ids'] \
        #     + self.seq_to_classes[seq_id]['neg_cat_ids']
        classes_to_consider = all_classes  # class-agnostic
        all_tracks = self.tracker_data[tracker]["vids_to_tracks"][seq_id]
    # classes_to_tracks = {cls: [track for track in all_tracks if track['category_id'] == cls]
    #                      if cls in classes_to_consider else [] for cls in all_classes}
    classes_to_tracks = {
        cls: [track for track in all_tracks] if cls in classes_to_consider else []
        for cls in all_classes
    }  # class-agnostic
    # mapping from classes to track information
    raw_data["classes_to_tracks"] = {
        cls: [
            {
                det["image_id"]: np.atleast_1d(det["bbox"])
                for det in track["annotations"]
            }
            for track in tracks
        ]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_ids"] = {
        cls: [track["id"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_areas"] = {
        cls: [track["area"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_lengths"] = {
        cls: [len(track["annotations"]) for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    if not is_gt:
        # Track score = mean of the track's per-detection scores.
        raw_data["classes_to_dt_track_scores"] = {
            cls: np.array(
                [
                    np.mean([float(x["score"]) for x in track["annotations"]])
                    for track in tracks
                ]
            )
            for cls, tracks in classes_to_tracks.items()
        }
    # Rename the class-to-track mappings to gt_*/dt_* variants.
    if is_gt:
        key_map = {
            "classes_to_tracks": "classes_to_gt_tracks",
            "classes_to_track_ids": "classes_to_gt_track_ids",
            "classes_to_track_lengths": "classes_to_gt_track_lengths",
            "classes_to_track_areas": "classes_to_gt_track_areas",
        }
    else:
        key_map = {
            "classes_to_tracks": "classes_to_dt_tracks",
            "classes_to_track_ids": "classes_to_dt_track_ids",
            "classes_to_track_lengths": "classes_to_dt_track_lengths",
            "classes_to_track_areas": "classes_to_dt_track_areas",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)
    raw_data["num_timesteps"] = num_timesteps
    raw_data["neg_cat_ids"] = self.seq_to_classes[seq_id]["neg_cat_ids"]
    raw_data["not_exhaustively_labeled_cls"] = self.seq_to_classes[seq_id][
        "not_exhaustively_labeled_cat_ids"
    ]
    raw_data["seq"] = seq
    return raw_data
    @_timing.time
    def get_preprocessed_seq_data(self, raw_data, cls):
        """Preprocess data for a single sequence for a single class ready for evaluation.
        Inputs:
            - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
            - cls is the class to be evaluated.
        Outputs:
            - data is a dict containing all of the information that metrics need to perform evaluation.
              It contains the following fields:
                [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
                [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                [similarity_scores]: list (for each timestep) of 2D NDArrays.
        Notes:
            General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
                1) Extract only detections relevant for the class to be evaluated (including distractor detections).
                2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
                    distractor class, or otherwise marked as to be removed.
                3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
                    other criteria (e.g. are too small).
                4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
            After the above preprocessing steps, this function also calculates the number of gt and tracker detections
            and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
            unique within each timestep.
        TAO:
            In TAO, the 4 preproc steps are as follow:
                1) All classes present in the ground truth data are evaluated separately.
                2) No matched tracker detections are removed.
                3) Unmatched tracker detections are removed if there is not ground truth data and the class does not
                    belong to the categories marked as negative for this sequence. Additionally, unmatched tracker
                    detections for classes which are marked as not exhaustively labeled are removed.
                4) No gt detections are removed.
            Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
            and the tracks from the tracker data are sorted according to the tracker confidence.
        """
        cls_id = self.class_name_to_class_id[cls]
        # flags that drive preproc step 3 (removal of unmatched tracker dets)
        is_not_exhaustively_labeled = cls_id in raw_data["not_exhaustively_labeled_cls"]
        is_neg_category = cls_id in raw_data["neg_cat_ids"]
        data_keys = [
            "gt_ids",
            "tracker_ids",
            "gt_dets",
            "tracker_dets",
            "tracker_confidences",
            "similarity_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tracker_ids = []
        num_gt_dets = 0
        num_tracker_dets = 0
        for t in range(raw_data["num_timesteps"]):
            # Only extract relevant dets for this class for preproc and eval (cls)
            gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
            gt_class_mask = gt_class_mask.astype(bool)
            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            gt_dets = raw_data["gt_dets"][t][gt_class_mask]
            tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
            tracker_class_mask = tracker_class_mask.astype(bool)
            tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
            tracker_dets = raw_data["tracker_dets"][t][tracker_class_mask]
            tracker_confidences = raw_data["tracker_confidences"][t][tracker_class_mask]
            # restrict the similarity matrix to the rows/cols of this class
            similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
                :, tracker_class_mask
            ]
            # Match tracker and gt dets (with hungarian algorithm).
            unmatched_indices = np.arange(tracker_ids.shape[0])
            if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0:
                matching_scores = similarity_scores.copy()
                # zero out IoUs below 0.5 so the assignment cannot create
                # sub-threshold matches
                matching_scores[matching_scores < 0.5 - np.finfo("float").eps] = 0
                match_rows, match_cols = linear_sum_assignment(-matching_scores)
                # keep only assignments with a strictly positive score,
                # i.e. pairs whose IoU passed the 0.5 threshold above
                actually_matched_mask = (
                    matching_scores[match_rows, match_cols] > 0 + np.finfo("float").eps
                )
                match_cols = match_cols[actually_matched_mask]
                unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0)
            # TAO preproc step 3: drop unmatched tracker dets when there is no
            # GT for this class (unless it is a negative category) or when the
            # class is not exhaustively labeled in this sequence
            if gt_ids.shape[0] == 0 and not is_neg_category:
                to_remove_tracker = unmatched_indices
            elif is_not_exhaustively_labeled:
                to_remove_tracker = unmatched_indices
            else:
                to_remove_tracker = np.array([], dtype=int)
            # remove all unwanted unmatched tracker detections
            data["tracker_ids"][t] = np.delete(tracker_ids, to_remove_tracker, axis=0)
            data["tracker_dets"][t] = np.delete(tracker_dets, to_remove_tracker, axis=0)
            data["tracker_confidences"][t] = np.delete(
                tracker_confidences, to_remove_tracker, axis=0
            )
            similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1)
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets
            data["similarity_scores"][t] = similarity_scores
            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
            num_tracker_dets += len(data["tracker_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])
        # Re-label IDs such that there are no empty IDs
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            # nan entries mark ids that never occur; they are never indexed below
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
        if len(unique_tracker_ids) > 0:
            unique_tracker_ids = np.unique(unique_tracker_ids)
            tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
            tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
            for t in range(raw_data["num_timesteps"]):
                if len(data["tracker_ids"][t]) > 0:
                    data["tracker_ids"][t] = tracker_id_map[
                        data["tracker_ids"][t]
                    ].astype(int)
        # Record overview statistics.
        data["num_tracker_dets"] = num_tracker_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tracker_ids"] = len(unique_tracker_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]
        # get track representations
        data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
        data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
        data["gt_track_lengths"] = raw_data["classes_to_gt_track_lengths"][cls_id]
        data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
        data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
        data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
        data["dt_track_lengths"] = raw_data["classes_to_dt_track_lengths"][cls_id]
        data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
        data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
        data["not_exhaustively_labeled"] = is_not_exhaustively_labeled
        data["iou_type"] = "bbox"
        # sort tracker data tracks by tracker confidence scores
        if data["dt_tracks"]:
            idx = np.argsort(
                [-score for score in data["dt_track_scores"]], kind="mergesort"
            )
            data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
            data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
            data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
            data["dt_track_lengths"] = [data["dt_track_lengths"][i] for i in idx]
            data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
        # Ensure that ids are unique per timestep.
        self._check_unique_ids(data)
        return data
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
similarity_scores = self._calculate_box_ious(gt_dets_t, tracker_dets_t)
return similarity_scores
def _merge_categories(self, annotations):
"""
Merges categories with a merged tag. Adapted from https://github.com/TAO-Dataset
:param annotations: the annotations in which the classes should be merged
:return: None
"""
merge_map = {}
for category in self.gt_data["categories"]:
if "merged" in category:
for to_merge in category["merged"]:
merge_map[to_merge["id"]] = category["id"]
for ann in annotations:
ann["category_id"] = merge_map.get(ann["category_id"], ann["category_id"])
def _compute_vid_mappings(self, annotations):
"""
Computes mappings from Videos to corresponding tracks and images.
:param annotations: the annotations for which the mapping should be generated
:return: the video-to-track-mapping, the video-to-image-mapping
"""
vids_to_tracks = {}
vids_to_imgs = {}
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
# compute an mapping from image IDs to images
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
for ann in annotations:
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
vid = ann["video_id"]
if ann["video_id"] not in vids_to_tracks.keys():
vids_to_tracks[ann["video_id"]] = list()
if ann["video_id"] not in vids_to_imgs.keys():
vids_to_imgs[ann["video_id"]] = list()
# Fill in vids_to_tracks
tid = ann["track_id"]
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
try:
index1 = exist_tids.index(tid)
except ValueError:
index1 = -1
if tid not in exist_tids:
curr_track = {
"id": tid,
"category_id": ann["category_id"],
"video_id": vid,
"annotations": [ann],
}
vids_to_tracks[vid].append(curr_track)
else:
vids_to_tracks[vid][index1]["annotations"].append(ann)
# Fill in vids_to_imgs
img_id = ann["image_id"]
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
try:
index2 = exist_img_ids.index(img_id)
except ValueError:
index2 = -1
if index2 == -1:
curr_img = {"id": img_id, "annotations": [ann]}
vids_to_imgs[vid].append(curr_img)
else:
vids_to_imgs[vid][index2]["annotations"].append(ann)
# sort annotations by frame index and compute track area
for vid, tracks in vids_to_tracks.items():
for track in tracks:
track["annotations"] = sorted(
track["annotations"],
key=lambda x: images[x["image_id"]]["frame_index"],
)
# Computer average area
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
track["annotations"]
)
# Ensure all videos are present
for vid_id in vid_ids:
if vid_id not in vids_to_tracks.keys():
vids_to_tracks[vid_id] = []
if vid_id not in vids_to_imgs.keys():
vids_to_imgs[vid_id] = []
return vids_to_tracks, vids_to_imgs
def _compute_image_to_timestep_mappings(self):
"""
Computes a mapping from images to the corresponding timestep in the sequence.
:return: the image-to-timestep-mapping
"""
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
for vid in seq_to_imgs_to_timestep:
curr_imgs = [img["id"] for img in self.videos_to_gt_images[vid]]
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_index"])
seq_to_imgs_to_timestep[vid] = {
curr_imgs[i]: i for i in range(len(curr_imgs))
}
return seq_to_imgs_to_timestep
def _limit_dets_per_image(self, annotations):
"""
Limits the number of detections for each image to config['MAX_DETECTIONS']. Adapted from
https://github.com/TAO-Dataset/
:param annotations: the annotations in which the detections should be limited
:return: the annotations with limited detections
"""
max_dets = self.config["MAX_DETECTIONS"]
img_ann = defaultdict(list)
for ann in annotations:
img_ann[ann["image_id"]].append(ann)
for img_id, _anns in img_ann.items():
if len(_anns) <= max_dets:
continue
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
img_ann[img_id] = _anns[:max_dets]
return [ann for anns in img_ann.values() for ann in anns]
def _fill_video_ids_inplace(self, annotations):
"""
Fills in missing video IDs inplace. Adapted from https://github.com/TAO-Dataset/
:param annotations: the annotations for which the videos IDs should be filled inplace
:return: None
"""
missing_video_id = [x for x in annotations if "video_id" not in x]
if missing_video_id:
image_id_to_video_id = {
x["id"]: x["video_id"] for x in self.gt_data["images"]
}
for x in missing_video_id:
x["video_id"] = image_id_to_video_id[x["image_id"]]
@staticmethod
def _make_track_ids_unique(annotations):
"""
Makes the track IDs unqiue over the whole annotation set. Adapted from https://github.com/TAO-Dataset/
:param annotations: the annotation set
:return: the number of updated IDs
"""
track_id_videos = {}
track_ids_to_update = set()
max_track_id = 0
for ann in annotations:
t = ann["track_id"]
if t not in track_id_videos:
track_id_videos[t] = ann["video_id"]
if ann["video_id"] != track_id_videos[t]:
# Track id is assigned to multiple videos
track_ids_to_update.add(t)
max_track_id = max(max_track_id, t)
if track_ids_to_update:
print("true")
next_id = itertools.count(max_track_id + 1)
new_track_ids = defaultdict(lambda: next(next_id))
for ann in annotations:
t = ann["track_id"]
v = ann["video_id"]
if t in track_ids_to_update:
ann["track_id"] = new_track_ids[t, v]
return len(track_ids_to_update)
def _split_known_unknown_distractor(self):
all_ids = set(
[i for i in range(1, 2000)]
) # 2000 is larger than the max category id in TAO-OW.
# `knowns` includes 78 TAO_category_ids that corresponds to 78 COCO classes.
# (The other 2 COCO classes do not have corresponding classes in TAO).
self.knowns = {
4,
13,
1038,
544,
1057,
34,
35,
36,
41,
45,
58,
60,
579,
1091,
1097,
1099,
78,
79,
81,
91,
1115,
1117,
95,
1122,
99,
1132,
621,
1135,
625,
118,
1144,
126,
642,
1155,
133,
1162,
139,
154,
174,
185,
699,
1215,
714,
717,
1229,
211,
729,
221,
229,
747,
235,
237,
779,
276,
805,
299,
829,
852,
347,
371,
382,
896,
392,
926,
937,
428,
429,
961,
452,
979,
980,
982,
475,
480,
993,
1001,
502,
1018,
}
# `distractors` is defined as in the paper "Opening up Open-World Tracking"
self.distractors = {
20,
63,
108,
180,
188,
204,
212,
247,
303,
403,
407,
415,
490,
504,
507,
513,
529,
567,
569,
588,
672,
691,
702,
708,
711,
720,
736,
737,
798,
813,
815,
827,
831,
851,
877,
883,
912,
971,
976,
1130,
1133,
1134,
1169,
1184,
1220,
}
self.unknowns = all_ids.difference(self.knowns.union(self.distractors))
def _filter_gt_data(self, raw_gt_data):
"""
Filter out irrelevant data in the raw_gt_data
Args:
raw_gt_data: directly loaded from json.
Returns:
filtered gt_data
"""
valid_cat_ids = list()
if self.subset == "known":
valid_cat_ids = self.knowns
elif self.subset == "distractor":
valid_cat_ids = self.distractors
elif self.subset == "unknown":
valid_cat_ids = self.unknowns
# elif self.subset == "test_only_unknowns":
# valid_cat_ids = test_only_unknowns
else:
raise Exception("The parameter `SUBSET` is incorrect")
filtered = dict()
filtered["videos"] = raw_gt_data["videos"]
# filtered["videos"] = list()
unwanted_vid = set()
# for video in raw_gt_data["videos"]:
# datasrc = video["name"].split('/')[1]
# if datasrc in data_srcs:
# filtered["videos"].append(video)
# else:
# unwanted_vid.add(video["id"])
filtered["annotations"] = list()
for ann in raw_gt_data["annotations"]:
if (ann["video_id"] not in unwanted_vid) and (
ann["category_id"] in valid_cat_ids
):
filtered["annotations"].append(ann)
filtered["tracks"] = list()
for track in raw_gt_data["tracks"]:
if (track["video_id"] not in unwanted_vid) and (
track["category_id"] in valid_cat_ids
):
filtered["tracks"].append(track)
filtered["images"] = list()
for image in raw_gt_data["images"]:
if image["video_id"] not in unwanted_vid:
filtered["images"].append(image)
filtered["categories"] = list()
for cat in raw_gt_data["categories"]:
if cat["id"] in valid_cat_ids:
filtered["categories"].append(cat)
filtered["info"] = raw_gt_data["info"]
filtered["licenses"] = raw_gt_data["licenses"]
return filtered

View File

@@ -0,0 +1,524 @@
# flake8: noqa
# note: this file has been modified from its original version in TrackEval in
# https://github.com/JonathonLuiten/TrackEval/blob/master/trackeval/datasets/youtube_vis.py
# to support the following:
# 1) bbox evaluation (via `IOU_TYPE`)
# 2) passing GT and prediction data as Python objects (via `GT_JSON_OBJECT` and `TRACKER_JSON_OBJECT`)
# 3) specifying a custom dataset name (via `DATASET_NAME`)
import json
import os
import numpy as np
from .. import _timing, utils
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class YouTubeVIS(_BaseDataset):
"""Dataset class for YouTubeVIS tracking"""
@staticmethod
def get_default_dataset_config():
"""Default class config values"""
code_path = utils.get_code_path()
default_config = {
"GT_FOLDER": os.path.join(
code_path, "data/gt/youtube_vis/"
), # Location of GT data
"TRACKERS_FOLDER": os.path.join(code_path, "data/trackers/youtube_vis/"),
# Trackers location
"OUTPUT_FOLDER": None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
"TRACKERS_TO_EVAL": None, # Filenames of trackers to eval (if None, all in folder)
"CLASSES_TO_EVAL": None, # Classes to eval (if None, all classes)
"SPLIT_TO_EVAL": "train_sub_split", # Valid: 'train', 'val', 'train_sub_split'
"PRINT_CONFIG": True, # Whether to print current config
"OUTPUT_SUB_FOLDER": "", # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
"TRACKER_SUB_FOLDER": "data", # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
"TRACKER_DISPLAY_NAMES": None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
# Added for video phrase AP evaluation -- allow directly specifying the GT JSON data and Tracker (result)
# JSON data as Python objects, without reading from files.
"GT_JSON_OBJECT": None,
"TRACKER_JSON_OBJECT": None,
"IOU_TYPE": "segm",
"DATASET_NAME": "video",
}
return default_config
def __init__(self, config=None):
"""Initialise dataset, checking that all required files are present"""
super().__init__()
# Fill non-given config values with defaults
self.config = utils.init_config(config, self.get_default_dataset_config())
self.gt_fol = (
self.config["GT_FOLDER"] + "youtube_vis_" + self.config["SPLIT_TO_EVAL"]
)
self.tracker_fol = (
self.config["TRACKERS_FOLDER"]
+ "youtube_vis_"
+ self.config["SPLIT_TO_EVAL"]
)
self.use_super_categories = False
self.should_classes_combine = True
assert self.config["IOU_TYPE"] in ["segm", "bbox"]
self.iou_type = self.config["IOU_TYPE"]
print("=" * 100)
print(f"Evaluate annotation type *{self.iou_type}*")
self.dataset_name = self.config["DATASET_NAME"]
self.output_fol = self.config["OUTPUT_FOLDER"]
if self.output_fol is None:
self.output_fol = self.tracker_fol
self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
if self.config["GT_JSON_OBJECT"] is not None:
# allow directly specifying the GT JSON data without reading from files
gt_json = self.config["GT_JSON_OBJECT"]
assert isinstance(gt_json, dict)
assert "videos" in gt_json
assert "categories" in gt_json
assert "annotations" in gt_json
self.gt_data = gt_json
else:
if not os.path.exists(self.gt_fol):
print("GT folder not found: " + self.gt_fol)
raise TrackEvalException(
"GT folder not found: " + os.path.basename(self.gt_fol)
)
gt_dir_files = [
file for file in os.listdir(self.gt_fol) if file.endswith(".json")
]
if len(gt_dir_files) != 1:
raise TrackEvalException(
self.gt_fol + " does not contain exactly one json file."
)
with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
self.gt_data = json.load(f)
# Get classes to eval
self.valid_classes = [cls["name"] for cls in self.gt_data["categories"]]
cls_name_to_cls_id_map = {
cls["name"]: cls["id"] for cls in self.gt_data["categories"]
}
if self.config["CLASSES_TO_EVAL"]:
self.class_list = [
cls.lower() if cls.lower() in self.valid_classes else None
for cls in self.config["CLASSES_TO_EVAL"]
]
if not all(self.class_list):
raise TrackEvalException(
"Attempted to evaluate an invalid class. Only classes "
+ ", ".join(self.valid_classes)
+ " are valid."
)
else:
self.class_list = [cls["name"] for cls in self.gt_data["categories"]]
self.class_name_to_class_id = {
k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list
}
# Get sequences to eval and check gt files exist
self.seq_list = [
vid["file_names"][0].split("/")[0] for vid in self.gt_data["videos"]
]
self.seq_name_to_seq_id = {
vid["file_names"][0].split("/")[0]: vid["id"]
for vid in self.gt_data["videos"]
}
self.seq_lengths = {
vid["id"]: len(vid["file_names"]) for vid in self.gt_data["videos"]
}
# encode masks and compute track areas
self._prepare_gt_annotations()
# Get trackers to eval
if self.config["TRACKER_JSON_OBJECT"] is not None:
# allow directly specifying the tracker JSON data without reading from files
tracker_json = self.config["TRACKER_JSON_OBJECT"]
assert isinstance(tracker_json, list)
self.tracker_list = ["tracker"]
elif self.config["TRACKERS_TO_EVAL"] is None:
self.tracker_list = os.listdir(self.tracker_fol)
else:
self.tracker_list = self.config["TRACKERS_TO_EVAL"]
if self.config["TRACKER_DISPLAY_NAMES"] is None:
self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
):
self.tracker_to_disp = dict(
zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
)
else:
raise TrackEvalException(
"List of tracker files and tracker display names do not match."
)
# counter for globally unique track IDs
self.global_tid_counter = 0
self.tracker_data = dict()
if self.config["TRACKER_JSON_OBJECT"] is not None:
# allow directly specifying the tracker JSON data without reading from files
tracker = self.tracker_list[0]
self.tracker_data[tracker] = tracker_json
else:
for tracker in self.tracker_list:
tracker_dir_path = os.path.join(
self.tracker_fol, tracker, self.tracker_sub_fol
)
tr_dir_files = [
file
for file in os.listdir(tracker_dir_path)
if file.endswith(".json")
]
if len(tr_dir_files) != 1:
raise TrackEvalException(
tracker_dir_path + " does not contain exactly one json file."
)
with open(os.path.join(tracker_dir_path, tr_dir_files[0])) as f:
curr_data = json.load(f)
self.tracker_data[tracker] = curr_data
def get_display_name(self, tracker):
return self.tracker_to_disp[tracker]
def _load_raw_file(self, tracker, seq, is_gt):
"""Load a file (gt or tracker) in the YouTubeVIS format
If is_gt, this returns a dict which contains the fields:
[gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
[gt_dets]: list (for each timestep) of lists of detections.
[classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
keys and corresponding segmentations as values) for each track
[classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_iscrowd]: dictionary with class values
as keys and lists (for each track) as values
if not is_gt, this returns a dict which contains the fields:
[tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
[tracker_dets]: list (for each timestep) of lists of detections.
[classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
keys and corresponding segmentations as values) for each track
[classes_to_dt_track_ids, classes_to_dt_track_areas]: dictionary with class values as keys and lists as values
[classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
"""
# select sequence tracks
seq_id = self.seq_name_to_seq_id[seq]
if is_gt:
tracks = [
ann for ann in self.gt_data["annotations"] if ann["video_id"] == seq_id
]
else:
tracks = self._get_tracker_seq_tracks(tracker, seq_id)
# Convert data to required format
num_timesteps = self.seq_lengths[seq_id]
data_keys = ["ids", "classes", "dets"]
if not is_gt:
data_keys += ["tracker_confidences"]
raw_data = {key: [None] * num_timesteps for key in data_keys}
result_key = "segmentations" if self.iou_type == "segm" else "bboxes"
for t in range(num_timesteps):
raw_data["dets"][t] = [
track[result_key][t] for track in tracks if track[result_key][t]
]
raw_data["ids"][t] = np.atleast_1d(
[track["id"] for track in tracks if track[result_key][t]]
).astype(int)
raw_data["classes"][t] = np.atleast_1d(
[track["category_id"] for track in tracks if track[result_key][t]]
).astype(int)
if not is_gt:
raw_data["tracker_confidences"][t] = np.atleast_1d(
[track["score"] for track in tracks if track[result_key][t]]
).astype(float)
if is_gt:
key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
else:
key_map = {
"ids": "tracker_ids",
"classes": "tracker_classes",
"dets": "tracker_dets",
}
for k, v in key_map.items():
raw_data[v] = raw_data.pop(k)
all_cls_ids = {self.class_name_to_class_id[cls] for cls in self.class_list}
classes_to_tracks = {
cls: [track for track in tracks if track["category_id"] == cls]
for cls in all_cls_ids
}
# mapping from classes to track representations and track information
raw_data["classes_to_tracks"] = {
cls: [
{i: track[result_key][i] for i in range(len(track[result_key]))}
for track in tracks
]
for cls, tracks in classes_to_tracks.items()
}
raw_data["classes_to_track_ids"] = {
cls: [track["id"] for track in tracks]
for cls, tracks in classes_to_tracks.items()
}
raw_data["classes_to_track_areas"] = {
cls: [track["area"] for track in tracks]
for cls, tracks in classes_to_tracks.items()
}
if is_gt:
raw_data["classes_to_gt_track_iscrowd"] = {
cls: [track["iscrowd"] for track in tracks]
for cls, tracks in classes_to_tracks.items()
}
else:
raw_data["classes_to_dt_track_scores"] = {
cls: np.array([track["score"] for track in tracks])
for cls, tracks in classes_to_tracks.items()
}
if is_gt:
key_map = {
"classes_to_tracks": "classes_to_gt_tracks",
"classes_to_track_ids": "classes_to_gt_track_ids",
"classes_to_track_areas": "classes_to_gt_track_areas",
}
else:
key_map = {
"classes_to_tracks": "classes_to_dt_tracks",
"classes_to_track_ids": "classes_to_dt_track_ids",
"classes_to_track_areas": "classes_to_dt_track_areas",
}
for k, v in key_map.items():
raw_data[v] = raw_data.pop(k)
raw_data["num_timesteps"] = num_timesteps
raw_data["seq"] = seq
return raw_data
@_timing.time
def get_preprocessed_seq_data(self, raw_data, cls):
"""Preprocess data for a single sequence for a single class ready for evaluation.
Inputs:
- raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
- cls is the class to be evaluated.
Outputs:
- data is a dict containing all of the information that metrics need to perform evaluation.
It contains the following fields:
[num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
[gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
[gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
[similarity_scores]: list (for each timestep) of 2D NDArrays.
Notes:
General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
1) Extract only detections relevant for the class to be evaluated (including distractor detections).
2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
distractor class, or otherwise marked as to be removed.
3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
other criteria (e.g. are too small).
4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
After the above preprocessing steps, this function also calculates the number of gt and tracker detections
and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
unique within each timestep.
YouTubeVIS:
In YouTubeVIS, the 4 preproc steps are as follow:
1) There are 40 classes which are evaluated separately.
2) No matched tracker dets are removed.
3) No unmatched tracker dets are removed.
4) No gt dets are removed.
Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
and the tracks from the tracker data are sorted according to the tracker confidence.
"""
cls_id = self.class_name_to_class_id[cls]
data_keys = [
"gt_ids",
"tracker_ids",
"gt_dets",
"tracker_dets",
"similarity_scores",
]
data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
unique_gt_ids = []
unique_tracker_ids = []
num_gt_dets = 0
num_tracker_dets = 0
for t in range(raw_data["num_timesteps"]):
# Only extract relevant dets for this class for eval (cls)
gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
gt_class_mask = gt_class_mask.astype(bool)
gt_ids = raw_data["gt_ids"][t][gt_class_mask]
gt_dets = [
raw_data["gt_dets"][t][ind]
for ind in range(len(gt_class_mask))
if gt_class_mask[ind]
]
tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
tracker_class_mask = tracker_class_mask.astype(bool)
tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
tracker_dets = [
raw_data["tracker_dets"][t][ind]
for ind in range(len(tracker_class_mask))
if tracker_class_mask[ind]
]
similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
:, tracker_class_mask
]
data["tracker_ids"][t] = tracker_ids
data["tracker_dets"][t] = tracker_dets
data["gt_ids"][t] = gt_ids
data["gt_dets"][t] = gt_dets
data["similarity_scores"][t] = similarity_scores
unique_gt_ids += list(np.unique(data["gt_ids"][t]))
unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
num_tracker_dets += len(data["tracker_ids"][t])
num_gt_dets += len(data["gt_ids"][t])
# Re-label IDs such that there are no empty IDs
if len(unique_gt_ids) > 0:
unique_gt_ids = np.unique(unique_gt_ids)
gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
for t in range(raw_data["num_timesteps"]):
if len(data["gt_ids"][t]) > 0:
data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
if len(unique_tracker_ids) > 0:
unique_tracker_ids = np.unique(unique_tracker_ids)
tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
for t in range(raw_data["num_timesteps"]):
if len(data["tracker_ids"][t]) > 0:
data["tracker_ids"][t] = tracker_id_map[
data["tracker_ids"][t]
].astype(int)
# Ensure that ids are unique per timestep.
self._check_unique_ids(data)
# Record overview statistics.
data["num_tracker_dets"] = num_tracker_dets
data["num_gt_dets"] = num_gt_dets
data["num_tracker_ids"] = len(unique_tracker_ids)
data["num_gt_ids"] = len(unique_gt_ids)
data["num_timesteps"] = raw_data["num_timesteps"]
data["seq"] = raw_data["seq"]
# get track representations
data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
data["gt_track_iscrowd"] = raw_data["classes_to_gt_track_iscrowd"][cls_id]
data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
data["iou_type"] = "mask"
# sort tracker data tracks by tracker confidence scores
if data["dt_tracks"]:
idx = np.argsort(
[-score for score in data["dt_track_scores"]], kind="mergesort"
)
data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
return data
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
if self.iou_type == "segm":
similarity_scores = self._calculate_mask_ious(
gt_dets_t, tracker_dets_t, is_encoded=True, do_ioa=False
)
else:
gt_dets_t = np.array(gt_dets_t, dtype=np.float32).reshape(-1, 4)
tracker_dets_t = np.array(tracker_dets_t, dtype=np.float32).reshape(-1, 4)
similarity_scores = self._calculate_box_ious(
gt_dets_t, tracker_dets_t, box_format="xywh", do_ioa=False
)
return similarity_scores
def _prepare_gt_annotations(self):
"""
Prepares GT data by rle encoding segmentations and computing the average track area.
:return: None
"""
if self.iou_type == "segm":
# only loaded when needed to reduce minimum requirements
from pycocotools import mask as mask_utils
for track in self.gt_data["annotations"]:
h = track["height"]
w = track["width"]
for i, seg in enumerate(track["segmentations"]):
if seg is not None and isinstance(seg["counts"], list):
track["segmentations"][i] = mask_utils.frPyObjects(seg, h, w)
areas = [a for a in track["areas"] if a]
if len(areas) == 0:
track["area"] = 0
else:
track["area"] = np.array(areas).mean()
else:
for track in self.gt_data["annotations"]:
# For bbox eval, compute areas from bboxes if not already available
areas = [a for a in track.get("areas", []) if a]
if not areas:
areas = []
for bbox in track.get("bboxes", []):
if bbox is not None:
areas.append(bbox[2] * bbox[3])
track["area"] = np.array(areas).mean() if areas else 0
def _get_tracker_seq_tracks(self, tracker, seq_id):
"""
Prepares tracker data for a given sequence. Extracts all annotations for given sequence ID, computes
average track area and assigns a track ID.
:param tracker: the given tracker
:param seq_id: the sequence ID
:return: the extracted tracks
"""
# only loaded when needed to reduce minimum requirements
from pycocotools import mask as mask_utils
tracks = [
ann for ann in self.tracker_data[tracker] if ann["video_id"] == seq_id
]
for track in tracks:
if "areas" not in track:
if self.iou_type == "segm":
for seg in track["segmentations"]:
if seg:
track["areas"].append(mask_utils.area(seg))
else:
track["areas"].append(None)
else:
for bbox in track["bboxes"]:
if bbox:
track["areas"].append(bbox[2] * bbox[3])
else:
track["areas"].append(None)
areas = [a for a in track["areas"] if a]
if len(areas) == 0:
track["area"] = 0
else:
track["area"] = np.array(areas).mean()
track["id"] = self.global_tid_counter
self.global_tid_counter += 1
return tracks
    def get_name(self):
        """Return this dataset's name (used to key evaluation results)."""
        return self.dataset_name

View File

@@ -0,0 +1,395 @@
# flake8: noqa
import os
import time
import traceback
from functools import partial
from multiprocessing.pool import Pool
import numpy as np
from . import _timing, utils
from .metrics import Count
from .utils import TrackEvalException
try:
import tqdm
TQDM_IMPORTED = True
except ImportError as _:
TQDM_IMPORTED = False
class Evaluator:
    """Evaluator class for evaluating different metrics for different datasets"""
    @staticmethod
    def get_default_eval_config():
        """Returns the default config values for evaluation"""
        code_path = utils.get_code_path()
        default_config = {
            "USE_PARALLEL": False,
            "NUM_PARALLEL_CORES": 8,
            "BREAK_ON_ERROR": True,  # Raises exception and exits with error
            "RETURN_ON_ERROR": False,  # if not BREAK_ON_ERROR, then returns from function on error
            "LOG_ON_ERROR": os.path.join(
                code_path, "error_log.txt"
            ),  # if not None, save any errors into a log file.
            "PRINT_RESULTS": True,
            "PRINT_ONLY_COMBINED": False,
            "PRINT_CONFIG": True,
            "TIME_PROGRESS": True,
            "DISPLAY_LESS_PROGRESS": True,
            "OUTPUT_SUMMARY": True,
            "OUTPUT_EMPTY_CLASSES": True,  # If False, summary files are not output for classes with no detections
            "OUTPUT_DETAILED": True,
            "PLOT_CURVES": True,
        }
        return default_config
    def __init__(self, config=None):
        """Initialise the evaluator with a config file"""
        self.config = utils.init_config(config, self.get_default_eval_config(), "Eval")
        # Only run timing analysis if not run in parallel.
        if self.config["TIME_PROGRESS"] and not self.config["USE_PARALLEL"]:
            _timing.DO_TIMING = True
            if self.config["DISPLAY_LESS_PROGRESS"]:
                _timing.DISPLAY_LESS_PROGRESS = True
    def _combine_results(
        self,
        res,
        metrics_list,
        metric_names,
        dataset,
        res_field="COMBINED_SEQ",
        target_tag=None,
    ):
        """Aggregate per-sequence results into ``res[res_field]``.

        Combines over sequences for every class, then (when the dataset
        requests it) over classes and super-categories. If ``target_tag``
        is given, only sequences whose GT annotations all carry that tag
        contribute to the combination.

        :param res: nested results dict indexed res[seq][class][metric_name]
        :param metrics_list: metric objects used for combination
        :param metric_names: names parallel to ``metrics_list``
        :param dataset: dataset providing eval info and GT data
        :param res_field: key under which combined results are stored
        :param target_tag: optional tag restricting which sequences combine
        :return: (updated ``res``, list of combined-class keys added)
        """
        assert res_field.startswith("COMBINED_SEQ")
        # collecting combined cls keys (cls averaged, det averaged, super classes)
        tracker_list, seq_list, class_list = dataset.get_eval_info()
        combined_cls_keys = []
        res[res_field] = {}
        # narrow the target for evaluation
        if target_tag is not None:
            target_video_ids = [
                annot["video_id"]
                for annot in dataset.gt_data["annotations"]
                if target_tag in annot["tags"]
            ]
            # Map video id -> sequence name (first path component of frame 0).
            vid2name = {
                video["id"]: video["file_names"][0].split("/")[0]
                for video in dataset.gt_data["videos"]
            }
            target_video_ids = set(target_video_ids)
            target_video = [vid2name[video_id] for video_id in target_video_ids]
            if len(target_video) == 0:
                raise TrackEvalException(
                    "No sequences found with the tag %s" % target_tag
                )
            target_annotations = [
                annot
                for annot in dataset.gt_data["annotations"]
                if annot["video_id"] in target_video_ids
            ]
            assert all(target_tag in annot["tags"] for annot in target_annotations), (
                f"Not all annotations in the target sequences have the target tag {target_tag}. "
                "We currently only support a target tag at the sequence level, not at the annotation level."
            )
        else:
            target_video = seq_list
        # combine sequences for each class
        for c_cls in class_list:
            res[res_field][c_cls] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                curr_res = {
                    seq_key: seq_value[c_cls][metric_name]
                    for seq_key, seq_value in res.items()
                    if not seq_key.startswith("COMBINED_SEQ")
                    and seq_key in target_video
                }
                res[res_field][c_cls][metric_name] = metric.combine_sequences(curr_res)
        # combine classes
        if dataset.should_classes_combine:
            combined_cls_keys += [
                "cls_comb_cls_av",
                "cls_comb_det_av",
                "all",
            ]
            res[res_field]["cls_comb_cls_av"] = {}
            res[res_field]["cls_comb_det_av"] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                cls_res = {
                    cls_key: cls_value[metric_name]
                    for cls_key, cls_value in res[res_field].items()
                    if cls_key not in combined_cls_keys
                }
                res[res_field]["cls_comb_cls_av"][metric_name] = (
                    metric.combine_classes_class_averaged(cls_res)
                )
                res[res_field]["cls_comb_det_av"][metric_name] = (
                    metric.combine_classes_det_averaged(cls_res)
                )
        # combine classes to super classes
        if dataset.use_super_categories:
            for cat, sub_cats in dataset.super_categories.items():
                combined_cls_keys.append(cat)
                res[res_field][cat] = {}
                for metric, metric_name in zip(metrics_list, metric_names):
                    cat_res = {
                        cls_key: cls_value[metric_name]
                        for cls_key, cls_value in res[res_field].items()
                        if cls_key in sub_cats
                    }
                    res[res_field][cat][metric_name] = (
                        metric.combine_classes_det_averaged(cat_res)
                    )
        return res, combined_cls_keys
    def _summarize_results(
        self,
        res,
        tracker,
        metrics_list,
        metric_names,
        dataset,
        res_field,
        combined_cls_keys,
    ):
        """Print and write out results for one tracker.

        Depending on the config, prints per-class tables, writes summary
        and detailed files, and renders plots, for every class key present
        under ``res[res_field]`` (real classes plus combined-class keys).
        """
        config = self.config
        output_fol = dataset.get_output_fol(tracker)
        tracker_display_name = dataset.get_display_name(tracker)
        for c_cls in res[
            res_field
        ].keys():  # class_list + combined classes if calculated
            summaries = []
            details = []
            num_dets = res[res_field][c_cls]["Count"]["Dets"]
            if config["OUTPUT_EMPTY_CLASSES"] or num_dets > 0:
                for metric, metric_name in zip(metrics_list, metric_names):
                    # for combined classes there is no per sequence evaluation
                    if c_cls in combined_cls_keys:
                        table_res = {res_field: res[res_field][c_cls][metric_name]}
                    else:
                        table_res = {
                            seq_key: seq_value[c_cls][metric_name]
                            for seq_key, seq_value in res.items()
                        }
                    if config["PRINT_RESULTS"] and config["PRINT_ONLY_COMBINED"]:
                        # Suppress per-class tables when class-combined
                        # results exist and this is not a combined key.
                        dont_print = (
                            dataset.should_classes_combine
                            and c_cls not in combined_cls_keys
                        )
                        if not dont_print:
                            metric.print_table(
                                {res_field: table_res[res_field]},
                                tracker_display_name,
                                c_cls,
                                res_field,
                                res_field,
                            )
                    elif config["PRINT_RESULTS"]:
                        metric.print_table(
                            table_res, tracker_display_name, c_cls, res_field, res_field
                        )
                    if config["OUTPUT_SUMMARY"]:
                        summaries.append(metric.summary_results(table_res))
                    if config["OUTPUT_DETAILED"]:
                        details.append(metric.detailed_results(table_res))
                    if config["PLOT_CURVES"]:
                        metric.plot_single_tracker_results(
                            table_res,
                            tracker_display_name,
                            c_cls,
                            output_fol,
                        )
                if config["OUTPUT_SUMMARY"]:
                    utils.write_summary_results(summaries, c_cls, output_fol)
                if config["OUTPUT_DETAILED"]:
                    utils.write_detailed_results(details, c_cls, output_fol)
    @_timing.time
    def evaluate(self, dataset_list, metrics_list, show_progressbar=False):
        """Evaluate a set of metrics on a set of datasets"""
        config = self.config
        metrics_list = metrics_list + [Count()]  # Count metrics are always run
        metric_names = utils.validate_metrics_list(metrics_list)
        dataset_names = [dataset.get_name() for dataset in dataset_list]
        output_res = {}
        output_msg = {}
        for dataset, dataset_name in zip(dataset_list, dataset_names):
            # Get dataset info about what to evaluate
            output_res[dataset_name] = {}
            output_msg[dataset_name] = {}
            tracker_list, seq_list, class_list = dataset.get_eval_info()
            print(
                "\nEvaluating %i tracker(s) on %i sequence(s) for %i class(es) on %s dataset using the following "
                "metrics: %s\n"
                % (
                    len(tracker_list),
                    len(seq_list),
                    len(class_list),
                    dataset_name,
                    ", ".join(metric_names),
                )
            )
            # Evaluate each tracker
            for tracker in tracker_list:
                # if not config['BREAK_ON_ERROR'] then go to next tracker without breaking
                try:
                    # Evaluate each sequence in parallel or in series.
                    # returns a nested dict (res), indexed like: res[seq][class][metric_name][sub_metric field]
                    # e.g. res[seq_0001][pedestrian][hota][DetA]
                    print("\nEvaluating %s\n" % tracker)
                    time_start = time.time()
                    if config["USE_PARALLEL"]:
                        if show_progressbar and TQDM_IMPORTED:
                            seq_list_sorted = sorted(seq_list)
                            with Pool(config["NUM_PARALLEL_CORES"]) as pool, tqdm.tqdm(
                                total=len(seq_list)
                            ) as pbar:
                                _eval_sequence = partial(
                                    eval_sequence,
                                    dataset=dataset,
                                    tracker=tracker,
                                    class_list=class_list,
                                    metrics_list=metrics_list,
                                    metric_names=metric_names,
                                )
                                results = []
                                # imap preserves input order, so results align
                                # with seq_list_sorted for the zip below.
                                for r in pool.imap(
                                    _eval_sequence, seq_list_sorted, chunksize=20
                                ):
                                    results.append(r)
                                    pbar.update()
                                res = dict(zip(seq_list_sorted, results))
                        else:
                            with Pool(config["NUM_PARALLEL_CORES"]) as pool:
                                _eval_sequence = partial(
                                    eval_sequence,
                                    dataset=dataset,
                                    tracker=tracker,
                                    class_list=class_list,
                                    metrics_list=metrics_list,
                                    metric_names=metric_names,
                                )
                                results = pool.map(_eval_sequence, seq_list)
                                res = dict(zip(seq_list, results))
                    else:
                        res = {}
                        if show_progressbar and TQDM_IMPORTED:
                            seq_list_sorted = sorted(seq_list)
                            for curr_seq in tqdm.tqdm(seq_list_sorted):
                                res[curr_seq] = eval_sequence(
                                    curr_seq,
                                    dataset,
                                    tracker,
                                    class_list,
                                    metrics_list,
                                    metric_names,
                                )
                        else:
                            for curr_seq in sorted(seq_list):
                                res[curr_seq] = eval_sequence(
                                    curr_seq,
                                    dataset,
                                    tracker,
                                    class_list,
                                    metrics_list,
                                    metric_names,
                                )
                    # Combine results over all sequences and then over all classes
                    res, combined_cls_keys = self._combine_results(
                        res, metrics_list, metric_names, dataset, "COMBINED_SEQ"
                    )
                    if np.all(
                        ["tags" in annot for annot in dataset.gt_data["annotations"]]
                    ):
                        # Combine results over the challenging sequences and then over all classes
                        # currently only support "tracking_challenging_pair"
                        res, _ = self._combine_results(
                            res,
                            metrics_list,
                            metric_names,
                            dataset,
                            "COMBINED_SEQ_CHALLENGING",
                            "tracking_challenging_pair",
                        )
                    # Print and output results in various formats
                    if config["TIME_PROGRESS"]:
                        print(
                            "\nAll sequences for %s finished in %.2f seconds"
                            % (tracker, time.time() - time_start)
                        )
                    self._summarize_results(
                        res,
                        tracker,
                        metrics_list,
                        metric_names,
                        dataset,
                        "COMBINED_SEQ",
                        combined_cls_keys,
                    )
                    if "COMBINED_SEQ_CHALLENGING" in res:
                        self._summarize_results(
                            res,
                            tracker,
                            metrics_list,
                            metric_names,
                            dataset,
                            "COMBINED_SEQ_CHALLENGING",
                            combined_cls_keys,
                        )
                    # Output for returning from function
                    output_res[dataset_name][tracker] = res
                    output_msg[dataset_name][tracker] = "Success"
                except Exception as err:
                    output_res[dataset_name][tracker] = None
                    # NOTE(review): exact type check — subclasses of
                    # TrackEvalException would be reported as "Unknown error".
                    # Confirm whether isinstance() was intended.
                    if type(err) == TrackEvalException:
                        output_msg[dataset_name][tracker] = str(err)
                    else:
                        output_msg[dataset_name][tracker] = "Unknown error occurred."
                    print("Tracker %s was unable to be evaluated." % tracker)
                    print(err)
                    traceback.print_exc()
                    if config["LOG_ON_ERROR"] is not None:
                        with open(config["LOG_ON_ERROR"], "a") as f:
                            print(dataset_name, file=f)
                            print(tracker, file=f)
                            print(traceback.format_exc(), file=f)
                            print("\n\n\n", file=f)
                    if config["BREAK_ON_ERROR"]:
                        raise err
                    elif config["RETURN_ON_ERROR"]:
                        return output_res, output_msg
        return output_res, output_msg
@_timing.time
def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
    """Evaluate every requested metric on a single sequence.

    Returns a nested dict indexed results[class][metric_name].
    """
    raw_data = dataset.get_raw_seq_data(tracker, seq)
    results = {}
    for cls in class_list:
        # Preprocess once per class, then score with every metric.
        cls_data = dataset.get_preprocessed_seq_data(raw_data, cls)
        results[cls] = {
            met_name: metric.eval_sequence(cls_data)
            for metric, met_name in zip(metrics_list, metric_names)
        }
    return results

View File

@@ -0,0 +1,4 @@
# flake8: noqa
from .count import Count
from .hota import HOTA

View File

@@ -0,0 +1,145 @@
# flake8: noqa
from abc import ABC, abstractmethod
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseMetric(ABC):
    """Abstract base for all metric classes.

    Subclasses declare which fields they produce (integer/float, scalar/array)
    and implement per-sequence evaluation plus the three combination modes.
    The helpers below render those fields as tables, summaries and detailed
    per-sequence breakdowns.
    """
    @abstractmethod
    def __init__(self):
        # Field bookkeeping filled in by subclasses:
        self.plottable = False  # whether plot_single_tracker_results is valid
        self.integer_fields = []  # scalar int outputs
        self.float_fields = []  # scalar float outputs
        self.float_array_fields = []  # per-threshold float arrays
        self.array_labels = []  # labels (e.g. alpha thresholds) for array fields
        self.integer_array_fields = []  # per-threshold int arrays
        self.fields = []  # all produced field names
        self.summary_fields = []  # subset shown in summary tables
        self.registered = False
    #####################################################################
    # Abstract functions for subclasses to implement
    @_timing.time
    @abstractmethod
    def eval_sequence(self, data): ...
    @abstractmethod
    def combine_sequences(self, all_res): ...
    @abstractmethod
    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): ...
    @abstractmethod
    def combine_classes_det_averaged(self, all_res): ...
    # NOTE(review): parameter order here is (output_folder, cls) but callers
    # in the evaluator pass (cls, output_folder); harmless because this base
    # implementation ignores both — confirm before adding a plottable metric
    # that relies on the base signature.
    def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
        """Plot results of metrics, only valid for metrics with self.plottable"""
        if self.plottable:
            raise NotImplementedError(
                "plot_results is not implemented for metric %s" % self.get_name()
            )
        else:
            pass
    #####################################################################
    # Helper functions which are useful for all metrics:
    @classmethod
    def get_name(cls):
        """Return the metric's display name (its class name)."""
        return cls.__name__
    @staticmethod
    def _combine_sum(all_res, field):
        """Combine sequence results via sum"""
        return sum([all_res[k][field] for k in all_res.keys()])
    @staticmethod
    def _combine_weighted_av(all_res, field, comb_res, weight_field):
        """Combine sequence results via weighted average"""
        # comb_res must already contain the summed weight_field.
        return sum(
            [all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]
        ) / np.maximum(1.0, comb_res[weight_field])
    # NOTE(review): "output_lable" is a typo for "output_label"; kept as-is
    # since renaming would break keyword callers.
    def print_table(
        self, table_res, tracker, cls, res_field="COMBINED_SEQ", output_lable="COMBINED"
    ):
        """Prints table of results for all sequences"""
        print("")
        metric_name = self.get_name()
        self._row_print(
            [metric_name + ": " + tracker + "-" + cls] + self.summary_fields
        )
        for seq, results in sorted(table_res.items()):
            if seq.startswith("COMBINED_SEQ"):
                continue
            summary_res = self._summary_row(results)
            self._row_print([seq] + summary_res)
        # Combined row printed last, under the given label.
        summary_res = self._summary_row(table_res[res_field])
        self._row_print([output_lable] + summary_res)
    def _summary_row(self, results_):
        """Format one result dict into summary-table cell strings.

        Array fields are averaged over thresholds; floats are shown as
        percentages.
        """
        vals = []
        for h in self.summary_fields:
            if h in self.float_array_fields:
                vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
            elif h in self.float_fields:
                vals.append("{0:1.5g}".format(100 * float(results_[h])))
            elif h in self.integer_fields:
                vals.append("{0:d}".format(int(results_[h])))
            else:
                raise NotImplementedError(
                    "Summary function not implemented for this field type."
                )
        return vals
    @staticmethod
    def _row_print(*argv):
        """Prints results in evenly spaced rows, with more space in first column"""
        if len(argv) == 1:
            argv = argv[0]
        to_print = "%-35s" % argv[0]
        for v in argv[1:]:
            to_print += "%-10s" % str(v)
        print(to_print)
    def summary_results(self, table_res):
        """Returns a simple summary of final results for a tracker"""
        return dict(
            zip(self.summary_fields, self._summary_row(table_res["COMBINED_SEQ"]))
        )
    def detailed_results(self, table_res):
        """Returns detailed final results for a tracker"""
        # Get detailed field information
        detailed_fields = self.float_fields + self.integer_fields
        for h in self.float_array_fields + self.integer_array_fields:
            # One column per threshold, plus an area-under-curve column.
            for alpha in [int(100 * x) for x in self.array_labels]:
                detailed_fields.append(h + "___" + str(alpha))
            detailed_fields.append(h + "___AUC")
        # Get detailed results
        detailed_results = {}
        for seq, res in table_res.items():
            detailed_row = self._detailed_row(res)
            if len(detailed_row) != len(detailed_fields):
                raise TrackEvalException(
                    "Field names and data have different sizes (%i and %i)"
                    % (len(detailed_row), len(detailed_fields))
                )
            detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
        return detailed_results
    def _detailed_row(self, res):
        """Flatten one result dict into the row layout of detailed_results."""
        detailed_row = []
        for h in self.float_fields + self.integer_fields:
            detailed_row.append(res[h])
        for h in self.float_array_fields + self.integer_array_fields:
            for i, alpha in enumerate([int(100 * x) for x in self.array_labels]):
                detailed_row.append(res[h][i])
            # AUC column: mean over thresholds.
            detailed_row.append(np.mean(res[h]))
        return detailed_row

View File

@@ -0,0 +1,48 @@
# flake8: noqa
from .. import _timing
from ._base_metric import _BaseMetric
class Count(_BaseMetric):
    """Class which simply counts the number of tracker and gt detections and ids."""

    def __init__(self, config=None):
        super().__init__()
        # All Count outputs are plain integer totals.
        self.integer_fields = ["Dets", "GT_Dets", "IDs", "GT_IDs"]
        self.fields = self.integer_fields
        self.summary_fields = self.fields

    @_timing.time
    def eval_sequence(self, data):
        """Returns counts for one sequence"""
        return {
            "Dets": data["num_tracker_dets"],
            "GT_Dets": data["num_gt_dets"],
            "IDs": data["num_tracker_ids"],
            "GT_IDs": data["num_gt_ids"],
            "Frames": data["num_timesteps"],
        }

    def combine_sequences(self, all_res):
        """Combines metrics across all sequences (counts simply add up)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=None):
        """Combines metrics across all classes (counts are summed, not averaged)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

    def combine_classes_det_averaged(self, all_res):
        """Combines metrics across all classes (counts are summed)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

View File

@@ -0,0 +1,291 @@
# flake8: noqa
import os
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing
from ._base_metric import _BaseMetric
class HOTA(_BaseMetric):
    """Class which implements the HOTA metrics.
    See: https://link.springer.com/article/10.1007/s11263-020-01375-2
    """
    def __init__(self, config=None):
        super().__init__()
        self.plottable = True
        # Localization (IoU) thresholds alpha: 0.05, 0.10, ..., 0.95.
        self.array_labels = np.arange(0.05, 0.99, 0.05)
        # Raw per-threshold counts.
        self.integer_array_fields = ["HOTA_TP", "HOTA_FN", "HOTA_FP"]
        # Derived per-threshold scores.
        self.float_array_fields = [
            "HOTA",
            "DetA",
            "AssA",
            "DetRe",
            "DetPr",
            "AssRe",
            "AssPr",
            "LocA",
            "OWTA",
        ]
        # Scalar scores reported at the lowest threshold (alpha = 0.05).
        self.float_fields = ["HOTA(0)", "LocA(0)", "HOTALocA(0)"]
        self.fields = (
            self.float_array_fields + self.integer_array_fields + self.float_fields
        )
        self.summary_fields = self.float_array_fields + self.float_fields
    @_timing.time
    def eval_sequence(self, data):
        """Calculates the HOTA metrics for one sequence"""
        # Initialise results
        res = {}
        for field in self.float_array_fields + self.integer_array_fields:
            res[field] = np.zeros((len(self.array_labels)), dtype=float)
        for field in self.float_fields:
            res[field] = 0
        # Return result quickly if tracker or gt sequence is empty
        if data["num_tracker_dets"] == 0:
            res["HOTA_FN"] = data["num_gt_dets"] * np.ones(
                (len(self.array_labels)), dtype=float
            )
            res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
            res["LocA(0)"] = 1.0
            return res
        if data["num_gt_dets"] == 0:
            res["HOTA_FP"] = data["num_tracker_dets"] * np.ones(
                (len(self.array_labels)), dtype=float
            )
            res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
            res["LocA(0)"] = 1.0
            return res
        # Variables counting global association
        potential_matches_count = np.zeros(
            (data["num_gt_ids"], data["num_tracker_ids"])
        )
        gt_id_count = np.zeros((data["num_gt_ids"], 1))
        tracker_id_count = np.zeros((1, data["num_tracker_ids"]))
        # First loop through each timestep and accumulate global track information.
        for t, (gt_ids_t, tracker_ids_t) in enumerate(
            zip(data["gt_ids"], data["tracker_ids"])
        ):
            # Count the potential matches between ids in each timestep
            # These are normalised, weighted by the match similarity.
            similarity = data["similarity_scores"][t]
            sim_iou_denom = (
                similarity.sum(0)[np.newaxis, :]
                + similarity.sum(1)[:, np.newaxis]
                - similarity
            )
            sim_iou = np.zeros_like(similarity)
            sim_iou_mask = sim_iou_denom > 0 + np.finfo("float").eps
            sim_iou[sim_iou_mask] = (
                similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]
            )
            potential_matches_count[
                gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
            ] += sim_iou
            # Calculate the total number of dets for each gt_id and tracker_id.
            gt_id_count[gt_ids_t] += 1
            tracker_id_count[0, tracker_ids_t] += 1
        # Calculate overall jaccard alignment score (before unique matching) between IDs
        global_alignment_score = potential_matches_count / (
            gt_id_count + tracker_id_count - potential_matches_count
        )
        # Per-threshold (gt_id, tracker_id) match counts, filled below.
        matches_counts = [
            np.zeros_like(potential_matches_count) for _ in self.array_labels
        ]
        # Calculate scores for each timestep
        for t, (gt_ids_t, tracker_ids_t) in enumerate(
            zip(data["gt_ids"], data["tracker_ids"])
        ):
            # Deal with the case that there are no gt_det/tracker_det in a timestep.
            if len(gt_ids_t) == 0:
                for a, alpha in enumerate(self.array_labels):
                    res["HOTA_FP"][a] += len(tracker_ids_t)
                continue
            if len(tracker_ids_t) == 0:
                for a, alpha in enumerate(self.array_labels):
                    res["HOTA_FN"][a] += len(gt_ids_t)
                continue
            # Get matching scores between pairs of dets for optimizing HOTA
            # (detection similarity weighted by global track alignment).
            similarity = data["similarity_scores"][t]
            score_mat = (
                global_alignment_score[
                    gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
                ]
                * similarity
            )
            # Hungarian algorithm to find best matches
            match_rows, match_cols = linear_sum_assignment(-score_mat)
            # Calculate and accumulate basic statistics
            for a, alpha in enumerate(self.array_labels):
                # Only matches whose raw similarity clears alpha count as TP.
                actually_matched_mask = (
                    similarity[match_rows, match_cols] >= alpha - np.finfo("float").eps
                )
                alpha_match_rows = match_rows[actually_matched_mask]
                alpha_match_cols = match_cols[actually_matched_mask]
                num_matches = len(alpha_match_rows)
                res["HOTA_TP"][a] += num_matches
                res["HOTA_FN"][a] += len(gt_ids_t) - num_matches
                res["HOTA_FP"][a] += len(tracker_ids_t) - num_matches
                if num_matches > 0:
                    res["LocA"][a] += sum(
                        similarity[alpha_match_rows, alpha_match_cols]
                    )
                    matches_counts[a][
                        gt_ids_t[alpha_match_rows], tracker_ids_t[alpha_match_cols]
                    ] += 1
        # Calculate association scores (AssA, AssRe, AssPr) for the alpha value.
        # First calculate scores per gt_id/tracker_id combo and then average over the number of detections.
        for a, alpha in enumerate(self.array_labels):
            matches_count = matches_counts[a]
            ass_a = matches_count / np.maximum(
                1, gt_id_count + tracker_id_count - matches_count
            )
            res["AssA"][a] = np.sum(matches_count * ass_a) / np.maximum(
                1, res["HOTA_TP"][a]
            )
            ass_re = matches_count / np.maximum(1, gt_id_count)
            res["AssRe"][a] = np.sum(matches_count * ass_re) / np.maximum(
                1, res["HOTA_TP"][a]
            )
            ass_pr = matches_count / np.maximum(1, tracker_id_count)
            res["AssPr"][a] = np.sum(matches_count * ass_pr) / np.maximum(
                1, res["HOTA_TP"][a]
            )
        # Calculate final scores
        # LocA is the mean similarity over TPs (guarded against zero TPs).
        res["LocA"] = np.maximum(1e-10, res["LocA"]) / np.maximum(1e-10, res["HOTA_TP"])
        res = self._compute_final_fields(res)
        return res
    def combine_sequences(self, all_res):
        """Combines metrics across all sequences"""
        res = {}
        for field in self.integer_array_fields:
            res[field] = self._combine_sum(all_res, field)
        # Association scores are TP-weighted averages across sequences.
        for field in ["AssRe", "AssPr", "AssA"]:
            res[field] = self._combine_weighted_av(
                all_res, field, res, weight_field="HOTA_TP"
            )
        loca_weighted_sum = sum(
            [all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
        )
        res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
            1e-10, res["HOTA_TP"]
        )
        res = self._compute_final_fields(res)
        return res
    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
        """Combines metrics across all classes by averaging over the class values.
        If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
        """
        res = {}
        for field in self.integer_array_fields:
            if ignore_empty_classes:
                res[field] = self._combine_sum(
                    {
                        k: v
                        for k, v in all_res.items()
                        if (
                            v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
                            > 0 + np.finfo("float").eps
                        ).any()
                    },
                    field,
                )
            else:
                res[field] = self._combine_sum(
                    {k: v for k, v in all_res.items()}, field
                )
        for field in self.float_fields + self.float_array_fields:
            if ignore_empty_classes:
                res[field] = np.mean(
                    [
                        v[field]
                        for v in all_res.values()
                        if (
                            v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
                            > 0 + np.finfo("float").eps
                        ).any()
                    ],
                    axis=0,
                )
            else:
                res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
        return res
    def combine_classes_det_averaged(self, all_res):
        """Combines metrics across all classes by averaging over the detection values"""
        res = {}
        for field in self.integer_array_fields:
            res[field] = self._combine_sum(all_res, field)
        for field in ["AssRe", "AssPr", "AssA"]:
            res[field] = self._combine_weighted_av(
                all_res, field, res, weight_field="HOTA_TP"
            )
        loca_weighted_sum = sum(
            [all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
        )
        res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
            1e-10, res["HOTA_TP"]
        )
        res = self._compute_final_fields(res)
        return res
    @staticmethod
    def _compute_final_fields(res):
        """Calculate sub-metric ('field') values which only depend on other sub-metric values.
        This function is used both for both per-sequence calculation, and in combining values across sequences.
        """
        res["DetRe"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FN"])
        res["DetPr"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FP"])
        res["DetA"] = res["HOTA_TP"] / np.maximum(
            1, res["HOTA_TP"] + res["HOTA_FN"] + res["HOTA_FP"]
        )
        res["HOTA"] = np.sqrt(res["DetA"] * res["AssA"])
        res["OWTA"] = np.sqrt(res["DetRe"] * res["AssA"])
        # "(0)" fields report the first (lowest-alpha) threshold.
        res["HOTA(0)"] = res["HOTA"][0]
        res["LocA(0)"] = res["LocA"][0]
        res["HOTALocA(0)"] = res["HOTA(0)"] * res["LocA(0)"]
        return res
    def plot_single_tracker_results(self, table_res, tracker, cls, output_folder):
        """Create plot of results"""
        # Only loaded when run to reduce minimum requirements
        from matplotlib import pyplot as plt
        res = table_res["COMBINED_SEQ"]
        styles_to_plot = ["r", "b", "g", "b--", "b:", "g--", "g:", "m"]
        for name, style in zip(self.float_array_fields, styles_to_plot):
            plt.plot(self.array_labels, res[name], style)
        plt.xlabel("alpha")
        plt.ylabel("score")
        plt.title(tracker + " - " + cls)
        plt.axis([0, 1, 0, 1])
        legend = []
        for name in self.float_array_fields:
            legend += [name + " (" + str(np.round(np.mean(res[name]), 2)) + ")"]
        plt.legend(legend, loc="lower left")
        out_file = os.path.join(output_folder, cls + "_plot.pdf")
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        plt.savefig(out_file)
        plt.savefig(out_file.replace(".pdf", ".png"))
        plt.clf()

View File

@@ -0,0 +1,195 @@
# flake8: noqa
import argparse
import csv
import os
from collections import OrderedDict
def init_config(config, default_config, name=None):
    """Initialise non-given config values with defaults"""
    if config is None:
        config = default_config
    else:
        # Fill in any missing keys from the defaults (user-given values win).
        for key, default_value in default_config.items():
            config.setdefault(key, default_value)
    if name and config["PRINT_CONFIG"]:
        print("\n%s Config:" % name)
        for key, value in config.items():
            print("%-20s : %-30s" % (key, value))
    return config
def update_config(config):
    """
    Parse the arguments of a script and updates the config values for a given value if specified in the arguments.
    :param config: the config to update
    :return: the updated config
    :raises Exception: if a boolean option is given a value other than "True"/"False"
    """
    parser = argparse.ArgumentParser()
    for setting, value in config.items():
        # List-valued (and unset) options may take several values on the CLI.
        if isinstance(value, list) or value is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args().__dict__
    for setting, raw in args.items():
        if raw is None:
            # Option not given on the command line; keep the config value.
            continue
        current = config[setting]
        # Coerce the CLI string to the type of the existing config value.
        # bool is checked before int since isinstance(True, int) is True.
        if isinstance(current, bool):
            if raw == "True":
                parsed = True
            elif raw == "False":
                parsed = False
            else:
                # Fixed missing space in the original error message.
                raise Exception(
                    "Command line parameter " + setting + " must be True or False"
                )
        elif isinstance(current, int):
            parsed = int(raw)
        else:
            parsed = raw
        config[setting] = parsed
    return config
def get_code_path():
    """Get base path where code is"""
    # Parent directory of the package that contains this module.
    here = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(here, os.pardir))
def validate_metrics_list(metrics_list):
    """Get names of metric class and ensures they are unique, further checks that the fields within each metric class
    do not have overlapping names.
    """
    metric_names = [metric.get_name() for metric in metrics_list]
    # Duplicate metric classes would collide in the results dicts.
    if len(set(metric_names)) != len(metric_names):
        raise TrackEvalException(
            "Code being run with multiple metrics of the same name"
        )
    all_fields = [field for metric in metrics_list for field in metric.fields]
    # Field names must be globally unique across metric families.
    if len(set(all_fields)) != len(all_fields):
        raise TrackEvalException(
            "Code being run with multiple metrics with fields of the same name"
        )
    return metric_names
def write_summary_results(summaries, cls, output_folder):
    """Write summary results to file"""
    # Merge all metric summaries into one mapping (later metrics may
    # overwrite earlier values for the same field, as before).
    merged = {}
    for summary in summaries:
        merged.update(summary)
    # In order to remain consistent upon new fields being added, any of the
    # following fields that are present are output first, in this order. Any
    # further fields follow in the order each metric family inserted them.
    default_order = [
        "HOTA",
        "DetA",
        "AssA",
        "DetRe",
        "DetPr",
        "AssRe",
        "AssPr",
        "LocA",
        "OWTA",
        "HOTA(0)",
        "LocA(0)",
        "HOTALocA(0)",
        "MOTA",
        "MOTP",
        "MODA",
        "CLR_Re",
        "CLR_Pr",
        "MTR",
        "PTR",
        "MLR",
        "CLR_TP",
        "CLR_FN",
        "CLR_FP",
        "IDSW",
        "MT",
        "PT",
        "ML",
        "Frag",
        "sMOTA",
        "IDF1",
        "IDR",
        "IDP",
        "IDTP",
        "IDFN",
        "IDFP",
        "Dets",
        "GT_Dets",
        "IDs",
        "GT_IDs",
    ]
    ordered = {field: merged.pop(field) for field in default_order if field in merged}
    # Remaining (non-default) fields keep their insertion order.
    ordered.update(merged)
    out_file = os.path.join(output_folder, cls + "_summary.txt")
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    with open(out_file, "w", newline="") as f:
        writer = csv.writer(f, delimiter=" ")
        writer.writerow(list(ordered.keys()))
        writer.writerow(list(ordered.values()))
def write_detailed_results(details, cls, output_folder):
    """Write detailed results to file"""
    sequences = details[0].keys()
    # Header row: "seq" followed by every metric family's detailed fields.
    fields = ["seq"]
    for detail in details:
        fields.extend(detail["COMBINED_SEQ"].keys())
    out_file = os.path.join(output_folder, cls + "_detailed.csv")
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    with open(out_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        for seq in sorted(sequences):
            if seq == "COMBINED_SEQ":
                continue
            row = [seq]
            for detail in details:
                row.extend(detail[seq].values())
            writer.writerow(row)
        # Combined-over-sequences row goes last.
        combined_row = ["COMBINED"]
        for detail in details:
            combined_row.extend(detail["COMBINED_SEQ"].values())
        writer.writerow(combined_row)
def load_detail(file):
    """Loads detailed data for a tracker."""
    data = {}
    with open(file) as f:
        keys = []
        for line_num, raw_line in enumerate(f):
            cells = raw_line.replace("\r", "").replace("\n", "").split(",")
            if line_num == 0:
                # First row is the header; first column is the sequence name.
                keys = cells[1:]
                continue
            seq = cells[0]
            values = cells[1:]
            if seq == "COMBINED":
                seq = "COMBINED_SEQ"
            # Skip malformed / blank rows.
            if seq != "" and len(values) == len(keys):
                data[seq] = {key: float(value) for key, value in zip(keys, values)}
    return data
class TrackEvalException(Exception):
    """Custom exception raised for expected, recoverable evaluation errors."""

648
sam3/eval/postprocessors.py Normal file
View File

@@ -0,0 +1,648 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
"""Postprocessors class to transform MDETR output according to the downstream task"""
import dataclasses
import logging
from collections import defaultdict
from typing import Dict, List, Optional
import numpy as np
import torch
from sam3.model import box_ops
from sam3.model.data_misc import BatchedInferenceMetadata, interpolate
from sam3.train.masks_ops import rle_encode, robust_rle_encode
from torch import nn
class PostProcessNullOp(nn.Module):
    """A pass-through postprocessor: returns the model's outputs unchanged.

    Useful as a drop-in when no postprocessing is desired.
    """

    def __init__(self, **kwargs):
        # Bug fix: the original called `super(PostProcessNullOp).__init__()`,
        # which builds an *unbound* super object and never runs
        # `nn.Module.__init__`, leaving the module uninitialized
        # (e.g. `_parameters`/`_buffers` missing, breaking state_dict()).
        super().__init__()

    def forward(self, input):
        """No-op forward; this module performs no computation."""
        pass

    def process_results(self, **kwargs):
        """Return the raw `find_stages` outputs untouched."""
        return kwargs["find_stages"]
class PostProcessImage(nn.Module):
    """This module converts the model's output into the format expected by the coco api"""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        to_cpu: bool = True,
        use_original_ids: bool = False,
        use_original_sizes_box: bool = False,
        use_original_sizes_mask: bool = False,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        use_presence: bool = True,
        detection_threshold: float = -1.0,
    ) -> None:
        """
        Args:
            max_dets_per_img: cap on detections kept per image in `process_results`
                (values <= 0 disable pruning).
            iou_type: "bbox" or "segm"; masks are only postprocessed for "segm".
            to_cpu: move output tensors to CPU after postprocessing.
            use_original_ids: use original image/category ids instead of COCO ids.
            use_original_sizes_box: scale boxes to the original image size
                (otherwise boxes stay normalized via a size of ones).
            use_original_sizes_mask: same as above but for masks.
            convert_mask_to_rle: encode output masks as RLE dicts.
            always_interpolate_masks_on_gpu: move masks to the GPU before
                interpolation (falls back to CPU on failure).
            use_presence: multiply query probabilities by the decoder presence score.
            detection_threshold: keep only predictions above this score
                (<= 0 disables filtering).
        """
        super().__init__()
        self.max_dets_per_img = max_dets_per_img
        self.iou_type = iou_type
        self.to_cpu = to_cpu
        self.convert_mask_to_rle = convert_mask_to_rle
        self.always_interpolate_masks_on_gpu = always_interpolate_masks_on_gpu
        self.use_presence = use_presence
        self.detection_threshold = detection_threshold
        self.use_original_ids = use_original_ids
        self.use_original_sizes_box = use_original_sizes_box
        self.use_original_sizes_mask = use_original_sizes_mask

    @torch.no_grad()
    def forward(
        self,
        outputs,
        target_sizes_boxes,
        target_sizes_masks,
        forced_labels=None,
        consistent=False,
        ret_tensordict: bool = False,  # This is experimental
    ):
        """Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes_boxes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                For evaluation, this must be the original image size (before any data augmentation)
                For visualization, this should be the image size after data augment, but before padding
            target_sizes_masks: same but used to resize masks
            forced_labels: tensor of dimension [batch_size] containing the label to force for each image of the batch
                This is useful when evaluating the model using standard metrics (eg on COCO, LVIS). In that case,
                we query the model with every possible class label, so we when we pass the predictions to the evaluator,
                we want to make sure that the predicted "class" matches the one that was queried.
            consistent: whether all target sizes are equal
            ret_tensordict: Experimental argument. If true, return a tensordict.TensorDict instead of a list of dictionaries for easier manipulation.
        """
        if ret_tensordict:
            assert (
                consistent is True
            ), "We don't support returning TensorDict if the outputs have different shapes"  # NOTE: It's possible but we don't support it.
            assert self.detection_threshold <= 0.0, "TODO: implement?"
            # tensordict is an optional dependency; degrade gracefully if missing.
            try:
                from tensordict import TensorDict
            except ImportError:
                logging.info(
                    "tensordict is not installed. Install by running `pip install tensordict --no-deps`. Falling back by setting `ret_tensordict=False`"
                )
                ret_tensordict = False
        out_bbox = outputs["pred_boxes"] if "pred_boxes" in outputs else None
        out_logits = outputs["pred_logits"]
        # Masks are only postprocessed when evaluating segmentation IoU.
        pred_masks = outputs["pred_masks"] if self.iou_type == "segm" else None
        out_probs = out_logits.sigmoid()
        if self.use_presence:
            # Modulate per-query probabilities by the image-level presence score.
            presence_score = outputs["presence_logit_dec"].sigmoid().unsqueeze(1)
            out_probs = out_probs * presence_score
        assert target_sizes_boxes.shape[1] == 2
        assert target_sizes_masks.shape[1] == 2
        batch_size = target_sizes_boxes.shape[0]
        boxes, scores, labels, keep = self._process_boxes_and_labels(
            target_sizes_boxes, forced_labels, out_bbox, out_probs
        )
        assert boxes is None or len(boxes) == batch_size
        out_masks = self._process_masks(
            target_sizes_masks, pred_masks, consistent=consistent, keep=keep
        )
        del pred_masks
        if boxes is None:
            # Mask-only outputs: pad boxes/scores/labels with None placeholders.
            assert out_masks is not None
            assert not ret_tensordict, "We don't support returning TensorDict if the output does not contain boxes"
            B = len(out_masks)
            boxes = [None] * B
            scores = [None] * B
            labels = [None] * B
        results = {
            "scores": scores,
            "labels": labels,
            "boxes": boxes,
        }
        if out_masks is not None:
            if self.convert_mask_to_rle:
                results.update(masks_rle=out_masks)
            else:
                results.update(masks=out_masks)
        if ret_tensordict:
            results = TensorDict(results).auto_batch_size_()
            if self.to_cpu:
                results = results.cpu()
        else:
            # Convert a dictionary of lists/tensors to a list of dictionaries
            results = [
                dict(zip(results.keys(), res_tuple))
                for res_tuple in zip(*results.values())
            ]
        return results

    def _process_masks(self, target_sizes, pred_masks, consistent=True, keep=None):
        """Interpolate predicted mask logits to `target_sizes` and binarize at 0.5.

        When `consistent` is True all images must share the same target size and
        interpolation is done in a single batched call; otherwise each image is
        handled individually (optionally filtered by `keep`). Returns None if
        there are no masks, a boolean tensor in the consistent case, or a list
        of per-image mask tensors / RLE lists otherwise.
        """
        if pred_masks is None:
            return None
        if self.always_interpolate_masks_on_gpu:
            gpu_device = target_sizes.device
            assert gpu_device.type == "cuda"
            pred_masks = pred_masks.to(device=gpu_device)
        if consistent:
            assert keep is None, "TODO: implement?"
            # All masks should have the same shape, expected when processing a batch of size 1
            target_size = target_sizes.unique(dim=0)
            assert target_size.size(0) == 1, "Expecting all target sizes to be equal"
            out_masks = (
                interpolate(
                    pred_masks,
                    target_size.squeeze().tolist(),
                    mode="bilinear",
                    align_corners=False,
                ).sigmoid()
                > 0.5
            )
            if self.convert_mask_to_rle:
                raise RuntimeError("TODO: implement?")
            if self.to_cpu:
                out_masks = out_masks.cpu()
        else:
            # Placeholder list; every element is reassigned below, so the
            # shared-empty-list initialization is safe.
            out_masks = [[]] * len(pred_masks)
            assert keep is None or len(keep) == len(pred_masks)
            for i, mask in enumerate(pred_masks):
                h, w = target_sizes[i]
                if keep is not None:
                    mask = mask[keep[i]]
                # Uses the GPU version first, moves masks to CPU if it fails
                # (e.g. on OOM for very large masks).
                try:
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                except Exception as e:
                    logging.info("Issue found, reverting to CPU mode!")
                    mask_device = mask.device
                    mask = mask.cpu()
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                    interpolated = interpolated.to(mask_device)
                if self.convert_mask_to_rle:
                    out_masks[i] = robust_rle_encode(interpolated.squeeze(1))
                else:
                    out_masks[i] = interpolated
                if self.to_cpu:
                    out_masks[i] = out_masks[i].cpu()
        return out_masks

    def _process_boxes_and_labels(
        self, target_sizes, forced_labels, out_bbox, out_probs
    ):
        """Convert cxcywh boxes to absolute xyxy and derive scores/labels.

        Returns (boxes, scores, labels, keep); all None when there are no
        boxes. `keep` is a per-image boolean mask when the detection
        threshold is active, else None.
        """
        if out_bbox is None:
            return None, None, None, None
        assert len(out_probs) == len(target_sizes)
        if self.to_cpu:
            out_probs = out_probs.cpu()
        scores, labels = out_probs.max(-1)
        if forced_labels is None:
            # Open-vocabulary setting: all predictions share a dummy label of 1.
            labels = torch.ones_like(labels)
        else:
            labels = forced_labels[:, None].expand_as(labels)
        # convert to [x0, y0, x1, y1] format
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]
        if self.to_cpu:
            boxes = boxes.cpu()
        keep = None
        if self.detection_threshold > 0:
            # Filter out the boxes with scores below the detection threshold
            keep = scores > self.detection_threshold
            assert len(keep) == len(boxes) == len(scores) == len(labels)
            boxes = [b[k.to(b.device)] for b, k in zip(boxes, keep)]
            scores = [s[k.to(s.device)] for s, k in zip(scores, keep)]
            labels = [l[k.to(l.device)] for l, k in zip(labels, keep)]
        return boxes, scores, labels, keep

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """Postprocess all stages and merge them into a per-image-id dict.

        Results from different stages for the same image id are concatenated,
        then optionally pruned to the top `max_dets_per_img` by score.
        """
        if find_stages.loss_stages is not None:
            # Only keep the metadata of the stages that were actually run.
            find_metadatas = [find_metadatas[i] for i in find_stages.loss_stages]
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            # A size of ones keeps boxes/masks in normalized coordinates.
            img_size_for_boxes = (
                meta.original_size
                if self.use_original_sizes_box
                else torch.ones_like(meta.original_size)
            )
            img_size_for_masks = (
                meta.original_size
                if self.use_original_sizes_mask
                else torch.ones_like(meta.original_size)
            )
            detection_results = self(
                outputs,
                img_size_for_boxes,
                img_size_for_masks,
                forced_labels=(
                    meta.original_category_id if self.use_original_ids else None
                ),
            )
            ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(detection_results) == len(ids)
            for img_id, result in zip(ids, detection_results):
                if img_id.item() not in results:
                    results[img_id.item()] = result
                else:
                    # Same image seen in several stages: concatenate field-wise.
                    assert set(results[img_id.item()].keys()) == set(result.keys())
                    for k in result.keys():
                        if isinstance(result[k], torch.Tensor):
                            results[img_id.item()][k] = torch.cat(
                                [results[img_id.item()][k], result[k]], dim=0
                            )
                        elif isinstance(result[k], list):
                            results[img_id.item()][k] += result[k]
                        else:
                            raise NotImplementedError(
                                f"Unexpected type {type(result[k])} in result."
                            )
        # Prune the results to the max number of detections per image.
        for img_id, result in results.items():
            if (
                self.max_dets_per_img > 0
                and len(result["scores"]) > self.max_dets_per_img
            ):
                _, topk_indexes = torch.topk(
                    result["scores"], self.max_dets_per_img, dim=0
                )
                if self.to_cpu:
                    topk_indexes = topk_indexes.cpu()
                for k in result.keys():
                    if isinstance(results[img_id][k], list):
                        results[img_id][k] = [
                            results[img_id][k][i] for i in topk_indexes.tolist()
                        ]
                    else:
                        results[img_id][k] = results[img_id][k].to(topk_indexes.device)[
                            topk_indexes
                        ]
        return results
class PostProcessAPIVideo(PostProcessImage):
    """This module converts the video model's output into the format expected by the YT-VIS api"""

    def __init__(
        self,
        *args,
        to_cpu: bool = True,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        prob_thresh: float = 0.5,
        use_presence: bool = False,
        **kwargs,
    ):
        super().__init__(
            *args,
            # Here we always set `convert_mask_to_rle=False` in the base `PostProcessAPI` class
            # (so that its `_process_masks` won't return a list of RLEs). If we want to return
            # RLEs for video masklets, we handle it in this `PostProcessAPIVideo` class instead.
            convert_mask_to_rle=False,
            # Here we always set `to_cpu=False` in the base `PostProcessAPI` class (so that
            # the interpolated masks won't be automatically moved back to CPU). We will handle
            # it in this `PostProcessAPIVideo` class instead.
            # NOTE(review): contrary to the comment above, `to_cpu` is NOT forwarded to the
            # base class here, so the base keeps its default `to_cpu=True` — confirm whether
            # `to_cpu=False` should be passed explicitly.
            always_interpolate_masks_on_gpu=always_interpolate_masks_on_gpu,
            use_presence=use_presence,
            **kwargs,
        )
        # Expected keys in the output dict to postprocess
        self.EXPECTED_KEYS = [
            "pred_logits",
            "pred_boxes",
            "pred_masks",
        ]
        # Whether to post-process video masklets (under packed representation) into RLE format
        self.convert_mask_to_rle_for_video = convert_mask_to_rle
        # Whether to move the final per-video tensors to CPU.
        self.to_cpu_for_video = to_cpu
        # Per-frame probability threshold for keeping a tracked object's prediction.
        self.prob_thresh = prob_thresh

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """
        Tracking Postprocessor for SAM 3 video model.
        This function takes in the output of the SAM 3 video model and processes it to extract all the tracklet predictions.
        Args:
            find_stages: A list of tensors representing the output of the SAM 3 video model.
            find_metadatas: A list of BatchedInferenceMetadata objects containing metadata about each frame.
            **kwargs: Additional keyword arguments.
        Returns:
            A dictionary of predictions with video_id as key.
        """
        # Import tensordict here to avoid global dependency.
        try:
            from tensordict import TensorDict
        except ImportError as e:
            logging.error(
                "tensordict is not installed, please install by running `pip install tensordict --no-deps`"
            )
            raise e
        # Notes and assumptions:
        # 1- This postprocessor assumes results only for a single video.
        # 2- There are N stage outputs corresponding to N video frames
        # 3- Each stage outputs contains PxQ preds, where P is number of prompts and Q is number of object queries. The output should also contain the tracking object ids corresponding to each object query.
        # 4- The tracking object id has a default value of -1, indicating that the object query is not tracking any object in the frame, and hence its predictions can be ignored for a given frame.
        # 5- Some objects may be tracked in a subset of frames only. So, we first extract the predictions in a packed representation (for efficient postprocessing -- specially memory)
        #    and then we convert the packed representation into a padded one, where we zero pad boxes/masks for objects that are not tracked in some frames.
        # 6- We refer to objects by an object id, which is a tuple (prompt_idx, obj_id)
        assert len(find_stages) > 0, "There is nothing to postprocess?"
        PROMPT_AXIS, OBJ_QUERY_AXIS = (0, 1)
        NO_OBJ_ID = -1
        # Maps object ID -> [indices in packed tensor]
        tracked_objects_packed_idx = defaultdict(list)
        # Maps object ID -> [indices in padded tensor (abs frame index)]
        tracked_objects_frame_idx = defaultdict(list)
        total_num_preds = 0
        # This will hold the packed representation of predictions.
        vid_preds_packed: List[TensorDict] = []
        vid_masklets_rle_packed: List[Optional[Dict]] = []
        video_id = -1  # We assume single video postprocessing, this ID should be unique in the datapoint.
        for frame_idx, (frame_outs, meta) in enumerate(
            zip(find_stages, find_metadatas)
        ):
            # only store keys we need to extract the results
            frame_outs_td = TensorDict(
                {k: frame_outs[k] for k in self.EXPECTED_KEYS}
            ).auto_batch_size_()  # Shape is [P,Q,...]
            meta_td = TensorDict(
                dataclasses.asdict(meta)
            ).auto_batch_size_()  # Shape is [P,...]
            unique_vid_id = meta.original_image_id.unique()
            assert unique_vid_id.size(0) == 1
            if video_id == -1:
                video_id = unique_vid_id.item()
            else:
                assert (
                    video_id == unique_vid_id.item()
                ), "We can only postprocess one video per datapoint"
            # keeping track of which objects appear in the current frame
            obj_ids_per_frame = frame_outs["pred_object_ids"]
            assert obj_ids_per_frame.size(-1) == frame_outs["pred_logits"].size(-2)
            if self.prob_thresh is not None:
                # only keep the predictions on this frame with probability above the threshold
                # (remove those predictions during the keep-alive period of a tracking query,
                # where its "pred_object_ids" is still the tracked object ID rather than -1)
                pred_probs = frame_outs["pred_logits"].sigmoid().squeeze(-1)
                obj_ids_per_frame = torch.where(
                    pred_probs >= self.prob_thresh, obj_ids_per_frame, NO_OBJ_ID
                )
            tracked_obj_ids_idx = torch.where(obj_ids_per_frame != NO_OBJ_ID)
            # Object id is a tuple of (prompt_idx, obj_id). This is because the model can assign same obj_id for two different prompts.
            tracked_obj_ids = [
                (p_id.item(), obj_ids_per_frame[p_id, q_id].item())
                for p_id, q_id in zip(
                    tracked_obj_ids_idx[PROMPT_AXIS],
                    tracked_obj_ids_idx[OBJ_QUERY_AXIS],
                )
            ]
            if len(tracked_obj_ids) == 0:
                continue
            # For each object, we keep track of the packed and padded (frame index) indices
            for oid in tracked_obj_ids:
                tracked_objects_packed_idx[oid].append(total_num_preds)
                tracked_objects_frame_idx[oid].append(frame_idx)
                total_num_preds += 1
            # Since we have P*Q masks per frame, mask interpolation is the GPU memory bottleneck or time bottleneck in case of cpu processing.
            # Instead, we first extract results only for tracked objects, reducing the number of masks to K = sum_i(tracked_objs_per_ith_prompt), hopefully <<< P*Q
            tracked_objs_outs_td = frame_outs_td[
                tracked_obj_ids_idx
            ]  # [P,Q,...] --> [K,...]
            meta_td = meta_td[tracked_obj_ids_idx[PROMPT_AXIS].cpu()]
            if self.always_interpolate_masks_on_gpu:
                gpu_device = meta_td["original_size"].device
                assert gpu_device.type == "cuda"
                tracked_objs_outs_td = tracked_objs_outs_td.to(device=gpu_device)
            # NOTE(review): the base `forward` takes separate `target_sizes_boxes` and
            # `target_sizes_masks` positional arguments, but only one size tensor is passed
            # here; also `self.use_original_sizes` is not defined by this class or its base
            # (which defines `use_original_sizes_box`/`use_original_sizes_mask`) — confirm
            # this code path is exercised and correct.
            frame_results_td = self(
                tracked_objs_outs_td.unsqueeze(1),
                (
                    meta_td["original_size"]
                    if self.use_original_sizes
                    else torch.ones_like(meta_td["original_size"])
                ),
                forced_labels=(
                    meta_td["original_category_id"] if self.use_original_ids else None
                ),
                consistent=True,
                ret_tensordict=True,
            ).squeeze(1)
            del tracked_objs_outs_td
            # Optionally, remove "masks" from output tensor dict and directly encode them
            # to RLE format under packed representations
            if self.convert_mask_to_rle_for_video:
                interpolated_binary_masks = frame_results_td.pop("masks")
                rle_list = rle_encode(interpolated_binary_masks, return_areas=True)
                vid_masklets_rle_packed.extend(rle_list)
            # Optionally, move output TensorDict to CPU (do this after RLE encoding step above)
            if self.to_cpu_for_video:
                frame_results_td = frame_results_td.cpu()
            vid_preds_packed.append(frame_results_td)
        if len(vid_preds_packed) == 0:
            logging.debug(f"Video {video_id} has no predictions")
            return {video_id: []}
        vid_preds_packed = torch.cat(vid_preds_packed, dim=0)
        ############### Construct a padded representation of the predictions ###############
        num_preds = len(tracked_objects_packed_idx)
        num_frames = len(find_stages)
        # We zero pad any missing prediction
        # NOTE: here, we also have padded tensors for "scores" and "labels", but we overwrite them later.
        padded_frames_results = TensorDict(
            {
                k: torch.zeros(
                    num_preds, num_frames, *v.shape[1:], device=v.device, dtype=v.dtype
                )
                for k, v in vid_preds_packed.items()
            },
            batch_size=[
                num_preds,
                num_frames,
            ],
        )
        padded_frames_results["scores"][...] = -1e8  # a very low score for empty object
        # Track scores and labels of each pred tracklet, only for frames where the model was able to track that object
        tracklet_scores = []
        tracklet_labels = []
        # Optionally, fill the list of RLEs for masklets
        # note: only frames with actual predicted masks (in packed format) will be
        # filled with RLEs; the rest will remain None in results["masks_rle"]
        if self.convert_mask_to_rle_for_video:
            vid_masklets_rle_padded = [[None] * num_frames for _ in range(num_preds)]
        for o_idx, oid in enumerate(tracked_objects_packed_idx):
            oid2packed_idx = tracked_objects_packed_idx[oid]
            oid2padded_idx = tracked_objects_frame_idx[oid]
            obj_packed_results = vid_preds_packed[oid2packed_idx]
            padded_frames_results[o_idx][oid2padded_idx] = obj_packed_results
            if self.convert_mask_to_rle_for_video:
                for packed_idx, padded_idx in zip(oid2packed_idx, oid2padded_idx):
                    vid_masklets_rle_padded[o_idx][padded_idx] = (
                        vid_masklets_rle_packed[packed_idx]
                    )
            # NOTE: We need a single confidence score per tracklet for the mAP metric.
            # We use the average confidence score across time. (How does this impact AP?)
            tracklet_scores.append(obj_packed_results["scores"].mean())
            # We also need to have a unique category Id per tracklet.
            # This is not a problem for phrase AP, however, for mAP we do majority voting across time.
            tracklet_labels.append(obj_packed_results["labels"].mode()[0])
        results = padded_frames_results.to_dict()
        results["scores"] = torch.stack(tracklet_scores, dim=0)
        results["labels"] = torch.stack(tracklet_labels, dim=0)
        if self.convert_mask_to_rle_for_video:
            results["masks_rle"] = vid_masklets_rle_padded
        # we keep the frame-level scores since it's needed by some evaluation scripts
        results["per_frame_scores"] = padded_frames_results["scores"]
        return {video_id: results}
class PostProcessTracking(PostProcessImage):
    """This module converts the model's output into the format expected by the coco api"""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        force_single_mask: bool = False,
        **kwargs,
    ) -> None:
        """
        Args:
            max_dets_per_img: forwarded to `PostProcessImage`.
            iou_type: "bbox" or "segm"; forwarded to `PostProcessImage`.
            force_single_mask: keep only the highest-scoring mask per item
                before postprocessing.
        """
        super().__init__(max_dets_per_img=max_dets_per_img, iou_type=iou_type, **kwargs)
        self.force_single_mask = force_single_mask

    def process_results(
        self, find_stages, find_metadatas: BatchedInferenceMetadata, **kwargs
    ):
        """Postprocess per-frame tracking outputs.

        Returns a dict keyed by (media_id, object_id, frame_index) tuples,
        one entry per prediction.
        """
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            if self.force_single_mask:
                # For each item, select only the mask of the top-scoring query.
                scores, labels = outputs["pred_logits"].max(-1)
                m = []
                for i in range(len(outputs["pred_masks"])):
                    score, idx = scores[i].max(0)
                    m.append(outputs["pred_masks"][i][idx])
                outputs["pred_masks"] = torch.stack(m, 0).unsqueeze(1)
            # NOTE(review): the base `forward` takes separate `target_sizes_boxes`
            # and `target_sizes_masks` positional arguments, but only one size
            # tensor is passed here — confirm this call signature is correct.
            detection_results = self(outputs, meta.original_size, consistent=False)
            assert len(detection_results) == len(meta.coco_image_id)
            results.update(
                {
                    (media_id.item(), object_id.item(), frame_index.item()): result
                    for media_id, object_id, frame_index, result in zip(
                        meta.original_image_id,
                        meta.object_id,
                        meta.frame_index,
                        detection_results,
                    )
                }
            )
        return results
class PostProcessCounting(nn.Module):
    """This module converts the model's output to be evaluated for counting tasks"""

    def __init__(
        self,
        use_original_ids: bool = False,
        threshold: float = 0.5,
        use_presence: bool = False,
    ) -> None:
        """
        Args:
            use_original_ids: whether to use the original image ids or the coco ids
            threshold: threshold for counting (values above this are counted)
        """
        super().__init__()
        self.use_original_ids = use_original_ids
        self.threshold = threshold
        self.use_presence = use_presence

    def forward(self, outputs, target_sizes):
        """Compute per-image object counts from the model outputs.

        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
        """
        # Per-query probabilities from the raw logits.
        probs = torch.sigmoid(outputs["pred_logits"]).squeeze(-1)  # [B, N]
        if self.use_presence:
            presence = outputs["presence_logit_dec"].sigmoid()
            if presence.ndim == 1:
                presence = presence.unsqueeze(1)  # [B, 1]
            probs = probs * presence  # [B, N]
        # The count is the number of queries whose probability clears the threshold.
        per_image_counts = (probs > self.threshold).float().sum(dim=1)
        assert len(per_image_counts) == len(target_sizes)
        return [{"count": c.item()} for c in per_image_counts]

    @torch.no_grad()
    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """Run counting over all stages and key results by image id."""
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            per_image = self(outputs, meta.original_size)
            image_ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(per_image) == len(image_ids)
            for img_id, count_entry in zip(image_ids, per_image):
                results[img_id.item()] = count_entry
        return results

View File

@@ -0,0 +1,155 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import argparse
import json
import os
from collections import defaultdict
from iopath.common.file_io import g_pathmgr
from sam3.eval.saco_veval_evaluators import (
VideoCGF1Evaluator,
VideoPhraseApEvaluator,
VideoPhraseHotaEvaluator,
VideoTetaEvaluator,
YTVISPredFileEvaluator,
)
class VEvalEvaluator:
    """Runs the full suite of SA-Co video evaluators against one GT annotation
    file and writes the merged metrics to `eval_res_file` as JSON."""

    def __init__(self, gt_annot_file: str, eval_res_file: str):
        self.gt_annot_file = gt_annot_file
        self.eval_res_file = eval_res_file
        self.evaluators = [
            # mAP
            YTVISPredFileEvaluator(gt_annot_file),
            # Phrase AP
            VideoPhraseApEvaluator(gt_annot_file),
            # TETA
            VideoTetaEvaluator(gt_annot_file, use_mask=True, is_exhaustive=True),
            # HOTA
            VideoPhraseHotaEvaluator(gt_annot_file),
            # cgF1
            VideoCGF1Evaluator(gt_annot_file),
        ]

    def run_eval(self, pred_file: str):
        """Evaluate `pred_file` with every evaluator and merge all results."""
        merged_dataset_metrics = {}
        merged_np_metrics = defaultdict(dict)
        for ev in self.evaluators:
            dataset_metrics, np_metrics = ev.evaluate(pred_file)
            merged_dataset_metrics.update(dataset_metrics)
            # Merge per (video_id, category_id) metrics across evaluators.
            for pair_key, pair_metrics in np_metrics.items():
                merged_np_metrics[pair_key].update(pair_metrics)
        if not merged_dataset_metrics:
            # Keep the dict non-empty so downstream consumers always find a key.
            merged_dataset_metrics = {"": 0.0}
        video_np_rows = []
        for (video_id, category_id), pair_metrics in merged_np_metrics.items():
            video_np_rows.append(
                {"video_id": video_id, "category_id": category_id, **pair_metrics}
            )
        eval_metrics = {
            "dataset_results": merged_dataset_metrics,
            "video_np_results": video_np_rows,
        }
        with g_pathmgr.open(self.eval_res_file, "w") as f:
            json.dump(eval_metrics, f)
        return eval_metrics
def run_main_all(dataset_name, args):
    """Evaluate one named dataset; GT/pred/result paths are derived from the
    directories given in `args` using the dataset name as the file stem."""
    gt_annot_file = os.path.join(args.gt_annot_dir, f"{dataset_name}.json")
    pred_file = os.path.join(args.pred_dir, f"{dataset_name}_preds.json")
    eval_res_file = os.path.join(args.eval_res_dir, f"{dataset_name}_eval_res.json")
    print(f"=== Running evaluation for Pred {pred_file} vs GT {gt_annot_file} ===")
    evaluator = VEvalEvaluator(gt_annot_file=gt_annot_file, eval_res_file=eval_res_file)
    evaluator.run_eval(pred_file=pred_file)
    print(f"=== Results saved to {eval_res_file} ===")
def main_all(args):
    """Run evaluation sequentially over every SA-Co VEval dataset split."""
    saco_veval_dataset_names = [
        "saco_veval_sav_test",
        "saco_veval_sav_val",
        "saco_veval_yt1b_test",
        "saco_veval_yt1b_val",
        "saco_veval_smartglasses_test",
        "saco_veval_smartglasses_val",
    ]
    # multiprocessing may not really work as inner evaluator also using multiprocessing
    # so we just for loop
    for name in saco_veval_dataset_names:
        print(f"=== Running evaluation for dataset {name} ===")
        run_main_all(dataset_name=name, args=args)
def main_one(args):
    """Evaluate a single dataset from explicit file paths in `args`."""
    print(
        f"=== Running evaluation for Pred {args.pred_file} vs GT {args.gt_annot_file} ==="
    )
    evaluator = VEvalEvaluator(
        gt_annot_file=args.gt_annot_file, eval_res_file=args.eval_res_file
    )
    evaluator.run_eval(pred_file=args.pred_file)
    print(f"=== Results saved to {args.eval_res_file} ===")
def main():
    """CLI entry point: parse arguments and dispatch to a sub-command."""
    parser = argparse.ArgumentParser(description="Run video grounding evaluators")
    subparsers = parser.add_subparsers(dest="command", required=True)
    # `all`: evaluate every dataset, with paths derived from directories.
    all_parser = subparsers.add_parser("all", help="Run evaluation for all datasets")
    for flag, help_text in (
        ("--gt_annot_dir", "Directory that contains the ground truth annotation files"),
        ("--pred_dir", "Directory that contains the prediction files"),
        ("--eval_res_dir", "Directory that contains the eval results files"),
    ):
        all_parser.add_argument(flag, type=str, help=help_text)
    all_parser.set_defaults(func=main_all)
    # `one`: evaluate a single dataset from explicit file paths.
    one_parser = subparsers.add_parser("one", help="Run evaluation for one dataset")
    for flag, help_text in (
        ("--gt_annot_file", "Path to the ground truth annotation file"),
        ("--pred_file", "Path to the prediction file"),
        ("--eval_res_file", "Path to the eval results file"),
    ):
        one_parser.add_argument(flag, type=str, help=help_text)
    one_parser.set_defaults(func=main_one)
    # Parse and dispatch to the selected sub-command handler.
    args = parser.parse_args()
    args.func(args)
# Script entry point: parse CLI arguments and dispatch to the chosen sub-command.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,838 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import json
import os
import tempfile
from collections import defaultdict
from typing import Dict, Optional, Sequence, Tuple
import numpy as np
import pycocotools.mask
from sam3.eval.cgf1_eval import CGF1_METRICS
from sam3.eval.conversion_util import (
convert_ytbvis_to_cocovid_gt,
convert_ytbvis_to_cocovid_pred,
)
from sam3.eval.hota_eval_toolkit.run_ytvis_eval import run_ytvis_eval
from sam3.eval.teta_eval_toolkit import config, Evaluator, metrics
from sam3.eval.teta_eval_toolkit.datasets import COCO, TAO
from sam3.eval.ytvis_coco_wrapper import YTVIS
from sam3.eval.ytvis_eval import VideoDemoF1Eval, YTVISeval
from sam3.train.nms_helper import process_frame_level_nms, process_track_level_nms
def _get_metric_index(metric_name: str, iou_threshold: Optional[float] = None) -> int:
    """Return the position of a metric in CGF1_METRICS.

    Args:
        metric_name: Name of the metric (e.g., "cgF1", "precision", "recall")
        iou_threshold: IoU threshold (None for average over 0.5:0.95, or a
            specific value like 0.5, 0.75)
    Returns:
        Index of the metric in CGF1_METRICS
    Raises:
        ValueError: If no metric matches both the name and the threshold.
    """
    for position, candidate in enumerate(CGF1_METRICS):
        name_matches = candidate.name == metric_name
        iou_matches = candidate.iou_threshold == iou_threshold
        if name_matches and iou_matches:
            return position
    raise ValueError(
        f"Metric '{metric_name}' with IoU threshold {iou_threshold} not found in CGF1_METRICS"
    )
class BasePredFileEvaluator:
    """Common base for evaluators that score a prediction file against GT."""
class YTVISPredFileEvaluator(BasePredFileEvaluator):
    """Evaluate class mAP for YT-VIS prediction files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format GT annotation file.
            dataset_name: prefix used in the result metric keys.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Evaluate `pred_file` and return (dataset-level metrics, per-video-NP metrics).

        The second element is currently always empty for this evaluator.
        """
        # use our internal video evaluation toolkit for YT-VIS pred file
        # (i.e. the same one we're using for video phrase AP)
        results = {}
        use_cats = True  # YT-VIS mAP evaluation uses categories
        ytvisGT = YTVIS(self.gt_ann_file, ignore_gt_cats=not use_cats)
        # the original YT-VIS GT annotations have uncompressed RLEs ("counts" is an integer list)
        # rather than compressed RLEs ("counts" is a string), so we first convert them here.
        if "segm" in self.iou_types:
            for ann in ytvisGT.dataset["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        with open(pred_file) as f:
            dt = json.load(f)
        # Our prediction file saves "video_id" and absolute (unnormalized) boxes.
        # Note that we should use the official (original) YT-VIS annotations (i.e. the one
        # saved via "scripts/datasets/training/ytvis_split.py", instead of the one saved
        # via "scripts/api_db_to_ytvis_json.py") in this evaluator, which contain absolute
        # boxes coordinates in its GT annotations.
        for d in dt:
            d["image_id"] = d["video_id"]
        ytvisDT = ytvisGT.loadRes(dt)
        for iou_type in self.iou_types:
            ytvisEval = YTVISeval(ytvisGT, ytvisDT, iou_type)
            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            ytvisEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            ytvisEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            ytvisEval.params.useCats = use_cats
            ytvisEval.evaluate()
            ytvisEval.accumulate()
            ytvisEval.summarize()
            result_key = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_mAP_50_95"
            # stats[0] is AP averaged over IoU=0.5:0.95.
            results[result_key] = ytvisEval.stats[0]
        # video-NP level results not supported for `YTVISPredFileEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
class VideoPhraseApEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase AP with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format GT annotation file.
            dataset_name: prefix used in the result metric keys.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Evaluate `pred_file` and return (dataset-level metrics, per-video-NP metrics).

        The second element is currently always empty for this evaluator.
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        # Convert uncompressed GT RLEs ("counts" as int list) to compressed form.
        if "segm" in self.iou_types:
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        for d in dt:
            d["image_id"] = d["video_id"]
        results = {}
        use_cats = False  # Phrase AP evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)
        for iou_type in self.iou_types:
            phraseApEval = YTVISeval(ytvisGT, ytvisDT, iou_type)
            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            phraseApEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            phraseApEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            phraseApEval.params.useCats = use_cats
            phraseApEval.evaluate()
            phraseApEval.accumulate()
            phraseApEval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_phrase_ap"
            # fetch Phrase AP results from the corresponding indices in `phraseApEval.stats`
            # (see `_summarizeDets` in https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py)
            results[result_prefix + "_50_95"] = phraseApEval.stats[0]  # IoU=0.5:0.95
            results[result_prefix + "_50"] = phraseApEval.stats[1]  # IoU=0.5
            results[result_prefix + "_75"] = phraseApEval.stats[2]  # IoU=0.75
        # video-NP level results not supported for `VideoPhraseApEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
class VideoCGF1Evaluator(BasePredFileEvaluator):
    """Evaluate Video Demo F1 with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format ground-truth JSON file.
            dataset_name: prefix used for all reported metric keys.
            prob_thresh: score threshold passed to `VideoDemoF1Eval`.
            iou_types: subset of ["bbox", "segm"] to evaluate (defaults to both).
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run demo-F1 / CG-F1 evaluation on `pred_file`.

        Returns:
            A `(results, video_np_level_results)` tuple. `results` maps metric
            names to floats; `video_np_level_results` maps each original
            `(video_id, category_id)` pair to its per-video metrics.
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # IL_MCC and CG-F1 can only be computed if we have "video_np_pairs" keys in the GT JSON
        compute_ilmcc_and_cgf1 = "video_np_pairs" in gt
        if not compute_ilmcc_and_cgf1:
            print(
                f"Warning: IL_MCC and CG-F1 are not computed for {pred_file=} as it does not have 'video_np_pairs' keys in the GT JSON"
            )
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(
            gt, dt, add_negative_np_pairs=compute_ilmcc_and_cgf1
        )
        if "segm" in self.iou_types:
            # Mask IoU requires compressed (string-counts) RLEs.
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        # The COCO-style evaluator indexes detections by "image_id".
        for d in dt:
            d["image_id"] = d["video_id"]
        results = {}
        use_cats = False  # Demo F1 evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)
        video_np_level_results = {}
        for iou_type in self.iou_types:
            demoF1Eval = VideoDemoF1Eval(ytvisGT, ytvisDT, iou_type, self.prob_thresh)
            demoF1Eval.params.useCats = use_cats
            demoF1Eval.params.areaRng = [[0**2, 1e5**2]]
            demoF1Eval.params.areaRngLbl = ["all"]
            demoF1Eval.params.maxDets = [100000]
            demoF1Eval.evaluate()
            demoF1Eval.accumulate()
            demoF1Eval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_demo"
            stats = demoF1Eval.stats
            if compute_ilmcc_and_cgf1:
                # Average IoU threshold (0.5:0.95)
                cgf1_micro_avg_idx = _get_metric_index("cgF1", None)
                positive_micro_f1_avg_idx = _get_metric_index("positive_micro_F1", None)
                ilmcc_avg_idx = _get_metric_index("IL_MCC", None)
                results[result_prefix + "_cgf1_micro_50_95"] = stats[cgf1_micro_avg_idx]
                results[result_prefix + "_ilmcc_50_95"] = stats[ilmcc_avg_idx]
                results[result_prefix + "_positive_micro_f1_50_95"] = stats[
                    positive_micro_f1_avg_idx
                ]
                # IoU = 0.5
                cgf1_micro_50_idx = _get_metric_index("cgF1", 0.5)
                positive_micro_f1_50_idx = _get_metric_index("positive_micro_F1", 0.5)
                results[result_prefix + "_cgf1_micro_50"] = stats[cgf1_micro_50_idx]
                # IL_MCC at a fixed IoU threshold is derived as the ratio
                # cgF1 / positive_micro_F1.
                results[result_prefix + "_ilmcc_50"] = float(
                    np.array(stats[cgf1_micro_50_idx])
                    / np.array(stats[positive_micro_f1_50_idx])
                )
                results[result_prefix + "_positive_micro_f1_50"] = stats[
                    positive_micro_f1_50_idx
                ]
                # IoU = 0.75
                cgf1_micro_75_idx = _get_metric_index("cgF1", 0.75)
                positive_micro_f1_75_idx = _get_metric_index("positive_micro_F1", 0.75)
                results[result_prefix + "_cgf1_micro_75"] = stats[cgf1_micro_75_idx]
                results[result_prefix + "_ilmcc_75"] = float(
                    np.array(stats[cgf1_micro_75_idx])
                    / np.array(stats[positive_micro_f1_75_idx])
                )
                results[result_prefix + "_positive_micro_f1_75"] = stats[
                    positive_micro_f1_75_idx
                ]
            self.extract_video_np_level_results(demoF1Eval, video_np_level_results)
        return results, video_np_level_results

    def extract_video_np_level_results(self, demoF1Eval, video_np_level_results):
        """Aggregate statistics for video-level metrics.

        Populates `video_np_level_results[(orig_video_id, orig_category_id)]`
        with per-video TP/FP/FN/F1 at IoU 0.5, IoU 0.75 and averaged over
        the 0.5:0.95 threshold range.
        """
        num_iou_thrs = len(demoF1Eval.params.iouThrs)
        iou_50_index = int(np.where(demoF1Eval.params.iouThrs == 0.5)[0])
        iou_75_index = int(np.where(demoF1Eval.params.iouThrs == 0.75)[0])
        result_prefix = "mask" if demoF1Eval.params.iouType == "segm" else "bbox"
        # One eval entry per remapped video (each corresponds to one video-NP pair).
        assert len(demoF1Eval.evalImgs) == len(demoF1Eval.cocoGt.dataset["images"])
        for i, video in enumerate(demoF1Eval.cocoGt.dataset["images"]):
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            eval_img_dict = demoF1Eval.evalImgs[i]
            # Missing counts default to all-zero (e.g. videos with no matches).
            TPs = eval_img_dict.get("TPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FPs = eval_img_dict.get("FPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FNs = eval_img_dict.get("FNs", np.zeros(num_iou_thrs, dtype=np.int64))
            assert len(TPs) == len(FPs) == len(FNs) == num_iou_thrs
            # F1 = 2*TP / (2*TP + FP + FN), and we set F1 to 1.0 if denominator is 0
            denominator = 2 * TPs + FPs + FNs
            F1s = np.where(denominator > 0, 2 * TPs / np.maximum(denominator, 1), 1.0)
            local_results = {
                f"{result_prefix}_TP_50_95": float(TPs.mean()),
                f"{result_prefix}_FP_50_95": float(FPs.mean()),
                f"{result_prefix}_FN_50_95": float(FNs.mean()),
                f"{result_prefix}_F1_50_95": float(F1s.mean()),
                f"{result_prefix}_TP_50": float(TPs[iou_50_index]),
                f"{result_prefix}_FP_50": float(FPs[iou_50_index]),
                f"{result_prefix}_FN_50": float(FNs[iou_50_index]),
                f"{result_prefix}_F1_50": float(F1s[iou_50_index]),
                f"{result_prefix}_TP_75": float(TPs[iou_75_index]),
                f"{result_prefix}_FP_75": float(FPs[iou_75_index]),
                f"{result_prefix}_FN_75": float(FNs[iou_75_index]),
                f"{result_prefix}_F1_75": float(F1s[iou_75_index]),
            }
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
class VideoTetaEvaluator(BasePredFileEvaluator):
    """Evaluate TETA metric using YouTubeVIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        tracker_name: str = "Sam3",
        nms_threshold: float = 0.5,
        nms_strategy: str = "none",  # "track", "frame", or "none"
        prob_thresh: float = 0.5,
        is_exhaustive: bool = False,
        use_mask: bool = False,
        num_parallel_cores: int = 8,
    ):
        """
        Args:
            gt_ann_file: path to the YouTubeVIS-format ground-truth JSON file.
            dataset_name: prefix used for all reported metric keys.
            tracker_name: folder name under which converted predictions are stored.
            nms_threshold: IoU threshold for the optional NMS pass.
            nms_strategy: "track" (track-level NMS), "frame" (frame-level NMS),
                or "none" (skip NMS).
            prob_thresh: predictions scoring below this threshold are dropped.
            is_exhaustive: if True, evaluate with the COCO (exhaustive) TETA
                dataset wrapper; otherwise with the TAO wrapper.
            use_mask: if True, evaluate with masks instead of boxes.
            num_parallel_cores: worker count passed to the TETA evaluator.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.tracker_name = tracker_name
        self.nms_threshold = nms_threshold
        self.nms_strategy = nms_strategy.lower()  # Convert to lowercase for consistency
        self.prob_thresh = prob_thresh
        self.metric_prefix = "TETA"
        self.is_exhaustive = is_exhaustive
        self.use_mask = use_mask
        self.num_parallel_cores = num_parallel_cores
        # Verify NMS strategy is valid
        valid_strategies = ["track", "frame", "none"]
        print("current nms_strategy:", self.nms_strategy)
        if self.nms_strategy not in valid_strategies:
            raise ValueError(
                f"Invalid NMS strategy: {self.nms_strategy}. Must be one of {valid_strategies}"
            )
        print(f"Initialized VideoTetaEvaluator with NMS strategy: {self.nms_strategy}")
        print(f"Probability threshold set to: {self.prob_thresh}")
        print(f"Dataset exhaustivity set to: {self.is_exhaustive}")
        print(f"Tracker name set to: {self.tracker_name}")
        print(f"Dataset name set to: {self.dataset_name}")
        print(f"Use mask set to: {self.use_mask}")

    def process_predictions(self, pred_file: str, tmp_dir: str) -> str:
        """Process predictions with selected NMS strategy.

        Loads `pred_file`, drops predictions below `self.prob_thresh`, applies
        the configured NMS strategy, writes the surviving tracks to
        `tmp_dir/processed_preds.json`, and returns that path.
        """
        with open(pred_file, "r") as f:
            raw_preds = json.load(f)
        print(f"Processing predictions with {self.nms_strategy} NMS strategy")
        # Filter by score threshold
        if self.prob_thresh > 0:
            raw_preds = [d for d in raw_preds if d["score"] >= self.prob_thresh]
            print(
                f"Filtered to {len(raw_preds)} predictions with score >= {self.prob_thresh}"
            )
        # Group predictions by video_id
        video_groups = defaultdict(list)
        for pred in raw_preds:
            video_groups[pred["video_id"]].append(pred)
        # Process based on NMS strategy. The helpers' return values are unused,
        # so they are assumed to filter `video_groups` in place — TODO confirm.
        if self.nms_strategy == "track":
            process_track_level_nms(video_groups, nms_threshold=self.nms_threshold)
        elif self.nms_strategy == "frame":
            process_frame_level_nms(video_groups, nms_threshold=self.nms_threshold)
        elif self.nms_strategy == "none":
            print("Skipping NMS processing as strategy is set to 'none'")
            # No processing needed for "none" strategy
        # Save processed predictions
        processed_preds = [
            track for tracks in video_groups.values() for track in tracks
        ]
        processed_path = os.path.join(tmp_dir, "processed_preds.json")
        with open(processed_path, "w") as f:
            json.dump(processed_preds, f)
        print(f"Saved processed predictions to {processed_path}")
        return processed_path

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Main evaluation method.

        Converts GT and (NMS-processed) predictions to COCO-vid format in a
        temporary directory, runs the TETA evaluator, and returns a
        `(results, video_np_level_results)` tuple; the latter is always empty.
        """
        print(f"Evaluating TETA Metric with {self.nms_strategy.upper()} NMS strategy")
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Process predictions first
            processed_pred_file = self.process_predictions(pred_file, tmp_dir)
            # Convert GT to COCO-vid format
            gt_dir = os.path.join(tmp_dir, "gt")
            os.makedirs(gt_dir, exist_ok=True)
            gt_coco_path = os.path.join(gt_dir, "annotations.json")
            convert_ytbvis_to_cocovid_gt(self.gt_ann_file, gt_coco_path)
            # Convert processed predictions to COCO-vid format
            pred_dir = os.path.join(tmp_dir, "predictions")
            tracker_dir = os.path.join(pred_dir, self.tracker_name)
            os.makedirs(tracker_dir, exist_ok=True)
            pred_coco_path = os.path.join(tracker_dir, "track_results_cocofmt.json")
            convert_ytbvis_to_cocovid_pred(
                youtubevis_pred_path=processed_pred_file,
                converted_dataset_path=gt_coco_path,
                output_path=pred_coco_path,
            )
            # Configure TETA evaluator
            default_eval_config = config.get_default_eval_config()
            default_eval_config["PRINT_ONLY_COMBINED"] = True
            default_eval_config["DISPLAY_LESS_PROGRESS"] = True
            # NOTE(review): the default eval config spells this key
            # "OUTPUT_TEM_RAW_DATA" — confirm which spelling the TETA
            # Evaluator actually reads.
            default_eval_config["OUTPUT_TEMP_RAW_DATA"] = True
            default_eval_config["NUM_PARALLEL_CORES"] = self.num_parallel_cores
            default_dataset_config = config.get_default_dataset_config()
            default_dataset_config["TRACKERS_TO_EVAL"] = [self.tracker_name]
            default_dataset_config["GT_FOLDER"] = gt_dir
            default_dataset_config["OUTPUT_FOLDER"] = pred_dir
            default_dataset_config["TRACKER_SUB_FOLDER"] = tracker_dir
            default_dataset_config["USE_MASK"] = self.use_mask
            evaluator = Evaluator(default_eval_config)
            # `COCO`/`TAO` here are the TETA dataset wrappers.
            if self.is_exhaustive:
                dataset_list = [COCO(default_dataset_config)]
                dataset_parsing_key = "COCO"
            else:
                dataset_list = [TAO(default_dataset_config)]
                dataset_parsing_key = "TAO"
            # Run evaluation
            eval_results, _ = evaluator.evaluate(
                dataset_list, [metrics.TETA(exhaustive=self.is_exhaustive)]
            )
            # Extract and format results from the fixed positions of the
            # TETA result vector.
            results = {
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_teta": float(
                    eval_results[dataset_parsing_key]["TETA"][0]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_a": float(
                    eval_results[dataset_parsing_key]["TETA"][1]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_a": float(
                    eval_results[dataset_parsing_key]["TETA"][2]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_a": float(
                    eval_results[dataset_parsing_key]["TETA"][3]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_re": float(
                    eval_results[dataset_parsing_key]["TETA"][4]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][5]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_re": float(
                    eval_results[dataset_parsing_key]["TETA"][6]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][7]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_re": float(
                    eval_results[dataset_parsing_key]["TETA"][8]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][9]
                ),
            }
        # video-NP level results not supported for `VideoTetaEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
class VideoPhraseHotaEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase HOTA with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
        compute_video_mot_hota: bool = False,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS-format ground-truth JSON file.
            dataset_name: prefix used for all reported metric keys.
            prob_thresh: predictions with score <= this threshold are dropped.
            iou_types: subset of ["bbox", "segm"] to evaluate (defaults to both).
            compute_video_mot_hota: if True, aggregate predictions/GT across all
                categories and evaluate class-agnostic video-level MOT HOTA.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.metric_prefix = "phrase"
        # the list of metrics to collect from the HOTA evaluation results
        self.metric_to_collect = [
            "HOTA",
            "DetA",
            "AssA",
            "DetRe",
            "DetPr",
            "AssRe",
            "AssPr",
            "LocA",
            "OWTA",
        ]
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)
        # If True, compute video MOT HOTA, aggregating predictions/GT from all categories.
        self.compute_video_mot_hota = compute_video_mot_hota

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run phrase (or class-agnostic MOT) HOTA evaluation on `pred_file`.

        Returns:
            A `(results, video_np_level_results)` tuple. `results` maps metric
            names to floats; `video_np_level_results` maps each original
            `(video_id, category_id)` pair to its per-video HOTA metrics.
        """
        # use the YT-VIS evaluation toolkit in TrackEval
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # keep only predictions with score above the probability threshold
        dt = [d for d in dt if d["score"] > self.prob_thresh]
        for d in dt:
            assert len(d["areas"]) == len(d["bboxes"])
            assert len(d["areas"]) == len(d["segmentations"])
            # remove empty boxes (otherwise they will count as false positives for during
            # per-frame detection accuracy in HOTA evaluation)
            for t in range(len(d["bboxes"])):
                bbox = d["bboxes"][t]
                if d["areas"][t] == 0 or bbox is None or all(x == 0 for x in bbox):
                    d["segmentations"][t] = None
                    d["bboxes"][t] = None
                    d["areas"][t] = None
            # check that box occurence and mask occurence are consistent
            for bbox, mask, area in zip(d["bboxes"], d["segmentations"], d["areas"]):
                assert (area is None) == (bbox is None)
                assert (area is None) == (mask is None)
            # set all scores to 1.0 for HOTA evaluation (just like Demo F1, the exact score
            # value is not used in HOTA metrics; it will be treated as a detection prediction
            # as long as its score is above the threshold)
            d["score"] = 1.0
        # fill missing per-annotation height/width from the video entries
        gt = _fill_in_ann_height_width(gt)
        if not self.compute_video_mot_hota:
            # remap the GT and DT annotations for phrase HOTA evaluation
            gt, dt = self._remap_gt_dt(gt, dt)
        else:
            # Compute video-level MOT HOTA
            # Apply track-level NMS
            video_groups = defaultdict(list)
            for pred in dt:
                video_groups[pred["video_id"]].append(pred)
            process_track_level_nms(video_groups, nms_threshold=0.5)
            dt = [track for tracks in video_groups.values() for track in tracks]
            # Remap GT track ids for class-agnostic HOTA
            gt, dt = remap_gt_dt_class_agnostic(gt, dt)
        # run the HOTA evaluation using TrackEval on the remapped (video_id, category_id) pairs
        out_dict = {}
        video_np_level_results = {}
        for iou_type in self.iou_types:
            output_res, _ = run_ytvis_eval(
                args=[
                    "--METRICS",
                    "HOTA",
                    "--IOU_TYPE",
                    iou_type,
                    "--DATASET_NAME",
                    self.dataset_name,
                    "--USE_PARALLEL",
                    "True",
                    "--NUM_PARALLEL_CORES",
                    "8",
                    "--PLOT_CURVES",
                    "False",
                    "--LOG_ON_ERROR",
                    "None",
                    "--PRINT_ONLY_COMBINED",
                    "True",
                    "--OUTPUT_SUMMARY",
                    "False",
                    "--OUTPUT_DETAILED",
                    "False",
                    "--TIME_PROGRESS",
                    "False",
                    "--PRINT_CONFIG",
                    "False",
                ],
                gt_json=gt,
                dt_json=dt,
            )
            self.extract_video_np_level_results(
                iou_type=iou_type,
                remapped_gt=gt,
                raw_results=output_res[self.dataset_name]["tracker"],
                video_np_level_results=video_np_level_results,
            )

            def _summarize_results(output_res, iou_type, field, suffix):
                # Collect the averaged HOTA sub-metrics for one combined field.
                eval_res = output_res[self.dataset_name]["tracker"][field]
                result_prefix = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_{suffix}"
                for metric_name in self.metric_to_collect:
                    eval_res_hota = eval_res["cls_comb_cls_av"]["HOTA"]
                    result_key = f"{result_prefix}_{self.metric_prefix}_{metric_name}"
                    result_value = float(np.mean(eval_res_hota[metric_name]))
                    out_dict[result_key] = result_value

            _summarize_results(output_res, iou_type, "COMBINED_SEQ", "all")
            if "COMBINED_SEQ_CHALLENGING" in output_res[self.dataset_name]["tracker"]:
                _summarize_results(
                    output_res, iou_type, "COMBINED_SEQ_CHALLENGING", "challenging"
                )
        # per-video results were collected above by extract_video_np_level_results
        return out_dict, video_np_level_results

    def _remap_gt_dt(self, gt, dt):
        """Remap GT/DT for phrase HOTA: one pseudo-video per (video, category)."""
        # For phrase HOTA evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        # We further map all the categories to category_id=1 in HOTA evaluation toolkit
        # for phrase HOTA (similar to "useCat=False" for video phrase AP)
        remapped_category_id = 1
        gt["categories"] = [
            {
                "supercategory": "object",
                "id": remapped_category_id,
                "name": "_REMAPPED_FOR_PHRASE_METRICS_",
            }
        ]
        for ann in gt["annotations"]:
            ann["category_id"] = remapped_category_id
        for d in dt:
            d["category_id"] = remapped_category_id
        # To be compatible with the TrackEval YT-VIS evaluation toolkit, we need to give
        # unique filenames to each remapped video, so we add remapped video_id as prefix.
        for video in gt["videos"]:
            new_video_id = video["id"]
            video["file_names"] = [
                f"remapped_vid_{new_video_id:012d}/{name}"
                for name in video["file_names"]
            ]
        return gt, dt

    def extract_video_np_level_results(
        self, iou_type, remapped_gt, raw_results, video_np_level_results
    ):
        """Aggregate statistics for video-level metrics.

        Populates `video_np_level_results[(orig_video_id, orig_category_id)]`
        with the per-video averages of every metric in `self.metric_to_collect`.
        """
        result_prefix = "mask" if iou_type == "segm" else "bbox"
        for video in remapped_gt["videos"]:
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            # Key format must match the file-name prefix set during remapping.
            video_key = f"remapped_vid_{video['id']:012d}"
            results = raw_results[video_key]["_REMAPPED_FOR_PHRASE_METRICS_"]["HOTA"]
            local_results = {}
            for metric_name in self.metric_to_collect:
                result_key = f"{result_prefix}_{metric_name}"
                local_results[result_key] = float(results[metric_name].mean())
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
class VideoClassBasedHotaEvaluator(VideoPhraseHotaEvaluator):
    """Class-based HOTA evaluation.

    Identical to `VideoPhraseHotaEvaluator` except that the GT/DT are used
    as-is (no (video, category) remapping) and results are reported under
    the "class" metric prefix.
    """

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
    ):
        super().__init__(gt_ann_file, dataset_name, prob_thresh)
        # Report metrics as "..._class_..." instead of "..._phrase_...".
        self.metric_prefix = "class"

    def _remap_gt_dt(self, gt, dt):
        # Class-based evaluation keeps the original video/category structure.
        return (gt, dt)

    def extract_video_np_level_results(self, *args, **kwargs):
        # Per-video (video-NP level) results are not produced for
        # class-based HOTA evaluation.
        pass
def _compress_rle(rle):
    """Convert RLEs from uncompressed (integer list) to compressed (string) format.

    `None` inputs and already-compressed RLEs are returned unchanged.
    """
    if rle is None:
        return None
    if not isinstance(rle["counts"], list):
        # Already in compressed form.
        return rle
    height, width = rle["size"][0], rle["size"][1]
    rle = pycocotools.mask.frPyObjects(rle, height, width)
    # frPyObjects yields bytes counts; store them as a JSON-friendly string.
    rle["counts"] = rle["counts"].decode()
    return rle
def remap_video_category_pairs_to_unique_video_ids(
    gt_json, dt_json, add_negative_np_pairs=False
):
    """
    Remap each pair of (video_id, category_id) to a new unique video_id. This is useful
    for phrase AP and demo F1 evaluation on videos, where we have `useCat=False` and
    rely on separating different NPs (from the same video) into different new video ids,
    so that we don't mix detections from different categories in computeIoU under `useCat=False`.
    This is consistent with how do we phrase AP and demo F1 evaluation on images, where we
    use a remapped unique coco_image_id for each image-NP pair (based in its query["id"] in
    CustomCocoDetectionAPI.load_queries in modulated_detection_api.py)

    Both inputs are modified in place and returned.
    """
    videos_by_id = {video["id"]: video for video in gt_json["videos"]}

    # Every (video_id, category_id) pair seen in either predictions or GT.
    pairs = {(pred["video_id"], pred["category_id"]) for pred in dt_json}
    pairs |= {(ann["video_id"], ann["category_id"]) for ann in gt_json["annotations"]}

    # Deterministically assign new 1-based video ids to the sorted pairs.
    pair_to_new_id = {pair: idx for idx, pair in enumerate(sorted(pairs), start=1)}

    if add_negative_np_pairs:
        # Negative video-NP pairs (no GT, no predictions) also need ids --
        # this is required for IL_MCC and CG-F1 evaluation.
        for vnp in gt_json["video_np_pairs"]:
            key = (vnp["video_id"], vnp["category_id"])
            if key not in pair_to_new_id:
                pair_to_new_id[key] = len(pair_to_new_id) + 1

    # Rewrite the "video_id" field of predictions and GT annotations.
    for pred in dt_json:
        pred["video_id"] = pair_to_new_id[(pred["video_id"], pred["category_id"])]
    for ann in gt_json["annotations"]:
        ann["video_id"] = pair_to_new_id[(ann["video_id"], ann["category_id"])]

    # Duplicate each video entry once per mapped pair, preserving the original
    # ids so sample-level metrics can be traced back to the video-NP pair.
    remapped_videos = []
    for (orig_video_id, orig_category_id), new_video_id in pair_to_new_id.items():
        entry = videos_by_id[orig_video_id].copy()
        entry["id"] = new_video_id
        entry["orig_video_id"] = orig_video_id
        entry["orig_category_id"] = orig_category_id
        remapped_videos.append(entry)
    gt_json["videos"] = remapped_videos

    return gt_json, dt_json
def remap_gt_dt_class_agnostic(gt, dt):
    """
    For class-agnostic HOTA, merge all GT tracks for each video (across NPs),
    ensure unique track_ids, and set all category_id to 1.
    Also, add orig_video_id and orig_category_id for compatibility.

    Both `gt` and `dt` are modified in place and returned.
    """
    # Group GT annotations by video so track ids can be remapped per video.
    gt_anns_by_video = defaultdict(list)
    for ann in gt["annotations"]:
        gt_anns_by_video[ann["video_id"]].append(ann)

    # Record each video's first annotation's *original* category_id before it
    # is overwritten below. (The previous implementation read category_id
    # after the overwrite, so "orig_category_id" was always 1.)
    orig_cat_by_video = {
        video_id: anns[0]["category_id"]
        for video_id, anns in gt_anns_by_video.items()
    }

    # 1. Remap all GT track_ids to be unique across all videos, and make the
    #    annotations class-agnostic (single category id 1).
    next_tid = 1
    for anns in gt_anns_by_video.values():
        # Map old track_ids to new unique ones (scoped to this video).
        old_to_new_tid = {}
        for ann in anns:
            old_tid = ann["id"]
            if old_tid not in old_to_new_tid:
                old_to_new_tid[old_tid] = next_tid
                next_tid += 1
            ann["id"] = old_to_new_tid[old_tid]
            # Set category_id to 1 for class-agnostic
            ann["category_id"] = 1

    # 2. Collapse the category list to a single class-agnostic category.
    gt["categories"] = [
        {
            "supercategory": "object",
            "id": 1,
            "name": "_REMAPPED_FOR_PHRASE_METRICS_",
        }
    ]

    # 3. Add orig_video_id / orig_category_id to each video (for compatibility
    #    with per-video result extraction) and give each video a unique
    #    file-name prefix as required by the TrackEval YT-VIS toolkit.
    for video in gt["videos"]:
        video["orig_video_id"] = video["id"]
        # First annotation's original category_id if the video has GT, else None.
        video["orig_category_id"] = orig_cat_by_video.get(video["id"])
        video["file_names"] = [
            f"remapped_vid_{video['id']:012d}/{name}" for name in video["file_names"]
        ]

    # 4. Make all predictions class-agnostic as well.
    for d in dt:
        d["category_id"] = 1
    return gt, dt
def _fill_in_ann_height_width(gt_json):
"""Fill in missing height/width in GT annotations from its video info."""
video_id_to_video = {v["id"]: v for v in gt_json["videos"]}
for ann in gt_json["annotations"]:
if "height" not in ann or "width" not in ann:
video = video_id_to_video[ann["video_id"]]
if "height" not in ann:
ann["height"] = video["height"]
if "width" not in ann:
ann["width"] = video["width"]
return gt_json

View File

@@ -0,0 +1,5 @@
# fmt: off
# flake8: noqa
from . import config, datasets, metrics, utils
from .eval import Evaluator

View File

@@ -0,0 +1,69 @@
# fmt: off
# flake8: noqa
import inspect
from functools import wraps
from time import perf_counter
# When False, the `time` decorator below is a no-op passthrough.
DO_TIMING = False
# When True, per-call timing output for methods (first arg "self") is suppressed.
DISPLAY_LESS_PROGRESS = False
# Accumulated wall-clock seconds keyed by function / "Class.method" name.
timer_dict = {}
# Running index for the printed numbering of timed free-function calls.
counter = 0
def time(f):
    """Decorator that wraps *f* with wall-clock timing and progress reporting.

    When the module-level ``DO_TIMING`` flag is False, ``f`` is called directly
    with no overhead. Otherwise each call is timed with ``perf_counter`` and
    accumulated into ``timer_dict`` under the function (or ``Class.method``)
    name; a summary table is printed once ``Evaluator.evaluate`` returns.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if DO_TIMING:
            # Run function with timing
            ts = perf_counter()
            result = f(*args, **kw)
            te = perf_counter()
            tt = te - ts
            # Get function name
            # NOTE(review): assumes f declares at least one positional
            # parameter; a zero-arg function would raise IndexError below.
            arg_names = inspect.getfullargspec(f)[0]
            if arg_names[0] == "self" and DISPLAY_LESS_PROGRESS:
                # Quiet mode: skip per-call reporting for methods.
                return result
            elif arg_names[0] == "self":
                method_name = type(args[0]).__name__ + "." + f.__name__
            else:
                method_name = f.__name__
            # Record accumulative time in each function for analysis
            if method_name in timer_dict.keys():
                timer_dict[method_name] += tt
            else:
                timer_dict[method_name] = tt
            # If code is finished, display timing summary
            if method_name == "Evaluator.evaluate":
                print("")
                print("Timing analysis:")
                for key, value in timer_dict.items():
                    print("%-70s %2.4f sec" % (key, value))
            else:
                # Get function argument values for printing special arguments of interest
                arg_titles = ["tracker", "seq", "cls"]
                arg_vals = []
                for i, a in enumerate(arg_names):
                    if a in arg_titles:
                        arg_vals.append(args[i])
                arg_text = "(" + ", ".join(arg_vals) + ")"
                # Display methods and functions with different indentation.
                if arg_names[0] == "self":
                    print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
                elif arg_names[0] == "test":
                    pass
                else:
                    global counter
                    counter += 1
                    print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))
            return result
        else:
            # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is true, run functions normally without timing.
            return f(*args, **kw)

    return wrap

View File

@@ -0,0 +1,153 @@
# fmt: off
# flake8: noqa
"""Config."""
import argparse
import os
def parse_configs():
    """Parse the command line into (eval, dataset, metrics) config dicts."""
    default_eval_config = get_default_eval_config()
    default_eval_config["DISPLAY_LESS_PROGRESS"] = True
    default_dataset_config = get_default_dataset_config()
    default_metrics_config = {"METRICS": ["TETA"]}
    # Merged view of all defaults; command-line values override entries below.
    config = {
        **default_eval_config,
        **default_dataset_config,
        **default_metrics_config,
    }
    parser = argparse.ArgumentParser()
    for setting, default in config.items():
        # List-valued (or unset) options accept one or more values.
        if type(default) is list or default is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    cli_values = vars(parser.parse_args())
    for setting, value in cli_values.items():
        if value is None:
            continue  # option not supplied on the command line
        default = config[setting]
        if type(default) is bool:
            if value == "True":
                parsed = True
            elif value == "False":
                parsed = False
            else:
                raise Exception(
                    f"Command line parameter {setting} must be True/False"
                )
        elif type(default) is int:
            parsed = int(value)
        elif value is None:
            # Unreachable (value is known non-None here); kept for parity
            # with the original control flow.
            parsed = None
        else:
            parsed = value
        config[setting] = parsed
    # Split the merged config back into its three sections.
    eval_config = {k: v for k, v in config.items() if k in default_eval_config}
    dataset_config = {k: v for k, v in config.items() if k in default_dataset_config}
    metrics_config = {k: v for k, v in config.items() if k in default_metrics_config}
    return eval_config, dataset_config, metrics_config
def get_default_eval_config():
    """Returns the default config values for evaluation."""
    code_path = get_code_path()
    default_config = {
        "USE_PARALLEL": True,
        "NUM_PARALLEL_CORES": 8,
        # Raise on evaluation errors instead of continuing.
        "BREAK_ON_ERROR": True,
        "RETURN_ON_ERROR": False,
        "LOG_ON_ERROR": os.path.join(code_path, "error_log.txt"),
        "PRINT_RESULTS": True,
        "PRINT_ONLY_COMBINED": True,
        "PRINT_CONFIG": True,
        "TIME_PROGRESS": True,
        "DISPLAY_LESS_PROGRESS": True,
        "OUTPUT_SUMMARY": True,
        "OUTPUT_EMPTY_CLASSES": True,
        # NOTE(review): callers (e.g. VideoTetaEvaluator) set the key
        # "OUTPUT_TEMP_RAW_DATA", while this default is spelled
        # "OUTPUT_TEM_RAW_DATA" -- confirm which spelling the evaluator reads.
        "OUTPUT_TEM_RAW_DATA": True,
        "OUTPUT_PER_SEQ_RES": True,
    }
    return default_config
def get_default_dataset_config():
    """Returns the default config values for the dataset (GT/tracker paths etc.)."""
    code_path = get_code_path()
    default_config = {
        "GT_FOLDER": os.path.join(
            code_path, "data/gt/tao/tao_training"
        ),  # Location of GT data
        "TRACKERS_FOLDER": os.path.join(
            code_path, "data/trackers/tao/tao_training"
        ),  # Trackers location
        "OUTPUT_FOLDER": None,  # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "TRACKERS_TO_EVAL": ['TETer'],  # Filenames of trackers to eval (if None, all in folder)
        "CLASSES_TO_EVAL": None,  # Classes to eval (if None, all classes)
        "SPLIT_TO_EVAL": "training",  # Valid: 'training', 'val'
        "PRINT_CONFIG": True,  # Whether to print current config
        "TRACKER_SUB_FOLDER": "data",  # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",  # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "TRACKER_DISPLAY_NAMES": None,  # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "MAX_DETECTIONS": 0,  # Number of maximal allowed detections per image (0 for unlimited)
        "USE_MASK": False,  # Whether to use mask data for evaluation
    }
    return default_config
def init_config(config, default_config, name=None):
    """Fill missing entries of *config* with defaults; print it when requested.

    A ``None`` config is replaced by *default_config* entirely. If *name* is
    given and the resulting config has ``PRINT_CONFIG`` set, the config is
    printed to stdout.
    """
    if config is None:
        config = default_config
    else:
        for key, default_value in default_config.items():
            config.setdefault(key, default_value)
    if name and config["PRINT_CONFIG"]:
        print("\n%s Config:" % name)
        for c in config:
            print("%-20s : %-30s" % (c, config[c]))
    return config
def update_config(config):
    """
    Parse the arguments of a script and updates the config values for a given value if specified in the arguments.

    Values are coerced to the type of the existing config entry: bool entries
    accept only the literal strings "True"/"False", int entries are parsed
    with `int()`, list-valued (or None) entries accept one or more values
    via ``nargs="+"``, and everything else is kept as a string.

    :param config: the config to update
    :return: the updated config (modified in place)
    """
    parser = argparse.ArgumentParser()
    for setting, default in config.items():
        # List-valued (or unset) options accept one or more values.
        if isinstance(default, list) or default is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = vars(parser.parse_args())
    for setting, value in args.items():
        if value is None:
            # Option not supplied on the command line: keep the existing value.
            continue
        default = config[setting]
        if type(default) is bool:
            if value == "True":
                x = True
            elif value == "False":
                x = False
            else:
                # Fixed message: the original string concatenation was
                # missing the space before "must".
                raise Exception(
                    f"Command line parameter {setting} must be True or False"
                )
        elif type(default) is int:
            x = int(value)
        else:
            # (The original also had an unreachable `type(value) is NoneType`
            # branch here; `value` is known non-None at this point.)
            x = value
        config[setting] = x
    return config
def get_code_path():
    """Return the absolute path of the package root (parent of this module's dir)."""
    this_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(this_dir, ".."))

View File

@@ -0,0 +1,5 @@
# fmt: off
# flake8: noqa
"""Datasets."""
from .coco import COCO
from .tao import TAO

View File

@@ -0,0 +1,379 @@
# fmt: off
# flake8: noqa
import csv
import io
import os
import traceback
import zipfile
from abc import ABC, abstractmethod
from copy import deepcopy
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseDataset(ABC):
@abstractmethod
def __init__(self):
    """Declare the attributes every concrete dataset must populate."""
    self.tracker_list = None  # tracker names to evaluate
    self.seq_list = None  # sequence names in this dataset split
    self.class_list = None  # class names to evaluate
    self.output_fol = None  # root folder for evaluation output
    self.output_sub_fol = None  # sub-folder under each tracker's output folder
    # presumably controls combining per-class results into an overall score
    # and super-category grouping -- confirm against the Evaluator. TODO
    self.should_classes_combine = True
    self.use_super_categories = False
# Functions to implement:
@abstractmethod
def _load_raw_file(self, tracker, seq, is_gt):
    """Load raw ground-truth (is_gt=True) or tracker (is_gt=False) data for one sequence."""
    ...
@_timing.time
@abstractmethod
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess `raw_data` (from get_raw_seq_data) for evaluating class `cls`."""
    ...
@abstractmethod
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
    """Compute pairwise similarity scores between GT and tracker detections of one timestep."""
    ...
# Helper functions for all datasets:
@classmethod
def get_class_name(cls):
    """Return the (sub)class's name."""
    return cls.__name__
def get_name(self):
    """Return this dataset's name (its concrete class name)."""
    return self.get_class_name()
def get_output_fol(self, tracker):
    """Return the output folder for *tracker*'s evaluation results."""
    path_parts = (self.output_fol, tracker, self.output_sub_fol)
    return os.path.join(*path_parts)
def get_display_name(self, tracker):
    """Can be overwritten if the trackers name (in files) is different to how it should be displayed.
    By default this method just returns the trackers name as is.
    """
    return tracker
def get_eval_info(self):
    """Return info about the dataset needed for the Evaluator:
    the (tracker_list, seq_list, class_list) tuple."""
    return self.tracker_list, self.seq_list, self.class_list
@_timing.time
def get_raw_seq_data(self, tracker, seq):
    """Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
    Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
    A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for
    the evaluation of each class.
    This returns a dict which contains the fields:
    [num_timesteps]: integer
    [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
    list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
    [similarity_scores]: list (for each timestep) of 2D NDArrays.
    [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).
    gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.
    Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
    independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
    masks vs 2D boxes vs 3D boxes).
    We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
    we don't wish to calculate this twice.
    We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
    calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
    """
    # Load raw data.
    raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
    raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
    # GT values win on key collisions (unpacked last).
    raw_data = {**raw_tracker_data, **raw_gt_data}  # Merges dictionaries
    # Calculate similarities for each timestep.
    similarity_scores = []
    for _, (gt_dets_t, tracker_dets_t) in enumerate(
        zip(raw_data["gt_dets"], raw_data["tk_dets"])
    ):
        ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
        similarity_scores.append(ious)
    raw_data["similarity_scores"] = similarity_scores
    return raw_data
@staticmethod
def _load_simple_text_file(
file,
time_col=0,
id_col=None,
remove_negative_ids=False,
valid_filter=None,
crowd_ignore_filter=None,
convert_filter=None,
is_zipped=False,
zip_file=None,
force_delimiters=None,
):
"""Function that loads data which is in a commonly used text file format.
Assumes each det is given by one row of a text file.
There is no limit to the number or meaning of each column,
however one column needs to give the timestep of each det (time_col) which is default col 0.
The file dialect (deliminator, num cols, etc) is determined automatically.
This function automatically separates dets by timestep,
and is much faster than alternatives such as np.loadtext or pandas.
If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
These are not excluded from ignore data.
valid_filter can be used to only include certain classes.
It is a dict with ints as keys, and lists as values,
such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
If None, all classes are included.
crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
convert_filter can be used to convert value read to another format.
This is used most commonly to convert classes given as string to a class id.
This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
Optionally, input files could be a zip of multiple text files for storage efficiency.
Returns read_data and ignore_data.
Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
Note that all data is returned as strings, and must be converted to float/int later if needed.
Note that timesteps will not be present in the returned dict keys if there are no dets for them
"""
if remove_negative_ids and id_col is None:
raise TrackEvalException(
"remove_negative_ids is True, but id_col is not given."
)
if crowd_ignore_filter is None:
crowd_ignore_filter = {}
if convert_filter is None:
convert_filter = {}
try:
if is_zipped: # Either open file directly or within a zip.
if zip_file is None:
raise TrackEvalException(
"is_zipped set to True, but no zip_file is given."
)
archive = zipfile.ZipFile(os.path.join(zip_file), "r")
fp = io.TextIOWrapper(archive.open(file, "r"))
else:
fp = open(file)
read_data = {}
crowd_ignore_data = {}
fp.seek(0, os.SEEK_END)
# check if file is empty
if fp.tell():
fp.seek(0)
dialect = csv.Sniffer().sniff(
fp.readline(), delimiters=force_delimiters
) # Auto determine structure.
dialect.skipinitialspace = (
True # Deal with extra spaces between columns
)
fp.seek(0)
reader = csv.reader(fp, dialect)
for row in reader:
try:
# Deal with extra trailing spaces at the end of rows
if row[-1] in "":
row = row[:-1]
timestep = str(int(float(row[time_col])))
# Read ignore regions separately.
is_ignored = False
for ignore_key, ignore_value in crowd_ignore_filter.items():
if row[ignore_key].lower() in ignore_value:
# Convert values in one column (e.g. string to id)
for (
convert_key,
convert_value,
) in convert_filter.items():
row[convert_key] = convert_value[
row[convert_key].lower()
]
# Save data separated by timestep.
if timestep in crowd_ignore_data.keys():
crowd_ignore_data[timestep].append(row)
else:
crowd_ignore_data[timestep] = [row]
is_ignored = True
if (
is_ignored
): # if det is an ignore region, it cannot be a normal det.
continue
# Exclude some dets if not valid.
if valid_filter is not None:
for key, value in valid_filter.items():
if row[key].lower() not in value:
continue
if remove_negative_ids:
if int(float(row[id_col])) < 0:
continue
# Convert values in one column (e.g. string to id)
for convert_key, convert_value in convert_filter.items():
row[convert_key] = convert_value[row[convert_key].lower()]
# Save data separated by timestep.
if timestep in read_data.keys():
read_data[timestep].append(row)
else:
read_data[timestep] = [row]
except Exception:
exc_str_init = (
"In file %s the following line cannot be read correctly: \n"
% os.path.basename(file)
)
exc_str = " ".join([exc_str_init] + row)
raise TrackEvalException(exc_str)
fp.close()
except Exception:
print("Error loading file: %s, printing traceback." % file)
traceback.print_exc()
raise TrackEvalException(
"File %s cannot be read because it is either not present or invalidly formatted"
% os.path.basename(file)
)
return read_data, crowd_ignore_data
@staticmethod
def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
"""Calculates the IOU (intersection over union) between two arrays of segmentation masks.
If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly
used to determine if detections are within crowd ignore region.
:param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
else pycocotools rle encoded format)
:param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
else pycocotools rle encoded format)
:param is_encoded: whether the input is in pycocotools rle encoded format
:param do_ioa: whether to perform IoA computation
:return: the IoU/IoA scores
"""
# Only loaded when run to reduce minimum requirements
from pycocotools import mask as mask_utils
# use pycocotools for run length encoding of masks
if not is_encoded:
masks1 = mask_utils.encode(
np.array(np.transpose(masks1, (1, 2, 0)), order="F")
)
masks2 = mask_utils.encode(
np.array(np.transpose(masks2, (1, 2, 0)), order="F")
)
# use pycocotools for iou computation of rle encoded masks
ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
if len(masks1) == 0 or len(masks2) == 0:
ious = np.asarray(ious).reshape(len(masks1), len(masks2))
assert (ious >= 0 - np.finfo("float").eps).all()
assert (ious <= 1 + np.finfo("float").eps).all()
return ious
@staticmethod
def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
"""Calculates the IOU (intersection over union) between two arrays of boxes.
Allows variable box formats ('xywh' and 'x0y0x1y1').
If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
used to determine if detections are within crowd ignore region.
"""
if box_format in "xywh":
# layout: (x0, y0, w, h)
bboxes1 = deepcopy(bboxes1)
bboxes2 = deepcopy(bboxes2)
bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
elif box_format not in "x0y0x1y1":
raise (TrackEvalException("box_format %s is not implemented" % box_format))
# layout: (x0, y0, x1, y1)
min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
min_[..., 3] - max_[..., 1], 0
)
area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
bboxes1[..., 3] - bboxes1[..., 1]
)
if do_ioa:
ioas = np.zeros_like(intersection)
valid_mask = area1 > 0 + np.finfo("float").eps
ioas[valid_mask, :] = (
intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
)
return ioas
else:
area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
bboxes2[..., 3] - bboxes2[..., 1]
)
union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
intersection[union <= 0 + np.finfo("float").eps] = 0
union[union <= 0 + np.finfo("float").eps] = 1
ious = intersection / union
return ious
@staticmethod
def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
"""Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
threshold corresponds to a 1m distance threshold for TPs.
"""
dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
sim = np.maximum(0, 1 - dist / zero_distance)
return sim
@staticmethod
def _check_unique_ids(data, after_preproc=False):
"""Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
gt_ids = data["gt_ids"]
tracker_ids = data["tk_ids"]
for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
if len(tracker_ids_t) > 0:
unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
if np.max(counts) != 1:
duplicate_ids = unique_ids[counts > 1]
exc_str_init = (
"Tracker predicts the same ID more than once in a single timestep "
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
)
exc_str = (
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
)
if after_preproc:
exc_str_init += (
"\n Note that this error occurred after preprocessing (but not before), "
"so ids may not be as in file, and something seems wrong with preproc."
)
raise TrackEvalException(exc_str)
if len(gt_ids_t) > 0:
unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
if np.max(counts) != 1:
duplicate_ids = unique_ids[counts > 1]
exc_str_init = (
"Ground-truth has the same ID more than once in a single timestep "
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
)
exc_str = (
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
)
if after_preproc:
exc_str_init += (
"\n Note that this error occurred after preprocessing (but not before), "
"so ids may not be as in file, and something seems wrong with preproc."
)
raise TrackEvalException(exc_str)

View File

@@ -0,0 +1,637 @@
# fmt: off
# flake8: noqa
"""COCO Dataset."""
import copy
import itertools
import json
import os
from collections import defaultdict
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing, utils
from ..config import get_default_dataset_config, init_config
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class COCO(_BaseDataset):
"""Tracking datasets in COCO format."""
def __init__(self, config=None):
"""Initialize dataset, checking that all required files are present."""
super().__init__()
# Fill non-given config values with defaults
self.config = init_config(config, get_default_dataset_config(), self.get_name())
self.gt_fol = self.config["GT_FOLDER"]
self.tracker_fol = self.config["TRACKERS_FOLDER"]
self.should_classes_combine = True
self.use_super_categories = False
self.use_mask = self.config["USE_MASK"]
self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
self.output_fol = self.config["OUTPUT_FOLDER"]
if self.output_fol is None:
self.output_fol = self.tracker_fol
self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
if self.gt_fol.endswith(".json"):
self.gt_data = json.load(open(self.gt_fol, "r"))
else:
gt_dir_files = [
file for file in os.listdir(self.gt_fol) if file.endswith(".json")
]
if len(gt_dir_files) != 1:
raise TrackEvalException(
f"{self.gt_fol} does not contain exactly one json file."
)
with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
self.gt_data = json.load(f)
# fill missing video ids
self._fill_video_ids_inplace(self.gt_data["annotations"])
# get sequences to eval and sequence information
self.seq_list = [
vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
]
self.seq_name2seqid = {
vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
}
# compute mappings from videos to annotation data
self.video2gt_track, self.video2gt_image = self._compute_vid_mappings(
self.gt_data["annotations"]
)
# compute sequence lengths
self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
for img in self.gt_data["images"]:
self.seq_lengths[img["video_id"]] += 1
self.seq2images2timestep = self._compute_image_to_timestep_mappings()
self.seq2cls = {
vid["id"]: {
"pos_cat_ids": list(
{track["category_id"] for track in self.video2gt_track[vid["id"]]}
),
}
for vid in self.gt_data["videos"]
}
# Get classes to eval
considered_vid_ids = [self.seq_name2seqid[vid] for vid in self.seq_list]
seen_cats = set(
[
cat_id
for vid_id in considered_vid_ids
for cat_id in self.seq2cls[vid_id]["pos_cat_ids"]
]
)
# only classes with ground truth are evaluated in TAO
self.valid_classes = [
cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
]
cls_name2clsid_map = {
cls["name"]: cls["id"] for cls in self.gt_data["categories"]
}
if self.config["CLASSES_TO_EVAL"]:
self.class_list = [
cls.lower() if cls.lower() in self.valid_classes else None
for cls in self.config["CLASSES_TO_EVAL"]
]
if not all(self.class_list):
valid_cls = ", ".join(self.valid_classes)
raise TrackEvalException(
"Attempted to evaluate an invalid class. Only classes "
f"{valid_cls} are valid (classes present in ground truth"
" data)."
)
else:
self.class_list = [cls for cls in self.valid_classes]
self.cls_name2clsid = {
k: v for k, v in cls_name2clsid_map.items() if k in self.class_list
}
self.clsid2cls_name = {
v: k for k, v in cls_name2clsid_map.items() if k in self.class_list
}
# get trackers to eval
if self.config["TRACKERS_TO_EVAL"] is None:
self.tracker_list = os.listdir(self.tracker_fol)
else:
self.tracker_list = self.config["TRACKERS_TO_EVAL"]
if self.config["TRACKER_DISPLAY_NAMES"] is None:
self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
len(self.config["TK_DISPLAY_NAMES"]) == len(self.tracker_list)
):
self.tracker_to_disp = dict(
zip(self.tracker_list, self.config["TK_DISPLAY_NAMES"])
)
else:
raise TrackEvalException(
"List of tracker files and tracker display names do not match."
)
self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
for tracker in self.tracker_list:
if self.tracker_sub_fol.endswith(".json"):
with open(os.path.join(self.tracker_sub_fol)) as f:
curr_data = json.load(f)
else:
tr_dir = os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
tr_dir_files = [
file for file in os.listdir(tr_dir) if file.endswith(".json")
]
if len(tr_dir_files) != 1:
raise TrackEvalException(
f"{tr_dir} does not contain exactly one json file."
)
with open(os.path.join(tr_dir, tr_dir_files[0])) as f:
curr_data = json.load(f)
# limit detections if MAX_DETECTIONS > 0
if self.config["MAX_DETECTIONS"]:
curr_data = self._limit_dets_per_image(curr_data)
# fill missing video ids
self._fill_video_ids_inplace(curr_data)
# make track ids unique over whole evaluation set
self._make_tk_ids_unique(curr_data)
# get tracker sequence information
curr_vids2tracks, curr_vids2images = self._compute_vid_mappings(curr_data)
self.tracker_data[tracker]["vids_to_tracks"] = curr_vids2tracks
self.tracker_data[tracker]["vids_to_images"] = curr_vids2images
def get_display_name(self, tracker):
return self.tracker_to_disp[tracker]
    def _load_raw_file(self, tracker, seq, is_gt):
        """Load a file (gt or tracker) in the TAO format
        If is_gt, this returns a dict which contains the fields:
        [gt_ids, gt_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets]: list (for each timestep) of lists of detections.
        if not is_gt, this returns a dict which contains the fields:
        [tk_ids, tk_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [tk_dets]: list (for each timestep) of lists of detections.
        Also sets "num_timesteps" and "seq" on the returned dict.
        """
        seq_id = self.seq_name2seqid[seq]
        # file location: pick the per-video image/annotation groups computed in
        # __init__ (GT side or this tracker's side).
        if is_gt:
            imgs = self.video2gt_image[seq_id]
        else:
            imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
        # convert data to required format
        num_timesteps = self.seq_lengths[seq_id]
        img_to_timestep = self.seq2images2timestep[seq_id]
        data_keys = ["ids", "classes", "dets"]
        # if not is_gt:
        #     data_keys += ["tk_confidences"]
        raw_data = {key: [None] * num_timesteps for key in data_keys}
        for img in imgs:
            # some tracker data contains images without any ground truth info,
            # these are ignored
            if img["id"] not in img_to_timestep:
                continue
            t = img_to_timestep[img["id"]]
            anns = img["annotations"]
            # NOTE(review): assumes every image entry has at least one
            # annotation — anns[0] would raise IndexError otherwise; the
            # grouping in _compute_vid_mappings only creates entries with at
            # least one annotation, so this seems to hold. TODO confirm.
            tk_str = utils.get_track_id_str(anns[0])
            if self.use_mask:
                # When using mask, extract segmentation data
                raw_data["dets"][t] = [ann.get("segmentation") for ann in anns]
            else:
                # When using bbox, extract bbox data
                raw_data["dets"][t] = np.atleast_2d([ann["bbox"] for ann in anns]).astype(
                    float
                )
            raw_data["ids"][t] = np.atleast_1d([ann[tk_str] for ann in anns]).astype(
                int
            )
            raw_data["classes"][t] = np.atleast_1d(
                [ann["category_id"] for ann in anns]
            ).astype(int)
            # if not is_gt:
            #     raw_data["tk_confidences"][t] = np.atleast_1d(
            #         [ann["score"] for ann in anns]
            #     ).astype(float)
        # Backfill timesteps with no detections with empty arrays.
        # NOTE(review): the (0, 4) bbox-shaped placeholder is used even in mask
        # mode, where populated timesteps hold lists of segmentations — verify
        # downstream consumers handle this mixed representation.
        for t, d in enumerate(raw_data["dets"]):
            if d is None:
                raw_data["dets"][t] = np.empty((0, 4)).astype(float)
                raw_data["ids"][t] = np.empty(0).astype(int)
                raw_data["classes"][t] = np.empty(0).astype(int)
            # if not is_gt:
            #     raw_data["tk_confidences"][t] = np.empty(0)
        # Rename the generic keys to gt_*/tk_* depending on the source.
        if is_gt:
            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
        else:
            key_map = {"ids": "tk_ids", "classes": "tk_classes", "dets": "tk_dets"}
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)
        raw_data["num_timesteps"] = num_timesteps
        raw_data["seq"] = seq
        return raw_data
    def get_preprocessed_seq_data_thr(self, raw_data, cls, assignment=None):
        """Preprocess data for a single sequence for a single class.
        Inputs:
            raw_data: dict containing the data for the sequence already
                read in by get_raw_seq_data().
            cls: class to be evaluated.
            assignment: optional per-timestep dict mapping gt ids to the
                tracker ids they were assigned to; used to drop tracker ids
                already claimed by GT of other classes.
        Outputs:
            gt_ids:
                list (for each timestep) of ids of GT tracks
            tk_ids:
                list (for each timestep) of ids of predicted tracks (all for TP
                matching (Det + AssocA))
            tk_overlap_ids:
                list (for each timestep) of ids of predicted tracks that overlap
                with GTs
            tk_dets:
                list (for each timestep) of lists of detections that
                corresponding to the tk_ids
            tk_classes:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            tk_confidences:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            sim_scores:
                similarity score between gt_ids and tk_ids.
        """
        if cls != "all":
            cls_id = self.cls_name2clsid[cls]
        data_keys = [
            "gt_ids",
            "tk_ids",
            "gt_id_map",
            "tk_id_map",
            "gt_dets",
            "gt_classes",
            "gt_class_name",
            "tk_overlap_classes",
            "tk_overlap_ids",
            "tk_class_eval_tk_ids",
            "tk_dets",
            "tk_classes",
            # "tk_confidences",
            "tk_exh_ids",
            "sim_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tk_ids = []
        num_gt_dets = 0
        num_tk_cls_dets = 0
        num_tk_overlap_dets = 0
        # IoU threshold above which a predicted track counts as overlapping a GT.
        overlap_ious_thr = 0.5
        loc_and_asso_tk_ids = []
        exh_class_tk_ids = []
        # First pass: collect, per timestep, the tracker ids that overlap GT of
        # this class and those predicted as exactly this class.
        for t in range(raw_data["num_timesteps"]):
            # only extract relevant dets for this class for preproc and eval
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            # select GT that is not in the evaluating classes
            if assignment is not None and assignment:
                all_gt_ids = list(assignment[t].keys())
                gt_ids_in = raw_data["gt_ids"][t][gt_class_mask]
                gt_ids_out = set(all_gt_ids) - set(gt_ids_in)
                tk_ids_out = set([assignment[t][key] for key in list(gt_ids_out)])
            # compute overlapped tracks and add their ids to overlap_tk_ids
            # NOTE(review): sim_scores is simply raw_data["similarity_scores"]
            # (re-bound every iteration); the second loop below reuses the last
            # binding, which is the same object.
            sim_scores = raw_data["similarity_scores"]
            overlap_ids_masks = (sim_scores[t][gt_class_mask] >= overlap_ious_thr).any(
                axis=0
            )
            overlap_tk_ids_t = raw_data["tk_ids"][t][overlap_ids_masks]
            if assignment is not None and assignment:
                # drop tracker ids already assigned to GT of other classes
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t) - tk_ids_out)
            else:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t))
            loc_and_asso_tk_ids += data["tk_overlap_ids"][t]
            data["tk_exh_ids"][t] = []
            if cls == "all":
                continue
            # add the track ids of exclusive annotated class to exh_class_tk_ids
            tk_exh_mask = np.atleast_1d(raw_data["tk_classes"][t] == cls_id)
            tk_exh_mask = tk_exh_mask.astype(bool)
            exh_class_tk_ids_t = raw_data["tk_ids"][t][tk_exh_mask]
            exh_class_tk_ids.append(exh_class_tk_ids_t)
            data["tk_exh_ids"][t] = exh_class_tk_ids_t
        # remove tk_ids that has been assigned to GT belongs to other classes.
        loc_and_asso_tk_ids = list(set(loc_and_asso_tk_ids))
        # remove all unwanted unmatched tracker detections
        for t in range(raw_data["num_timesteps"]):
            # add gt to the data
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            # NOTE(review): the next two lines reference cls_id unconditionally,
            # which would raise NameError when cls == "all" (cls_id is only
            # bound for named classes) — confirm "all" is never passed here.
            data["gt_classes"][t] = cls_id
            data["gt_class_name"][t] = cls
            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            if self.use_mask:
                gt_dets = [raw_data['gt_dets'][t][ind] for ind in range(len(gt_class_mask)) if gt_class_mask[ind]]
            else:
                gt_dets = raw_data["gt_dets"][t][gt_class_mask]
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets
            # filter pred and only keep those that highly overlap with GTs
            tk_mask = np.isin(
                raw_data["tk_ids"][t], np.array(loc_and_asso_tk_ids), assume_unique=True
            )
            tk_overlap_mask = np.isin(
                raw_data["tk_ids"][t],
                np.array(data["tk_overlap_ids"][t]),
                assume_unique=True,
            )
            tk_ids = raw_data["tk_ids"][t][tk_mask]
            if self.use_mask:
                tk_dets = [raw_data['tk_dets'][t][ind] for ind in range(len(tk_mask)) if
                           tk_mask[ind]]
            else:
                tk_dets = raw_data["tk_dets"][t][tk_mask]
            tracker_classes = raw_data["tk_classes"][t][tk_mask]
            # add overlap classes for computing the FP for Cls term
            tracker_overlap_classes = raw_data["tk_classes"][t][tk_overlap_mask]
            # tracker_confidences = raw_data["tk_confidences"][t][tk_mask]
            sim_scores_masked = sim_scores[t][gt_class_mask, :][:, tk_mask]
            # add filtered prediction to the data
            data["tk_classes"][t] = tracker_classes
            data["tk_overlap_classes"][t] = tracker_overlap_classes
            data["tk_ids"][t] = tk_ids
            data["tk_dets"][t] = tk_dets
            # data["tk_confidences"][t] = tracker_confidences
            data["sim_scores"][t] = sim_scores_masked
            data["tk_class_eval_tk_ids"][t] = set(
                list(data["tk_overlap_ids"][t]) + list(data["tk_exh_ids"][t])
            )
            # count total number of detections
            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            # the unique track ids are for association.
            unique_tk_ids += list(np.unique(data["tk_ids"][t]))
            num_tk_overlap_dets += len(data["tk_overlap_ids"][t])
            num_tk_cls_dets += len(data["tk_class_eval_tk_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])
        # re-label IDs such that there are no empty IDs
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            # dense relabelling: original id -> position in sorted unique ids
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            # keep the inverse mapping (new id -> original id) for reporting
            data["gt_id_map"] = {}
            for gt_id in unique_gt_ids:
                new_gt_id = gt_id_map[gt_id].astype(int)
                data["gt_id_map"][new_gt_id] = gt_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
        if len(unique_tk_ids) > 0:
            unique_tk_ids = np.unique(unique_tk_ids)
            tk_id_map = np.nan * np.ones((np.max(unique_tk_ids) + 1))
            tk_id_map[unique_tk_ids] = np.arange(len(unique_tk_ids))
            data["tk_id_map"] = {}
            for track_id in unique_tk_ids:
                new_track_id = tk_id_map[track_id].astype(int)
                data["tk_id_map"][new_track_id] = track_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["tk_ids"][t]) > 0:
                    data["tk_ids"][t] = tk_id_map[data["tk_ids"][t]].astype(int)
                if len(data["tk_overlap_ids"][t]) > 0:
                    data["tk_overlap_ids"][t] = tk_id_map[
                        data["tk_overlap_ids"][t]
                    ].astype(int)
        # record overview statistics.
        data["num_tk_cls_dets"] = num_tk_cls_dets
        data["num_tk_overlap_dets"] = num_tk_overlap_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tk_ids"] = len(unique_tk_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]
        self._check_unique_ids(data)
        return data
@_timing.time
def get_preprocessed_seq_data(
self, raw_data, cls, assignment=None, thresholds=[50, 75]
):
"""Preprocess data for a single sequence for a single class."""
data = {}
if thresholds is None:
thresholds = [50, 75]
elif isinstance(thresholds, int):
thresholds = [thresholds]
for thr in thresholds:
assignment_thr = None
if assignment is not None:
assignment_thr = assignment[thr]
data[thr] = self.get_preprocessed_seq_data_thr(
raw_data, cls, assignment_thr
)
return data
def _calculate_similarities(self, gt_dets_t, tk_dets_t):
"""Compute similarity scores."""
if self.use_mask:
similarity_scores = self._calculate_mask_ious(gt_dets_t, tk_dets_t, is_encoded=True, do_ioa=False)
else:
similarity_scores = self._calculate_box_ious(gt_dets_t, tk_dets_t)
return similarity_scores
def _compute_vid_mappings(self, annotations):
"""Computes mappings from videos to corresponding tracks and images."""
vids_to_tracks = {}
vids_to_imgs = {}
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
# compute an mapping from image IDs to images
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
tk_str = utils.get_track_id_str(annotations[0])
for ann in annotations:
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
vid = ann["video_id"]
if ann["video_id"] not in vids_to_tracks.keys():
vids_to_tracks[ann["video_id"]] = list()
if ann["video_id"] not in vids_to_imgs.keys():
vids_to_imgs[ann["video_id"]] = list()
# fill in vids_to_tracks
tid = ann[tk_str]
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
try:
index1 = exist_tids.index(tid)
except ValueError:
index1 = -1
if tid not in exist_tids:
curr_track = {
"id": tid,
"category_id": ann["category_id"],
"video_id": vid,
"annotations": [ann],
}
vids_to_tracks[vid].append(curr_track)
else:
vids_to_tracks[vid][index1]["annotations"].append(ann)
# fill in vids_to_imgs
img_id = ann["image_id"]
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
try:
index2 = exist_img_ids.index(img_id)
except ValueError:
index2 = -1
if index2 == -1:
curr_img = {"id": img_id, "annotations": [ann]}
vids_to_imgs[vid].append(curr_img)
else:
vids_to_imgs[vid][index2]["annotations"].append(ann)
# sort annotations by frame index and compute track area
for vid, tracks in vids_to_tracks.items():
for track in tracks:
track["annotations"] = sorted(
track["annotations"],
key=lambda x: images[x["image_id"]]["frame_id"],
)
# compute average area
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
track["annotations"]
)
# ensure all videos are present
for vid_id in vid_ids:
if vid_id not in vids_to_tracks.keys():
vids_to_tracks[vid_id] = []
if vid_id not in vids_to_imgs.keys():
vids_to_imgs[vid_id] = []
return vids_to_tracks, vids_to_imgs
def _compute_image_to_timestep_mappings(self):
"""Computes a mapping from images to timestep in sequence."""
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
for vid in seq_to_imgs_to_timestep:
curr_imgs = [img["id"] for img in self.video2gt_image[vid]]
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_id"])
seq_to_imgs_to_timestep[vid] = {
curr_imgs[i]: i for i in range(len(curr_imgs))
}
return seq_to_imgs_to_timestep
def _limit_dets_per_image(self, annotations):
"""Limits the number of detections for each image.
Adapted from https://github.com/TAO-Dataset/.
"""
max_dets = self.config["MAX_DETECTIONS"]
img_ann = defaultdict(list)
for ann in annotations:
img_ann[ann["image_id"]].append(ann)
for img_id, _anns in img_ann.items():
if len(_anns) <= max_dets:
continue
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
img_ann[img_id] = _anns[:max_dets]
return [ann for anns in img_ann.values() for ann in anns]
def _fill_video_ids_inplace(self, annotations):
"""Fills in missing video IDs inplace.
Adapted from https://github.com/TAO-Dataset/.
"""
missing_video_id = [x for x in annotations if "video_id" not in x]
if missing_video_id:
image_id_to_video_id = {
x["id"]: x["video_id"] for x in self.gt_data["images"]
}
for x in missing_video_id:
x["video_id"] = image_id_to_video_id[x["image_id"]]
@staticmethod
def _make_tk_ids_unique(annotations):
"""Makes track IDs unqiue over the whole annotation set.
Adapted from https://github.com/TAO-Dataset/.
"""
track_id_videos = {}
track_ids_to_update = set()
max_track_id = 0
tk_str = utils.get_track_id_str(annotations[0])
for ann in annotations:
t = int(ann[tk_str])
if t not in track_id_videos:
track_id_videos[t] = ann["video_id"]
if ann["video_id"] != track_id_videos[t]:
# track id is assigned to multiple videos
track_ids_to_update.add(t)
max_track_id = max(max_track_id, t)
if track_ids_to_update:
print("true")
next_id = itertools.count(max_track_id + 1)
new_tk_ids = defaultdict(lambda: next(next_id))
for ann in annotations:
t = ann[tk_str]
v = ann["video_id"]
if t in track_ids_to_update:
ann[tk_str] = new_tk_ids[t, v]
return len(track_ids_to_update)

View File

@@ -0,0 +1,659 @@
# fmt: off
# flake8: noqa
"""TAO Dataset."""
import copy
import itertools
import json
import os
from collections import defaultdict
import numpy as np
from .. import _timing
from ..config import get_default_dataset_config, init_config
from ..utils import TrackEvalException
from ._base_dataset import _BaseDataset
class TAO(_BaseDataset):
"""Dataset class for TAO tracking"""
def __init__(self, config=None):
"""Initialize dataset, checking that all required files are present."""
super().__init__()
# Fill non-given config values with defaults
self.config = init_config(config, get_default_dataset_config(), self.get_name())
self.gt_fol = self.config["GT_FOLDER"]
self.tracker_fol = self.config["TRACKERS_FOLDER"]
self.should_classes_combine = True
self.use_super_categories = False
self.use_mask = self.config["USE_MASK"]
self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
self.output_fol = self.config["OUTPUT_FOLDER"]
if self.output_fol is None:
self.output_fol = self.tracker_fol
self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
if self.gt_fol.endswith(".json"):
self.gt_data = json.load(open(self.gt_fol, "r"))
else:
gt_dir_files = [
file for file in os.listdir(self.gt_fol) if file.endswith(".json")
]
if len(gt_dir_files) != 1:
raise TrackEvalException(
f"{self.gt_fol} does not contain exactly one json file."
)
with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
self.gt_data = json.load(f)
# merge categories marked with a merged tag in TAO dataset
self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])
# get sequences to eval and sequence information
self.seq_list = [
vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
]
self.seq_name2seqid = {
vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
}
# compute mappings from videos to annotation data
self.video2gt_track, self.video2gt_image = self._compute_vid_mappings(
self.gt_data["annotations"]
)
# compute sequence lengths
self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
for img in self.gt_data["images"]:
self.seq_lengths[img["video_id"]] += 1
self.seq2images2timestep = self._compute_image_to_timestep_mappings()
self.seq2cls = {
vid["id"]: {
"pos_cat_ids": list(
{track["category_id"] for track in self.video2gt_track[vid["id"]]}
),
"neg_cat_ids": vid["neg_category_ids"],
"not_exh_labeled_cat_ids": vid["not_exhaustive_category_ids"],
}
for vid in self.gt_data["videos"]
}
# Get classes to eval
considered_vid_ids = [self.seq_name2seqid[vid] for vid in self.seq_list]
seen_cats = set(
[
cat_id
for vid_id in considered_vid_ids
for cat_id in self.seq2cls[vid_id]["pos_cat_ids"]
]
)
# only classes with ground truth are evaluated in TAO
self.valid_classes = [
cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
]
cls_name2clsid_map = {
cls["name"]: cls["id"] for cls in self.gt_data["categories"]
}
if self.config["CLASSES_TO_EVAL"]:
self.class_list = [
cls.lower() if cls.lower() in self.valid_classes else None
for cls in self.config["CLASSES_TO_EVAL"]
]
if not all(self.class_list):
valid_cls = ", ".join(self.valid_classes)
raise TrackEvalException(
"Attempted to evaluate an invalid class. Only classes "
f"{valid_cls} are valid (classes present in ground truth"
" data)."
)
else:
self.class_list = [cls for cls in self.valid_classes]
self.cls_name2clsid = {
k: v for k, v in cls_name2clsid_map.items() if k in self.class_list
}
self.clsid2cls_name = {
v: k for k, v in cls_name2clsid_map.items() if k in self.class_list
}
# get trackers to eval
print(self.config["TRACKERS_TO_EVAL"] )
if self.config["TRACKERS_TO_EVAL"] is None:
self.tracker_list = os.listdir(self.tracker_fol)
else:
self.tracker_list = self.config["TRACKERS_TO_EVAL"]
if self.config["TRACKER_DISPLAY_NAMES"] is None:
self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
len(self.config["TK_DISPLAY_NAMES"]) == len(self.tracker_list)
):
self.tracker_to_disp = dict(
zip(self.tracker_list, self.config["TK_DISPLAY_NAMES"])
)
else:
raise TrackEvalException(
"List of tracker files and tracker display names do not match."
)
self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
for tracker in self.tracker_list:
if self.tracker_sub_fol.endswith(".json"):
with open(os.path.join(self.tracker_sub_fol)) as f:
curr_data = json.load(f)
else:
tr_dir = os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
tr_dir_files = [
file for file in os.listdir(tr_dir) if file.endswith(".json")
]
if len(tr_dir_files) != 1:
raise TrackEvalException(
f"{tr_dir} does not contain exactly one json file."
)
with open(os.path.join(tr_dir, tr_dir_files[0])) as f:
curr_data = json.load(f)
# limit detections if MAX_DETECTIONS > 0
if self.config["MAX_DETECTIONS"]:
curr_data = self._limit_dets_per_image(curr_data)
# fill missing video ids
self._fill_video_ids_inplace(curr_data)
# make track ids unique over whole evaluation set
self._make_tk_ids_unique(curr_data)
# merge categories marked with a merged tag in TAO dataset
self._merge_categories(curr_data)
# get tracker sequence information
curr_vids2tracks, curr_vids2images = self._compute_vid_mappings(curr_data)
self.tracker_data[tracker]["vids_to_tracks"] = curr_vids2tracks
self.tracker_data[tracker]["vids_to_images"] = curr_vids2images
def get_display_name(self, tracker):
return self.tracker_to_disp[tracker]
    def _load_raw_file(self, tracker, seq, is_gt):
        """Load a file (gt or tracker) in the TAO format.

        If is_gt, this returns a dict which contains the fields:
        [gt_ids, gt_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets]: list (for each timestep) of lists of detections.
        if not is_gt, this returns a dict which contains the fields:
        [tk_ids, tk_classes, tk_confidences]:
            list (for each timestep) of 1D NDArrays (for each det).
        [tk_dets]: list (for each timestep) of lists of detections.

        Also attaches per-sequence bookkeeping: num_timesteps, the
        negative / not-exhaustively-labeled category id lists, and seq.
        """
        seq_id = self.seq_name2seqid[seq]
        # file location: GT images come from the GT index, tracker images
        # from the per-tracker mapping built in __init__
        if is_gt:
            imgs = self.video2gt_image[seq_id]
        else:
            imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]
        # convert data to required format
        num_timesteps = self.seq_lengths[seq_id]
        img_to_timestep = self.seq2images2timestep[seq_id]
        data_keys = ["ids", "classes", "dets"]
        if not is_gt:
            data_keys += ["tk_confidences"]
        # one slot per timestep; slots stay None until an image fills them
        raw_data = {key: [None] * num_timesteps for key in data_keys}
        for img in imgs:
            # some tracker data contains images without any ground truth info,
            # these are ignored
            if img["id"] not in img_to_timestep:
                continue
            t = img_to_timestep[img["id"]]
            anns = img["annotations"]
            if self.use_mask:
                # When using mask, extract segmentation data
                raw_data["dets"][t] = [ann.get("segmentation") for ann in anns]
            else:
                # When using bbox, extract bbox data
                raw_data["dets"][t] = np.atleast_2d([ann["bbox"] for ann in anns]).astype(
                    float
                )
            raw_data["ids"][t] = np.atleast_1d(
                [ann["track_id"] for ann in anns]
            ).astype(int)
            raw_data["classes"][t] = np.atleast_1d(
                [ann["category_id"] for ann in anns]
            ).astype(int)
            if not is_gt:
                raw_data["tk_confidences"][t] = np.atleast_1d(
                    [ann["score"] for ann in anns]
                ).astype(float)
        # fill timesteps that had no annotations with empty arrays
        # NOTE(review): empty timesteps always get a (0, 4) bbox-shaped array,
        # even in mask mode — confirm downstream similarity code handles this
        for t, d in enumerate(raw_data["dets"]):
            if d is None:
                raw_data["dets"][t] = np.empty((0, 4)).astype(float)
                raw_data["ids"][t] = np.empty(0).astype(int)
                raw_data["classes"][t] = np.empty(0).astype(int)
                if not is_gt:
                    raw_data["tk_confidences"][t] = np.empty(0)
        # rename generic keys to gt_* / tk_* depending on the source
        if is_gt:
            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
        else:
            key_map = {"ids": "tk_ids", "classes": "tk_classes", "dets": "tk_dets"}
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)
        raw_data["num_timesteps"] = num_timesteps
        # per-sequence negative / not-exhaustively-annotated category lists
        raw_data["neg_cat_ids"] = self.seq2cls[seq_id]["neg_cat_ids"]
        raw_data["not_exh_labeled_cls"] = self.seq2cls[seq_id][
            "not_exh_labeled_cat_ids"
        ]
        raw_data["seq"] = seq
        return raw_data
    def get_preprocessed_seq_data_thr(self, raw_data, cls, assignment=None):
        """Preprocess data for a single sequence for a single class.

        Inputs:
            raw_data: dict containing the data for the sequence already
                read in by get_raw_seq_data().
            cls: class to be evaluated ("all" evaluates class-agnostically).
            assignment: optional per-timestep {gt_id: tk_id} global TP
                assignment; matched predictions belonging to other classes
                are excluded from this class's overlap set.
        Outputs:
            gt_ids:
                list (for each timestep) of ids of GT tracks
            tk_ids:
                list (for each timestep) of ids of predicted tracks (all for TP
                matching (Det + AssocA))
            tk_overlap_ids:
                list (for each timestep) of ids of predicted tracks that overlap
                with GTs
            tk_neg_ids:
                list (for each timestep) of ids of predicted tracks that with
                the class id on the negative list for the current sequence.
            tk_exh_ids:
                list (for each timestep) of ids of predicted tracks that do not
                overlap with existing GTs but have the class id on the
                exhaustive annotated class list for the current sequence.
            tk_dets:
                list (for each timestep) of lists of detections that
                corresponding to the tk_ids
            tk_classes:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            tk_confidences:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            sim_scores:
                similarity score between gt_ids and tk_ids.
        """
        if cls != "all":
            cls_id = self.cls_name2clsid[cls]
        data_keys = [
            "gt_ids",
            "tk_ids",
            "gt_id_map",
            "tk_id_map",
            "gt_dets",
            "gt_classes",
            "gt_class_name",
            "tk_overlap_classes",
            "tk_overlap_ids",
            "tk_neg_ids",
            "tk_exh_ids",
            "tk_class_eval_tk_ids",
            "tk_dets",
            "tk_classes",
            "tk_confidences",
            "sim_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tk_ids = []
        num_gt_dets = 0
        num_tk_cls_dets = 0
        num_tk_overlap_dets = 0
        # minimum IoU for a prediction to count as "overlapping" a GT
        overlap_ious_thr = 0.5
        loc_and_asso_tk_ids = []
        # First pass: collect, per timestep, the predicted track ids that
        # overlap a GT of this class (minus ids already assigned to GTs of
        # other classes when a global assignment is provided).
        for t in range(raw_data["num_timesteps"]):
            # only extract relevant dets for this class for preproc and eval
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            # select GT that is not in the evaluating classes
            if assignment is not None and assignment:
                all_gt_ids = list(assignment[t].keys())
                gt_ids_in = raw_data["gt_ids"][t][gt_class_mask]
                gt_ids_out = set(all_gt_ids) - set(gt_ids_in)
                tk_ids_out = set([assignment[t][key] for key in list(gt_ids_out)])
            # compute overlapped tracks and add their ids to overlap_tk_ids
            # NOTE(review): sim_scores is bound here and reused by the second
            # loop below; with num_timesteps == 0 it would be unbound — the
            # code assumes at least one timestep
            sim_scores = raw_data["similarity_scores"]
            overlap_ids_masks = (sim_scores[t][gt_class_mask] >= overlap_ious_thr).any(
                axis=0
            )
            overlap_tk_ids_t = raw_data["tk_ids"][t][overlap_ids_masks]
            if assignment is not None and assignment:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t) - tk_ids_out)
            else:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t))
            loc_and_asso_tk_ids += data["tk_overlap_ids"][t]
            # placeholders: negative / exhaustive id lists are not populated
            # in this implementation (kept for downstream key compatibility)
            data["tk_exh_ids"][t] = []
            data["tk_neg_ids"][t] = []
            if cls == "all":
                continue
        # remove tk_ids that has been assigned to GT belongs to other classes.
        loc_and_asso_tk_ids = list(set(loc_and_asso_tk_ids))
        # remove all unwanted unmatched tracker detections
        # Second pass: keep only GT of this class and the predictions whose
        # track id (anywhere in the sequence) overlapped this class's GT.
        for t in range(raw_data["num_timesteps"]):
            # add gt to the data
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            data["gt_classes"][t] = cls_id
            data["gt_class_name"][t] = cls
            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            if self.use_mask:
                gt_dets = [raw_data['gt_dets'][t][ind] for ind in range(len(gt_class_mask)) if gt_class_mask[ind]]
            else:
                gt_dets = raw_data["gt_dets"][t][gt_class_mask]
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets
            # filter pred and only keep those that highly overlap with GTs
            tk_mask = np.isin(
                raw_data["tk_ids"][t], np.array(loc_and_asso_tk_ids), assume_unique=True
            )
            tk_overlap_mask = np.isin(
                raw_data["tk_ids"][t],
                np.array(data["tk_overlap_ids"][t]),
                assume_unique=True,
            )
            tk_ids = raw_data["tk_ids"][t][tk_mask]
            if self.use_mask:
                tk_dets = [raw_data['tk_dets'][t][ind] for ind in range(len(tk_mask)) if
                           tk_mask[ind]]
            else:
                tk_dets = raw_data["tk_dets"][t][tk_mask]
            tracker_classes = raw_data["tk_classes"][t][tk_mask]
            # add overlap classes for computing the FP for Cls term
            tracker_overlap_classes = raw_data["tk_classes"][t][tk_overlap_mask]
            tracker_confidences = raw_data["tk_confidences"][t][tk_mask]
            sim_scores_masked = sim_scores[t][gt_class_mask, :][:, tk_mask]
            # add filtered prediction to the data
            data["tk_classes"][t] = tracker_classes
            data["tk_overlap_classes"][t] = tracker_overlap_classes
            data["tk_ids"][t] = tk_ids
            data["tk_dets"][t] = tk_dets
            data["tk_confidences"][t] = tracker_confidences
            data["sim_scores"][t] = sim_scores_masked
            # ids participating in the classification term evaluation
            data["tk_class_eval_tk_ids"][t] = set(
                list(data["tk_overlap_ids"][t])
                + list(data["tk_neg_ids"][t])
                + list(data["tk_exh_ids"][t])
            )
            # count total number of detections
            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            # the unique track ids are for association.
            unique_tk_ids += list(np.unique(data["tk_ids"][t]))
            num_tk_overlap_dets += len(data["tk_overlap_ids"][t])
            num_tk_cls_dets += len(data["tk_class_eval_tk_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])
        # re-label IDs such that there are no empty IDs
        # (gt_id_map / tk_id_map remember relabeled id -> original id)
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            data["gt_id_map"] = {}
            for gt_id in unique_gt_ids:
                new_gt_id = gt_id_map[gt_id].astype(int)
                data["gt_id_map"][new_gt_id] = gt_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
        if len(unique_tk_ids) > 0:
            unique_tk_ids = np.unique(unique_tk_ids)
            tk_id_map = np.nan * np.ones((np.max(unique_tk_ids) + 1))
            tk_id_map[unique_tk_ids] = np.arange(len(unique_tk_ids))
            data["tk_id_map"] = {}
            for track_id in unique_tk_ids:
                new_track_id = tk_id_map[track_id].astype(int)
                data["tk_id_map"][new_track_id] = track_id
            for t in range(raw_data["num_timesteps"]):
                if len(data["tk_ids"][t]) > 0:
                    data["tk_ids"][t] = tk_id_map[data["tk_ids"][t]].astype(int)
                if len(data["tk_overlap_ids"][t]) > 0:
                    data["tk_overlap_ids"][t] = tk_id_map[
                        data["tk_overlap_ids"][t]
                    ].astype(int)
        # record overview statistics.
        data["num_tk_cls_dets"] = num_tk_cls_dets
        data["num_tk_overlap_dets"] = num_tk_overlap_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tk_ids"] = len(unique_tk_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]
        self._check_unique_ids(data)
        return data
@_timing.time
def get_preprocessed_seq_data(
self, raw_data, cls, assignment=None, thresholds=[50, 75]
):
"""Preprocess data for a single sequence for a single class."""
data = {}
if thresholds is None:
thresholds = [50]
elif isinstance(thresholds, int):
thresholds = [thresholds]
for thr in thresholds:
assignment_thr = None
if assignment is not None:
assignment_thr = assignment[thr]
data[thr] = self.get_preprocessed_seq_data_thr(
raw_data, cls, assignment_thr
)
return data
def _calculate_similarities(self, gt_dets_t, tk_dets_t):
"""Compute similarity scores."""
if self.use_mask:
similarity_scores = self._calculate_mask_ious(gt_dets_t, tk_dets_t, is_encoded=True, do_ioa=False)
else:
similarity_scores = self._calculate_box_ious(gt_dets_t, tk_dets_t)
return similarity_scores
def _merge_categories(self, annotations):
"""Merges categories with a merged tag.
Adapted from https://github.com/TAO-Dataset.
"""
merge_map = {}
for category in self.gt_data["categories"]:
if "merged" in category:
for to_merge in category["merged"]:
merge_map[to_merge["id"]] = category["id"]
for ann in annotations:
ann["category_id"] = merge_map.get(ann["category_id"], ann["category_id"])
def _compute_vid_mappings(self, annotations):
"""Computes mappings from videos to corresponding tracks and images."""
vids_to_tracks = {}
vids_to_imgs = {}
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
# compute an mapping from image IDs to images
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
for ann in annotations:
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
vid = ann["video_id"]
if ann["video_id"] not in vids_to_tracks.keys():
vids_to_tracks[ann["video_id"]] = list()
if ann["video_id"] not in vids_to_imgs.keys():
vids_to_imgs[ann["video_id"]] = list()
# fill in vids_to_tracks
tid = ann["track_id"]
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
try:
index1 = exist_tids.index(tid)
except ValueError:
index1 = -1
if tid not in exist_tids:
curr_track = {
"id": tid,
"category_id": ann["category_id"],
"video_id": vid,
"annotations": [ann],
}
vids_to_tracks[vid].append(curr_track)
else:
vids_to_tracks[vid][index1]["annotations"].append(ann)
# fill in vids_to_imgs
img_id = ann["image_id"]
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
try:
index2 = exist_img_ids.index(img_id)
except ValueError:
index2 = -1
if index2 == -1:
curr_img = {"id": img_id, "annotations": [ann]}
vids_to_imgs[vid].append(curr_img)
else:
vids_to_imgs[vid][index2]["annotations"].append(ann)
# sort annotations by frame index and compute track area
for vid, tracks in vids_to_tracks.items():
for track in tracks:
track["annotations"] = sorted(
track["annotations"],
key=lambda x: images[x["image_id"]]["frame_index"],
)
# compute average area
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
track["annotations"]
)
# ensure all videos are present
for vid_id in vid_ids:
if vid_id not in vids_to_tracks.keys():
vids_to_tracks[vid_id] = []
if vid_id not in vids_to_imgs.keys():
vids_to_imgs[vid_id] = []
return vids_to_tracks, vids_to_imgs
def _compute_image_to_timestep_mappings(self):
"""Computes a mapping from images to timestep in sequence."""
images = {}
for image in self.gt_data["images"]:
images[image["id"]] = image
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
for vid in seq_to_imgs_to_timestep:
curr_imgs = [img["id"] for img in self.video2gt_image[vid]]
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_index"])
seq_to_imgs_to_timestep[vid] = {
curr_imgs[i]: i for i in range(len(curr_imgs))
}
return seq_to_imgs_to_timestep
def _limit_dets_per_image(self, annotations):
"""Limits the number of detections for each image.
Adapted from https://github.com/TAO-Dataset/.
"""
max_dets = self.config["MAX_DETECTIONS"]
img_ann = defaultdict(list)
for ann in annotations:
img_ann[ann["image_id"]].append(ann)
for img_id, _anns in img_ann.items():
if len(_anns) <= max_dets:
continue
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
img_ann[img_id] = _anns[:max_dets]
return [ann for anns in img_ann.values() for ann in anns]
def _fill_video_ids_inplace(self, annotations):
"""Fills in missing video IDs inplace.
Adapted from https://github.com/TAO-Dataset/.
"""
missing_video_id = [x for x in annotations if "video_id" not in x]
if missing_video_id:
image_id_to_video_id = {
x["id"]: x["video_id"] for x in self.gt_data["images"]
}
for x in missing_video_id:
x["video_id"] = image_id_to_video_id[x["image_id"]]
@staticmethod
def _make_tk_ids_unique(annotations):
"""Makes track IDs unqiue over the whole annotation set.
Adapted from https://github.com/TAO-Dataset/.
"""
track_id_videos = {}
track_ids_to_update = set()
max_track_id = 0
for ann in annotations:
t = ann["track_id"]
if t not in track_id_videos:
track_id_videos[t] = ann["video_id"]
if ann["video_id"] != track_id_videos[t]:
# track id is assigned to multiple videos
track_ids_to_update.add(t)
max_track_id = max(max_track_id, t)
if track_ids_to_update:
print("true")
next_id = itertools.count(max_track_id + 1)
new_tk_ids = defaultdict(lambda: next(next_id))
for ann in annotations:
t = ann["track_id"]
v = ann["video_id"]
if t in track_ids_to_update:
ann["track_id"] = new_tk_ids[t, v]
return len(track_ids_to_update)

View File

@@ -0,0 +1,275 @@
# fmt: off
# flake8: noqa
import copy
import os
import pickle
import time
import traceback
from functools import partial
from multiprocessing.pool import Pool
import numpy as np
from . import _timing, utils
from .config import get_default_eval_config, init_config
from .utils import TrackEvalException
class Evaluator:
    """Evaluator class for evaluating different metrics for each datasets."""

    def __init__(self, config=None):
        """Initialize the evaluator with a config file."""
        self.config = init_config(config, get_default_eval_config(), "Eval")
        # Only run timing analysis if not run in parallel.
        if self.config["TIME_PROGRESS"] and not self.config["USE_PARALLEL"]:
            _timing.DO_TIMING = True
            if self.config["DISPLAY_LESS_PROGRESS"]:
                _timing.DISPLAY_LESS_PROGRESS = True

    @_timing.time
    def evaluate(self, dataset_list, metrics_list):
        """Evaluate a set of metrics on a set of datasets.

        Returns:
            (output_res, output_msg): per-dataset results and per-tracker
            status messages, keyed by dataset name.
        """
        config = self.config
        metric_names = utils.validate_metrics_list(metrics_list)
        dataset_names = [dataset.get_name() for dataset in dataset_list]
        output_res = {}
        output_msg = {}
        for dataset, dname in zip(dataset_list, dataset_names):
            # Get dataset info about what to evaluate
            output_res[dname] = {}
            output_msg[dname] = {}
            tracker_list, seq_list, class_list = dataset.get_eval_info()
            print(
                f"\nEvaluating {len(tracker_list)} tracker(s) on "
                f"{len(seq_list)} sequence(s) for {len(class_list)} class(es)"
                f" on {dname} dataset using the following "
                f'metrics: {", ".join(metric_names)}\n'
            )
            # Evaluate each tracker
            for tracker in tracker_list:
                try:
                    output_res, output_msg = self.evaluate_tracker(
                        tracker,
                        dataset,
                        dname,
                        class_list,
                        metrics_list,
                        metric_names,
                        seq_list,
                        output_res,
                        output_msg,
                    )
                except Exception as err:
                    output_res[dname][tracker] = None
                    # isinstance (not type equality) so subclasses of
                    # TrackEvalException also surface their message
                    if isinstance(err, TrackEvalException):
                        output_msg[dname][tracker] = str(err)
                    else:
                        output_msg[dname][tracker] = "Unknown error occurred."
                    print("Tracker %s was unable to be evaluated." % tracker)
                    print(err)
                    traceback.print_exc()
                    if config["LOG_ON_ERROR"] is not None:
                        with open(config["LOG_ON_ERROR"], "a") as f:
                            print(dname, file=f)
                            print(tracker, file=f)
                            print(traceback.format_exc(), file=f)
                            print("\n\n\n", file=f)
                    if config["BREAK_ON_ERROR"]:
                        raise err
                    elif config["RETURN_ON_ERROR"]:
                        return output_res, output_msg
        return output_res, output_msg

    def evaluate_tracker(
        self,
        tracker,
        dataset,
        dname,
        class_list,
        metrics_list,
        metric_names,
        seq_list,
        output_res,
        output_msg,
    ):
        """Evaluate each sequence in parallel or in series, then combine
        per-sequence results over sequences, classes and super-categories,
        print/persist summaries, and record the outcome in output_res /
        output_msg."""
        print("\nEvaluating %s\n" % tracker)
        time_start = time.time()
        config = self.config
        if config["USE_PARALLEL"]:
            with Pool(config["NUM_PARALLEL_CORES"]) as pool:
                _eval_sequence = partial(
                    eval_sequence,
                    dataset=dataset,
                    tracker=tracker,
                    class_list=class_list,
                    metrics_list=metrics_list,
                    metric_names=metric_names,
                )
                results = pool.map(_eval_sequence, seq_list)
                res = dict(zip(seq_list, results))
        else:
            res = {}
            for curr_seq in sorted(seq_list):
                res[curr_seq] = eval_sequence(
                    curr_seq, dataset, tracker, class_list, metrics_list, metric_names
                )
        # collecting combined cls keys (cls averaged, det averaged, super classes)
        cls_keys = []
        res["COMBINED_SEQ"] = {}
        # combine sequences for each class
        for c_cls in class_list:
            res["COMBINED_SEQ"][c_cls] = {}
            for metric, mname in zip(metrics_list, metric_names):
                curr_res = {
                    seq_key: seq_value[c_cls][mname]
                    for seq_key, seq_value in res.items()
                    if seq_key != "COMBINED_SEQ"
                }
                # combine results over all sequences and then over all classes
                res["COMBINED_SEQ"][c_cls][mname] = metric.combine_sequences(curr_res)
        # combine classes
        if dataset.should_classes_combine:
            if config["OUTPUT_PER_SEQ_RES"]:
                video_keys = res.keys()
            else:
                video_keys = ["COMBINED_SEQ"]
            for v_key in video_keys:
                cls_keys += ["average"]
                res[v_key]["average"] = {}
                for metric, mname in zip(metrics_list, metric_names):
                    cls_res = {
                        cls_key: cls_value[mname]
                        for cls_key, cls_value in res[v_key].items()
                        if cls_key not in cls_keys
                    }
                    res[v_key]["average"][
                        mname
                    ] = metric.combine_classes_class_averaged(
                        cls_res, ignore_empty=True
                    )
        # combine classes to super classes
        if dataset.use_super_categories:
            for cat, sub_cats in dataset.super_categories.items():
                cls_keys.append(cat)
                res["COMBINED_SEQ"][cat] = {}
                for metric, mname in zip(metrics_list, metric_names):
                    cat_res = {
                        cls_key: cls_value[mname]
                        for cls_key, cls_value in res["COMBINED_SEQ"].items()
                        if cls_key in sub_cats
                    }
                    res["COMBINED_SEQ"][cat][
                        mname
                    ] = metric.combine_classes_det_averaged(cat_res)
        # Print and output results in various formats
        if config["TIME_PROGRESS"]:
            print(
                f"\nAll sequences for {tracker} finished in"
                f" {time.time() - time_start} seconds"
            )
        output_fol = dataset.get_output_fol(tracker)
        os.makedirs(output_fol, exist_ok=True)
        # take a mean of each field of each thr
        # NOTE(review): when OUTPUT_PER_SEQ_RES is false, all_res is keyed by
        # class while summary_keys is ["COMBINED_SEQ"]; the TETA branch below
        # would then index a missing key — confirm intended config usage
        if config["OUTPUT_PER_SEQ_RES"]:
            all_res = copy.deepcopy(res)
            summary_keys = res.keys()
        else:
            all_res = copy.deepcopy(res["COMBINED_SEQ"])
            summary_keys = ["COMBINED_SEQ"]
        thr_key_list = [50]
        for s_key in summary_keys:
            for metric, mname in zip(metrics_list, metric_names):
                if mname != "TETA":
                    if s_key == "COMBINED_SEQ":
                        metric.print_table(
                            {"COMBINED_SEQ": res["COMBINED_SEQ"][cls_keys[0]][mname]},
                            tracker,
                            cls_keys[0],
                        )
                    continue
                for c_cls in res[s_key].keys():
                    for thr in thr_key_list:
                        all_res[s_key][c_cls][mname][thr] = metric._summary_row(
                            res[s_key][c_cls][mname][thr]
                        )
                    # mean over all per-threshold summary rows for this class
                    x = (
                        np.array(list(all_res[s_key][c_cls]["TETA"].values()))
                        .astype("float")
                        .mean(axis=0)
                    )
                    all_res_summary = list(x.round(decimals=2).astype("str"))
                    all_res[s_key][c_cls][mname]["ALL"] = all_res_summary
                if config["OUTPUT_SUMMARY"] and s_key == "COMBINED_SEQ":
                    for t in thr_key_list:
                        metric.print_summary_table(
                            all_res[s_key][cls_keys[0]][mname][t],
                            t,
                            tracker,
                            cls_keys[0],
                        )
                if config["OUTPUT_TEM_RAW_DATA"]:
                    out_file = os.path.join(output_fol, "teta_summary_results.pth")
                    # use a context manager so the dump file is closed
                    with open(out_file, "wb") as pkl_file:
                        pickle.dump(all_res, pkl_file)
                    print("Saved the TETA summary results.")
        # output
        # NOTE(review): mname, s_key and t leak out of the loops above and are
        # only well-defined when a TETA summary was produced — confirm
        output_res[dname][mname] = all_res[s_key][cls_keys[0]][mname][t]
        output_msg[dname][tracker] = "Success"
        return output_res, output_msg
@_timing.time
def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
    """Function for evaluating a single sequence.

    Returns a dict: seq_res[cls][metric_name] with the per-class results,
    where TETA results are additionally patched with the per-class Cls_FP
    counts accumulated across all classes.
    """
    raw_data = dataset.get_raw_seq_data(tracker, seq)
    seq_res = {}
    if "TETA" in metric_names:
        thresholds = [50]
        # class-agnostic pass used to fix a global TP assignment that all
        # per-class evaluations below share
        data_all_class = dataset.get_preprocessed_seq_data(
            raw_data, "all", thresholds=thresholds
        )
        teta = metrics_list[metric_names.index("TETA")]
        assignment = teta.compute_global_assignment(data_all_class)
    # create a dict to save Cls_FP for each class in different thr.
    # NOTE(review): `thresholds` and `assignment` are only bound inside the
    # TETA branch above, so this function assumes "TETA" is always among the
    # requested metrics — confirm
    cls_fp = {
        key: {
            cls: np.zeros((len(np.arange(0.5, 0.99, 0.05)))) for cls in class_list
        }
        for key in thresholds
    }
    for cls in class_list:
        seq_res[cls] = {}
        data = dataset.get_preprocessed_seq_data(raw_data, cls, assignment, thresholds)
        for metric, mname in zip(metrics_list, metric_names):
            if mname == "TETA":
                seq_res[cls][mname], cls_fp, _ = metric.eval_sequence(
                    data, cls, dataset.clsid2cls_name, cls_fp
                )
            else:
                seq_res[cls][mname] = metric.eval_sequence(data)
    # fold the cross-class classification FPs back into each class's result
    if "TETA" in metric_names:
        for thr in thresholds:
            for cls in class_list:
                seq_res[cls]["TETA"][thr]["Cls_FP"] += cls_fp[thr][cls]
    return seq_res

View File

@@ -0,0 +1,4 @@
# fmt: off
# flake8: noqa
from .teta import TETA

View File

@@ -0,0 +1,148 @@
# fmt: off
# flake8: noqa
from abc import ABC, abstractmethod
import numpy as np
from .. import _timing
from ..utils import TrackEvalException
class _BaseMetric(ABC):
    """Abstract base class for metrics: defines the combine/print/summary
    plumbing shared by all concrete metrics."""

    @abstractmethod
    def __init__(self):
        # field-kind registries; subclasses populate these so the generic
        # summary/detail helpers below know how to format each field
        self.plottable = False
        self.integer_fields = []
        self.float_fields = []
        self.array_labels = []
        self.integer_array_fields = []
        self.float_array_fields = []
        self.fields = []
        self.summary_fields = []
        self.registered = False

    #####################################################################
    # Abstract functions for subclasses to implement

    @_timing.time
    @abstractmethod
    def eval_sequence(self, data):
        ...

    @abstractmethod
    def combine_sequences(self, all_res):
        ...

    @abstractmethod
    def combine_classes_class_averaged(self, all_res, ignore_empty=False):
        ...

    @abstractmethod
    def combine_classes_det_averaged(self, all_res):
        ...

    def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
        """Plot results, only valid for metrics with self.plottable."""
        if self.plottable:
            raise NotImplementedError(
                f"plot_results is not implemented for metric {self.get_name()}"
            )
        else:
            pass

    #####################################################################
    # Helper functions which are useful for all metrics:

    @classmethod
    def get_name(cls):
        """Metric name used in result dicts (the class name itself)."""
        return cls.__name__

    @staticmethod
    def _combine_sum(all_res, field):
        """Combine sequence results via sum"""
        return sum([all_res[k][field] for k in all_res.keys()])

    @staticmethod
    def _combine_weighted_av(all_res, field, comb_res, weight_field):
        """Combine sequence results via weighted average."""
        return sum(
            [all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]
        ) / np.maximum(1.0, comb_res[weight_field])

    def print_table(self, table_res, tracker, cls):
        """Print table of results for all sequences."""
        print("")
        metric_name = self.get_name()
        self._row_print(
            [metric_name + ": " + tracker + "-" + cls] + self.summary_fields
        )
        for seq, results in sorted(table_res.items()):
            if seq == "COMBINED_SEQ":
                continue
            summary_res = self._summary_row(results)
            self._row_print([seq] + summary_res)
        # combined row is always printed last
        summary_res = self._summary_row(table_res["COMBINED_SEQ"])
        self._row_print(["COMBINED"] + summary_res)

    def _summary_row(self, results_):
        """Format one result dict into a list of strings, one per summary
        field (floats are printed as percentages)."""
        vals = []
        for h in self.summary_fields:
            if h in self.float_array_fields:
                vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
            elif h in self.float_fields:
                vals.append("{0:1.5g}".format(100 * float(results_[h])))
            elif h in self.integer_fields:
                vals.append("{0:d}".format(int(results_[h])))
            else:
                raise NotImplementedError(
                    "Summary function not implemented for this field type."
                )
        return vals

    @staticmethod
    def _row_print(*argv):
        """Print results in evenly spaced rows, with more space in first row."""
        if len(argv) == 1:
            argv = argv[0]
        to_print = "%-35s" % argv[0]
        for v in argv[1:]:
            to_print += "%-10s" % str(v)
        print(to_print)

    def summary_results(self, table_res):
        """Return a simple summary of final results for a tracker."""
        return dict(
            zip(self.summary_fields, self._summary_row(table_res["COMBINED_SEQ"]),)
        )

    def detailed_results(self, table_res):
        """Return detailed final results for a tracker."""
        # Get detailed field information: array fields expand into one
        # column per alpha label plus an AUC (mean) column
        detailed_fields = self.float_fields + self.integer_fields
        for h in self.float_array_fields + self.integer_array_fields:
            for alpha in [int(100 * x) for x in self.array_labels]:
                detailed_fields.append(h + "___" + str(alpha))
            detailed_fields.append(h + "___AUC")
        # Get detailed results
        detailed_results = {}
        for seq, res in table_res.items():
            detailed_row = self._detailed_row(res)
            if len(detailed_row) != len(detailed_fields):
                raise TrackEvalException(
                    f"Field names and data have different sizes "
                    f"({len(detailed_row)} and {len(detailed_fields)})"
                )
            detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
        return detailed_results

    def _detailed_row(self, res):
        """Flatten one result dict into the column order built by
        detailed_results (scalars first, then per-alpha values + mean)."""
        detailed_row = []
        for h in self.float_fields + self.integer_fields:
            detailed_row.append(res[h])
        for h in self.float_array_fields + self.integer_array_fields:
            for i, _ in enumerate([int(100 * x) for x in self.array_labels]):
                detailed_row.append(res[h][i])
            detailed_row.append(np.mean(res[h]))
        return detailed_row

View File

@@ -0,0 +1,399 @@
# fmt: off
# flake8: noqa
"""Track Every Thing Accuracy metric."""
import numpy as np
from scipy.optimize import linear_sum_assignment
from .. import _timing
from ._base_metric import _BaseMetric
EPS = np.finfo("float").eps # epsilon
class TETA(_BaseMetric):
"""TETA metric."""
def __init__(self, exhaustive=False, config=None):
"""Initialize metric."""
super().__init__()
self.plottable = True
self.array_labels = np.arange(0.0, 0.99, 0.05)
self.cls_array_labels = np.arange(0.5, 0.99, 0.05)
self.integer_array_fields = [
"Loc_TP",
"Loc_FN",
"Loc_FP",
"Cls_TP",
"Cls_FN",
"Cls_FP",
]
self.float_array_fields = (
["TETA", "LocA", "AssocA", "ClsA"]
+ ["LocRe", "LocPr"]
+ ["AssocRe", "AssocPr"]
+ ["ClsRe", "ClsPr"]
)
self.fields = self.float_array_fields + self.integer_array_fields
self.summary_fields = self.float_array_fields
self.exhaustive = exhaustive
    def compute_global_assignment(self, data_thr, alpha=0.5):
        """Compute global assignment of TP.

        Args:
            data_thr: dict keyed by threshold of preprocessed class-agnostic
                sequence data.
            alpha: similarity threshold for accepting a match.

        Returns:
            res[thr][t]: {original gt track id -> matched original predicted
            track id} for each timestep t.
        """
        res = {
            thr: {t: {} for t in range(data_thr[thr]["num_timesteps"])}
            for thr in data_thr
        }
        for thr in data_thr:
            data = data_thr[thr]
            # return empty result if tracker or gt sequence is empty
            # NOTE(review): this returns from inside the threshold loop, so
            # any later thresholds are left unassigned when an earlier one is
            # empty — confirm whether `continue` was intended
            if data["num_tk_overlap_dets"] == 0 or data["num_gt_dets"] == 0:
                return res
            # global alignment score
            ga_score, _, _ = self.compute_global_alignment_score(data)
            # calculate scores for each timestep
            for t, (gt_ids_t, tk_ids_t) in enumerate(
                zip(data["gt_ids"], data["tk_ids"])
            ):
                # get matches optimizing for TETA; a scalar alpha yields
                # single-entry row/col lists, hence the [0] below
                amatch_rows, amatch_cols = self.compute_matches(
                    data, t, ga_score, gt_ids_t, tk_ids_t, alpha=alpha
                )
                # translate relabeled ids back to original ids for the output
                gt_ids = [data["gt_id_map"][tid] for tid in gt_ids_t[amatch_rows[0]]]
                matched_ids = [
                    data["tk_id_map"][tid] for tid in tk_ids_t[amatch_cols[0]]
                ]
                res[thr][t] = dict(zip(gt_ids, matched_ids))
        return res
    def eval_sequence_single_thr(self, data, cls, cid2clsname, cls_fp_thr, thr):
        """Computes TETA metric for one threshold for one sequence.

        Args:
            data: preprocessed per-class sequence data for this threshold.
            cls: class name being evaluated.
            cid2clsname: category id -> class name mapping (for Cls_FP).
            cls_fp_thr: running per-class Cls_FP accumulators (mutated).
            thr: integer IoU threshold in percent (used for FP candidates).

        Returns:
            (res, cls_fp_thr, class_info_list).
        """
        res = {}
        class_info_list = []
        # classification fields use the 10-entry >=0.5 alpha grid; all other
        # fields use the full 20-entry grid
        for field in self.float_array_fields + self.integer_array_fields:
            if field.startswith("Cls"):
                res[field] = np.zeros(len(self.cls_array_labels), dtype=float)
            else:
                res[field] = np.zeros((len(self.array_labels)), dtype=float)
        # return empty result if tracker or gt sequence is empty
        if data["num_tk_overlap_dets"] == 0:
            res["Loc_FN"] = data["num_gt_dets"] * np.ones(
                (len(self.array_labels)), dtype=float
            )
            if self.exhaustive:
                cls_fp_thr[cls] = data["num_tk_cls_dets"] * np.ones(
                    (len(self.cls_array_labels)), dtype=float
                )
            res = self._compute_final_fields(res)
            return res, cls_fp_thr, class_info_list
        if data["num_gt_dets"] == 0:
            if self.exhaustive:
                cls_fp_thr[cls] = data["num_tk_cls_dets"] * np.ones(
                    (len(self.cls_array_labels)), dtype=float
                )
            res = self._compute_final_fields(res)
            return res, cls_fp_thr, class_info_list
        # global alignment score
        ga_score, gt_id_count, tk_id_count = self.compute_global_alignment_score(data)
        matches_counts = [np.zeros_like(ga_score) for _ in self.array_labels]
        # calculate scores for each timestep
        for t, (gt_ids_t, tk_ids_t, tk_overlap_ids_t, tk_cls_ids_t) in enumerate(
            zip(
                data["gt_ids"],
                data["tk_ids"],
                data["tk_overlap_ids"],
                data["tk_class_eval_tk_ids"],
            )
        ):
            # deal with the case that there are no gt_det/tk_det in a timestep
            if len(gt_ids_t) == 0:
                if self.exhaustive:
                    cls_fp_thr[cls] += len(tk_cls_ids_t)
                continue
            # get matches optimizing for TETA (one match set per alpha)
            amatch_rows, amatch_cols = self.compute_matches(
                data, t, ga_score, gt_ids_t, tk_ids_t, list(self.array_labels)
            )
            # map overlap_ids to original ids.
            if len(tk_overlap_ids_t) != 0:
                # locate each overlap id's column in tk_ids_t via argsort +
                # searchsorted, then keep ids overlapping above thr as
                # localization-FP candidates
                sorter = np.argsort(tk_ids_t)
                indexes = sorter[
                    np.searchsorted(tk_ids_t, tk_overlap_ids_t, sorter=sorter)
                ]
                sim_t = data["sim_scores"][t][:, indexes]
                fpl_candidates = tk_overlap_ids_t[(sim_t >= (thr / 100)).any(axis=0)]
                fpl_candidates_ori_ids_t = np.array(
                    [data["tk_id_map"][tid] for tid in fpl_candidates]
                )
            else:
                fpl_candidates_ori_ids_t = []
            if self.exhaustive:
                cls_fp_thr[cls] += len(tk_cls_ids_t) - len(tk_overlap_ids_t)
            # calculate and accumulate basic statistics
            for a, alpha in enumerate(self.array_labels):
                match_row, match_col = amatch_rows[a], amatch_cols[a]
                num_matches = len(match_row)
                matched_ori_ids = set(
                    [data["tk_id_map"][tid] for tid in tk_ids_t[match_col]]
                )
                match_tk_cls = data["tk_classes"][t][match_col]
                wrong_tk_cls = match_tk_cls[match_tk_cls != data["gt_classes"][t]]
                num_class_and_det_matches = np.sum(
                    match_tk_cls == data["gt_classes"][t]
                )
                if alpha >= 0.5:
                    # a - 10 maps the 20-entry alpha index (alphas >= 0.5
                    # start at index 10) onto the 10-entry Cls arrays
                    for cid in wrong_tk_cls:
                        if cid in cid2clsname:
                            cname = cid2clsname[cid]
                            cls_fp_thr[cname][a - 10] += 1
                    res["Cls_TP"][a - 10] += num_class_and_det_matches
                    res["Cls_FN"][a - 10] += num_matches - num_class_and_det_matches
                res["Loc_TP"][a] += num_matches
                res["Loc_FN"][a] += len(gt_ids_t) - num_matches
                # overlap candidates that were not matched count as Loc FPs
                res["Loc_FP"][a] += len(set(fpl_candidates_ori_ids_t) - matched_ori_ids)
                if num_matches > 0:
                    matches_counts[a][gt_ids_t[match_row], tk_ids_t[match_col]] += 1
        # calculate AssocA, AssocRe, AssocPr
        self.compute_association_scores(res, matches_counts, gt_id_count, tk_id_count)
        # calculate final scores
        res = self._compute_final_fields(res)
        return res, cls_fp_thr, class_info_list
def compute_global_alignment_score(self, data):
"""Computes global alignment score."""
num_matches = np.zeros((data["num_gt_ids"], data["num_tk_ids"]))
gt_id_count = np.zeros((data["num_gt_ids"], 1))
tk_id_count = np.zeros((1, data["num_tk_ids"]))
# loop through each timestep and accumulate global track info.
for t, (gt_ids_t, tk_ids_t) in enumerate(zip(data["gt_ids"], data["tk_ids"])):
# count potential matches between ids in each time step
# these are normalized, weighted by match similarity
sim = data["sim_scores"][t]
sim_iou_denom = sim.sum(0, keepdims=True) + sim.sum(1, keepdims=True) - sim
sim_iou = np.zeros_like(sim)
mask = sim_iou_denom > (0 + EPS)
sim_iou[mask] = sim[mask] / sim_iou_denom[mask]
num_matches[gt_ids_t[:, None], tk_ids_t[None, :]] += sim_iou
# calculate total number of dets for each gt_id and tk_id.
gt_id_count[gt_ids_t] += 1
tk_id_count[0, tk_ids_t] += 1
# Calculate overall Jaccard alignment score between IDs
ga_score = num_matches / (gt_id_count + tk_id_count - num_matches)
return ga_score, gt_id_count, tk_id_count
def compute_matches(self, data, t, ga_score, gt_ids, tk_ids, alpha):
    """Compute matches at timestep t based on the global alignment score.

    The assignment cost combines the global alignment score with the
    per-frame similarity, so frame-level ties are broken in favour of
    globally consistent tracks.

    Args:
        data: sequence data dict (provides "sim_scores").
        t: timestep index.
        ga_score: matrix from compute_global_alignment_score.
        gt_ids / tk_ids: id index arrays present at timestep t.
        alpha: one similarity threshold, or a list of thresholds.

    Returns:
        (alpha_match_rows, alpha_match_cols): per-threshold lists of matched
        row/column indices into gt_ids / tk_ids.
    """
    sim = data["sim_scores"][t]
    score_mat = ga_score[gt_ids[:, None], tk_ids[None, :]] * sim
    # Hungarian algorithm to find best matches
    match_rows, match_cols = linear_sum_assignment(-score_mat)
    if not isinstance(alpha, list):
        alpha = [alpha]
    alpha_match_rows, alpha_match_cols = [], []
    for a in alpha:
        # keep only assignments whose raw per-frame similarity clears the
        # threshold (EPS makes the comparison robust to float rounding)
        matched_mask = sim[match_rows, match_cols] >= a - EPS
        alpha_match_rows.append(match_rows[matched_mask])
        alpha_match_cols.append(match_cols[matched_mask])
    return alpha_match_rows, alpha_match_cols
def compute_association_scores(self, res, matches_counts, gt_id_count, tk_id_count):
    """Calculate association scores for each alpha.

    First calculate scores per gt_id/tk_id combo,
    and then average over the number of detections.
    Writes "AssocA", "AssocRe" and "AssocPr" into `res` in place, one entry
    per alpha in self.array_labels.
    """
    for a, _ in enumerate(self.array_labels):
        matches_count = matches_counts[a]
        # Jaccard association per (gt, tk) pair ...
        ass_a = matches_count / np.maximum(
            1, gt_id_count + tk_id_count - matches_count
        )
        # ... averaged over matched detections (weighted by match counts)
        res["AssocA"][a] = np.sum(matches_count * ass_a) / np.maximum(
            1, res["Loc_TP"][a]
        )
        ass_re = matches_count / np.maximum(1, gt_id_count)
        res["AssocRe"][a] = np.sum(matches_count * ass_re) / np.maximum(
            1, res["Loc_TP"][a]
        )
        ass_pr = matches_count / np.maximum(1, tk_id_count)
        res["AssocPr"][a] = np.sum(matches_count * ass_pr) / np.maximum(
            1, res["Loc_TP"][a]
        )
@_timing.time
def eval_sequence(self, data, cls, cls_id_name_mapping, cls_fp):
    """Evaluate a single sequence, running the single-threshold evaluation
    once per IoU threshold present in `data`.

    Returns (per-threshold results, updated cls_fp, per-threshold class info).
    """
    per_thr_res = {}
    per_thr_class_info = {}
    for threshold in data:
        thr_res, thr_cls_fp, thr_cls_info = self.eval_sequence_single_thr(
            data[threshold], cls, cls_id_name_mapping, cls_fp[threshold], threshold
        )
        per_thr_res[threshold] = thr_res
        cls_fp[threshold] = thr_cls_fp
        per_thr_class_info[threshold] = thr_cls_info
    return per_thr_res, cls_fp, per_thr_class_info
def combine_sequences(self, all_res):
    """Combines metrics across all sequences, one combination per IoU
    threshold found in the per-sequence results.

    Args:
        all_res: dict mapping sequence key -> {threshold: metrics dict}.

    Returns:
        dict mapping threshold -> combined metrics over all sequences.
    """
    # Thresholds are read off the first sequence; default to [50] when there
    # are no sequences at all.
    if all_res:
        thresholds = list(next(iter(all_res.values())).keys())
    else:
        thresholds = [50]
    combined = {}
    for thr in thresholds:
        per_seq = {seq_key: seq_res[thr] for seq_key, seq_res in all_res.items()}
        combined[thr] = self._combine_sequences_thr(per_seq)
    return combined
def _combine_sequences_thr(self, all_res):
    """Combines sequences over each threshold.

    Integer count fields are summed across sequences; the association fields
    are averaged weighted by "Loc_TP"; derived fields are then recomputed.
    """
    res = {}
    for field in self.integer_array_fields:
        res[field] = self._combine_sum(all_res, field)
    for field in ["AssocRe", "AssocPr", "AssocA"]:
        res[field] = self._combine_weighted_av(
            all_res, field, res, weight_field="Loc_TP"
        )
    res = self._compute_final_fields(res)
    return res
def combine_classes_class_averaged(self, all_res, ignore_empty=False):
    """Combines metrics across all classes by averaging over classes.

    If 'ignore_empty' is True, then it only sums over classes
    with at least one gt or predicted detection.
    """
    # Thresholds are read off the first class; default to [50] when there
    # are no classes at all.
    if all_res:
        thresholds = list(next(iter(all_res.values())).keys())
    else:
        thresholds = [50]
    combined = {}
    for thr in thresholds:
        per_cls = {cls_key: cls_res[thr] for cls_key, cls_res in all_res.items()}
        combined[thr] = self._combine_classes_class_averaged_thr(
            per_cls, ignore_empty=ignore_empty
        )
    return combined
def _combine_classes_class_averaged_thr(self, all_res, ignore_empty=False):
    """Combines classes over each threshold.

    Integer count fields are summed across classes; float metric fields are
    averaged with equal class weight (macro average). When `ignore_empty` is
    True, classes with no gt or predicted detections are excluded first.
    """
    res = {}

    def check_empty(val):
        """Returns True if empty (no detections at any alpha)."""
        return not (val["Loc_TP"] + val["Loc_FN"] + val["Loc_FP"] > 0 + EPS).any()

    for field in self.integer_array_fields:
        if ignore_empty:
            res_field = {k: v for k, v in all_res.items() if not check_empty(v)}
        else:
            res_field = {k: v for k, v in all_res.items()}
        res[field] = self._combine_sum(res_field, field)
    for field in self.float_array_fields:
        if ignore_empty:
            res_field = [v[field] for v in all_res.values() if not check_empty(v)]
        else:
            res_field = [v[field] for v in all_res.values()]
        # equal-weight (macro) average over classes
        res[field] = np.mean(res_field, axis=0)
    return res
def combine_classes_det_averaged(self, all_res):
    """Combines metrics across all classes by averaging over detections."""
    # Thresholds are read off the first class; default to [50] when there
    # are no classes at all.
    if all_res:
        thresholds = list(next(iter(all_res.values())).keys())
    else:
        thresholds = [50]
    combined = {}
    for thr in thresholds:
        per_cls = {cls_key: cls_res[thr] for cls_key, cls_res in all_res.items()}
        combined[thr] = self._combine_classes_det_averaged_thr(per_cls)
    return combined
def _combine_classes_det_averaged_thr(self, all_res):
    """Combines detections over each threshold.

    Unlike the class-averaged variant, counts are pooled over all classes
    before the derived fields are recomputed (micro average over detections).
    """
    res = {}
    for field in self.integer_array_fields:
        res[field] = self._combine_sum(all_res, field)
    for field in ["AssocRe", "AssocPr", "AssocA"]:
        res[field] = self._combine_weighted_av(
            all_res, field, res, weight_field="Loc_TP"
        )
    res = self._compute_final_fields(res)
    return res
@staticmethod
def _compute_final_fields(res):
"""Calculate final metric values.
This function is used both for both per-sequence calculation,
and in combining values across sequences.
"""
# LocA
res["LocRe"] = res["Loc_TP"] / np.maximum(1, res["Loc_TP"] + res["Loc_FN"])
res["LocPr"] = res["Loc_TP"] / np.maximum(1, res["Loc_TP"] + res["Loc_FP"])
res["LocA"] = res["Loc_TP"] / np.maximum(
1, res["Loc_TP"] + res["Loc_FN"] + res["Loc_FP"]
)
# ClsA
res["ClsRe"] = res["Cls_TP"] / np.maximum(1, res["Cls_TP"] + res["Cls_FN"])
res["ClsPr"] = res["Cls_TP"] / np.maximum(1, res["Cls_TP"] + res["Cls_FP"])
res["ClsA"] = res["Cls_TP"] / np.maximum(
1, res["Cls_TP"] + res["Cls_FN"] + res["Cls_FP"]
)
res["ClsRe"] = np.mean(res["ClsRe"])
res["ClsPr"] = np.mean(res["ClsPr"])
res["ClsA"] = np.mean(res["ClsA"])
res["TETA"] = (res["LocA"] + res["AssocA"] + res["ClsA"]) / 3
return res
def print_summary_table(self, thr_res, thr, tracker, cls):
    """Print a two-row summary table: a header row naming the metric,
    threshold, tracker and class, followed by the combined results row."""
    print("")
    header_cell = f"{self.get_name()}{str(thr)}: {tracker}-{cls}"
    self._row_print([header_cell] + self.summary_fields)
    self._row_print(["COMBINED"] + thr_res)

View File

@@ -0,0 +1,46 @@
# fmt: off
# flake8: noqa
import csv
import os
from collections import OrderedDict
def validate_metrics_list(metrics_list):
    """Get names of metric class and ensures they are unique, further checks that the fields within each metric class
    do not have overlapping names.

    Args:
        metrics_list: iterable of metric objects exposing `get_name()` and `fields`.

    Returns:
        The list of metric names, in input order.

    Raises:
        TrackEvalException: on duplicate metric names or duplicate field names.
    """
    metric_names = [metric.get_name() for metric in metrics_list]
    # duplicate metric names would make results ambiguous
    if len(set(metric_names)) != len(metric_names):
        raise TrackEvalException(
            "Code being run with multiple metrics of the same name"
        )
    all_fields = []
    for metric in metrics_list:
        all_fields.extend(metric.fields)
    # field names must be globally unique across all metrics
    if len(set(all_fields)) != len(all_fields):
        raise TrackEvalException(
            "Code being run with multiple metrics with fields of the same name"
        )
    return metric_names
def get_track_id_str(ann):
    """Get name of track ID in annotation.

    The known key names are checked in priority order ("track_id", then
    "instance_id", then "scalabel_id") and the first one present wins.

    Args:
        ann: a single annotation dict.

    Returns:
        The key under which this annotation stores its track identity.

    Raises:
        TrackEvalException: if the annotation carries no known track ID key.
    """
    # Priority order matters: prefer the canonical "track_id" when present.
    for tk_str in ("track_id", "instance_id", "scalabel_id"):
        if tk_str in ann:
            return tk_str
    # Previously `assert False, ...`, which is silently stripped under
    # `python -O` (the function would then raise UnboundLocalError at the
    # return site); raise an explicit, catchable exception instead.
    raise TrackEvalException("No track/instance ID.")
class TrackEvalException(Exception):
    """Custom exception for catching expected errors."""

    pass

View File

@@ -0,0 +1,146 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import copy
import json
import logging
import numpy as np
import pycocotools.mask as mask_util
from pycocotools.coco import COCO
from typing_extensions import override
class YTVIS(COCO):
    """
    Helper class for reading YT-VIS annotations

    Normalizes YT-VIS-style video annotations (videos, per-frame "bboxes" /
    "segmentations" / "areas") into the COCO schema so the inherited COCO
    indexing and lookup machinery works unchanged.
    """

    @override
    def __init__(self, annotation_file: str = None, ignore_gt_cats: bool = True):
        """
        Args:
            annotation_file: Path to the annotation file
            ignore_gt_cats: If True, we ignore the ground truth categories and replace them with a dummy "object" category. This is useful for Phrase AP evaluation.
        """
        self.ignore_gt_cats = ignore_gt_cats
        super().__init__(annotation_file=annotation_file)

    @override
    def createIndex(self):
        """Rename YT-VIS fields to their COCO equivalents, then build the
        standard COCO index via the parent implementation."""
        # We rename some keys to match the COCO format before creating the index.
        if "annotations" in self.dataset:
            for ann in self.dataset["annotations"]:
                if "video_id" in ann:
                    # COCO indexes by "image_id"; here it carries the video id
                    ann["image_id"] = int(ann.pop("video_id"))
                if self.ignore_gt_cats:
                    # collapse all classes onto the single dummy "object" category
                    ann["category_id"] = -1
                else:
                    ann["category_id"] = int(ann["category_id"])
                if "bboxes" in ann:
                    # note that in some datasets we load under this YTVIS class,
                    # some "bboxes" could be None for when the GT object is invisible,
                    # so we replace them with [0, 0, 0, 0]
                    ann["bboxes"] = [
                        bbox if bbox is not None else [0, 0, 0, 0]
                        for bbox in ann["bboxes"]
                    ]
                if "areas" in ann:
                    # similar to "bboxes", some areas could be None for when the GT
                    # object is invisible, so we replace them with 0
                    areas = [a if a is not None else 0 for a in ann["areas"]]
                    # Compute average area of tracklet
                    ann["area"] = np.mean(areas)
        if "videos" in self.dataset:
            for vid in self.dataset["videos"]:
                vid["id"] = int(vid["id"])
            # COCO expects an "images" list; videos play that role here
            self.dataset["images"] = self.dataset.pop("videos")
        if self.ignore_gt_cats:
            self.dataset["categories"] = [
                {"supercategory": "object", "id": -1, "name": "object"}
            ]
        else:
            for cat in self.dataset["categories"]:
                cat["id"] = int(cat["id"])
        super().createIndex()

    @override
    def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
        """Same as COCO.getAnnIds, except areaRng is intentionally ignored
        (annotation "area" is the tracklet's average area, not per-frame)."""
        if len(areaRng) > 0:
            logging.warning(
                "Note that we filter out objects based on their *average* area across the video, not per frame area"
            )
        return super().getAnnIds(imgIds=imgIds, catIds=catIds, iscrowd=iscrowd)

    @override
    def showAnns(self, anns, draw_bbox=False):
        raise NotImplementedError("Showing annotations is not supported")

    @override
    def loadRes(self, resFile):
        """Load prediction results into a new YTVIS object.

        Adapted from COCO.loadRes to support tracklets/masklets: per-frame
        "bboxes"/"segmentations"/"areas" lists instead of single values.

        Args:
            resFile: path to a JSON results file, or an already-loaded list
                of result dicts (numpy arrays are not supported).
        """
        # Adapted from COCO.loadRes to support tracklets/masklets
        res = YTVIS(ignore_gt_cats=self.ignore_gt_cats)
        res.dataset["images"] = [img for img in self.dataset["images"]]
        if type(resFile) == str:
            with open(resFile) as f:
                anns = json.load(f)
        elif type(resFile) == np.ndarray:
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert type(anns) == list, "results is not an array of objects"
        annsImgIds = [ann["image_id"] for ann in anns]
        assert set(annsImgIds) == (
            set(annsImgIds) & set(self.getImgIds())
        ), "Results do not correspond to current coco set"
        # NOTE(review): `anns[0]` below assumes a non-empty results list —
        # an empty prediction file would raise IndexError here; confirm
        # callers guarantee at least one result.
        if "bboxes" in anns[0] and not anns[0]["bboxes"] == []:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                bbs = [(bb if bb is not None else [0, 0, 0, 0]) for bb in ann["bboxes"]]
                xxyy = [[bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] for bb in bbs]
                if not "segmentations" in ann:
                    # synthesize a rectangular polygon per frame from the box
                    ann["segmentations"] = [
                        [[x1, y1, x1, y2, x2, y2, x2, y1]] for (x1, x2, y1, y2) in xxyy
                    ]
                ann["areas"] = [bb[2] * bb[3] for bb in bbs]
                # NOTE: We also compute average area of a tracklet across video, allowing us to compute area based mAP.
                ann["area"] = np.mean(ann["areas"])
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "segmentations" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                # derive per-frame boxes/areas from the RLE masks
                ann["bboxes"] = [
                    mask_util.toBbox(segm) for segm in ann["segmentations"]
                ]
                if "areas" not in ann:
                    ann["areas"] = [
                        mask_util.area(segm) for segm in ann["segmentations"]
                    ]
                # NOTE: We also compute average area of a tracklet across video, allowing us to compute area based mAP.
                ann["area"] = np.mean(ann["areas"])
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        res.dataset["annotations"] = anns
        res.createIndex()
        return res

    @override
    def download(self, tarDir=None, imgIds=[]):
        raise NotImplementedError

    @override
    def loadNumpyAnnotations(self, data):
        raise NotImplementedError("We don't support numpy annotations for now")

    @override
    def annToRLE(self, ann):
        raise NotImplementedError("We expect masks to be already in RLE format")

    @override
    def annToMask(self, ann):
        raise NotImplementedError("We expect masks to be already in RLE format")

411
sam3/eval/ytvis_eval.py Normal file
View File

@@ -0,0 +1,411 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import copy
import gc
import logging
import os
from collections import defaultdict
from operator import xor
from pathlib import Path
from typing import List, Optional
import numpy as np
import pycocotools.mask as mask_util
import torch
from pycocotools.cocoeval import COCOeval
from sam3.eval.cgf1_eval import CGF1Eval
from sam3.eval.coco_eval_offline import convert_to_xywh
from sam3.model.box_ops import box_xywh_inter_union
from sam3.train.masks_ops import rle_encode
from sam3.train.utils import distributed as dist
from typing_extensions import override
try:
import rapidjson as json
except ModuleNotFoundError:
import json
from iopath.common.file_io import g_pathmgr
class YTVISevalMixin:
    """
    Identical to COCOeval but adapts computeIoU to compute IoU between tracklets/masklets.
    """

    @override
    def _prepare(self):
        """
        Copied from cocoeval.py but doesn't convert masks to RLEs (we assume they already are RLEs)
        """
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
        # set ignore flag
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            # NOTE(review): this next line unconditionally overwrites the
            # value read above with the crowd flag, making the previous
            # assignment a no-op — confirm this is intended (it differs from
            # upstream cocoeval.py, which ORs the two).
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
        for dt in dts:
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results

    def computeIoU(self, imgId, catId):
        """
        Compute IoU between tracklets. Copied from cocoeval.py but adapted for videos (in YT-VIS format)

        Returns an (num_dt, num_gt) array of video-level IoUs: box IoUs are
        summed over frames before dividing; mask IoUs accumulate per-frame
        RLE intersection/union areas.
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 or len(dt) == 0:
            return []
        # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
        # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
        assert hasattr(self, "sort_inds_by_scores_in_iou"), (
            "subclasses that inherits YTVISevalMixin should set `self.sort_inds_by_scores_in_iou` "
            "(True for class mAP and phrase AP, False for demo F1)"
        )
        if self.sort_inds_by_scores_in_iou:
            inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
            dt = [dt[i] for i in inds]
            if len(dt) > p.maxDets[-1]:
                dt = dt[0 : p.maxDets[-1]]
        if p.iouType == "segm":
            g = [g["segmentations"] for g in gt]
            d = [d["segmentations"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bboxes"] for g in gt]
            d = [d["bboxes"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        def iou_tracklets(preds, gts):
            # Video-level box IoU: per-frame intersections/unions are summed
            # over time before taking the ratio.
            preds = torch.tensor(preds)
            gts = torch.tensor(gts)
            inter, union = box_xywh_inter_union(
                preds.unsqueeze(1), gts.unsqueeze(0)
            )  # Num preds x Num GTS x Num frames
            inter = inter.sum(-1)
            union = union.sum(-1)
            assert (
                union > 0
            ).all(), (
                "There exists a tracklet with zero GTs across time. This is suspicious"
            )
            return inter / union

        def iou_masklets(preds, gts):
            # Video-level mask IoU over per-frame RLEs; frames where either
            # side is missing (falsy) contribute only to the union.
            inter = 0
            union = 0
            for p_i, gt_i in zip(preds, gts):
                if p_i and gt_i:
                    # Compute areas of intersection and union
                    inter += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=True)
                    )
                    union += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=False)
                    )
                elif gt_i:
                    union += mask_util.area(gt_i)
                elif p_i:
                    union += mask_util.area(p_i)
            if union > 0:
                iou = inter / union
                assert iou >= 0 and iou <= 1, "Encountered an error in IoU computation"
            else:
                # both masklets empty everywhere: defined as perfect overlap
                assert np.isclose(inter, 0) and np.isclose(
                    union, 0
                ), "Encountered an error in IoU computation"
                iou = 1
            return iou

        if p.iouType == "segm":
            ious = [[iou_masklets(d_i, g_i) for g_i in g] for d_i in d]
        else:
            ious = iou_tracklets(d, g)
        return np.array(ious)
class YTVISeval(YTVISevalMixin, COCOeval):
    """COCOeval specialized to video tracklets/masklets (YT-VIS format)."""

    # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
    sort_inds_by_scores_in_iou = True
class VideoDemoF1Eval(YTVISevalMixin, CGF1Eval):
    """cgF1-style evaluation on video tracklets/masklets."""

    # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
    sort_inds_by_scores_in_iou = False
class YTVISResultsWriter:
"""
Gather and dumps predictions in YT-VIS format.
Expected flow of API calls: reset() -> N * update() -> compute_synced()
"""
def __init__(
    self,
    dump_file: str,
    postprocessor,
    gather_pred_via_filesys=False,
    pred_file_evaluators: Optional[List] = None,
    save_per_frame_scores: bool = False,
    write_eval_metrics_file: bool = True,
    eval_metrics_file_suffix: str = ".sam3_eval_metrics",
):
    """
    Args:
        dump_file: path of the YT-VIS-format prediction file to write.
        postprocessor: object whose `process_results` turns raw model
            outputs into per-video prediction dicts.
        gather_pred_via_filesys: gather predictions to rank 0 through the
            filesystem instead of a collective op.
        pred_file_evaluators: evaluation hooks run on the dumped file.
        save_per_frame_scores: also store per-frame scores per tracklet.
        write_eval_metrics_file: additionally write evaluation metrics to a
            sidecar file (`dump_file + eval_metrics_file_suffix`).
        eval_metrics_file_suffix: suffix for that sidecar metrics file.
    """
    self.dump_file = dump_file
    self.dump = []
    self.postprocessor = postprocessor
    self.gather_pred_via_filesys = gather_pred_via_filesys
    # Only the main process creates the output directory.
    if dist.is_main_process():
        out_dir = os.path.dirname(self.dump_file)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)
            logging.info(f"Creating folder: {out_dir}")
    # the evaluation hooks to be applied to the prediction files
    self.pred_file_evaluators = pred_file_evaluators or []
    self.save_per_frame_scores = save_per_frame_scores
    # in addition to the prediction file, we also write the evaluation
    # metrics for easier debugging and analysis (stored in a separate file
    # so the dumped prediction file stays in plain YT-VIS format)
    self.write_eval_metrics_file = write_eval_metrics_file
    if self.write_eval_metrics_file:
        self.eval_metrics_file = self.dump_file + eval_metrics_file_suffix
        os.makedirs(os.path.dirname(self.eval_metrics_file), exist_ok=True)
def _dump_vid_preds(self, results):
dumped_results = copy.deepcopy(results)
self.dump.extend(dumped_results)
def prepare(self, predictions):
    """Convert per-video postprocessed predictions into YT-VIS result dicts.

    Args:
        predictions: dict mapping video_id -> prediction dict carrying
            "boxes", "scores", "labels", and exactly one of "masks" (dense
            tensors) or "masks_rle" (pre-encoded RLEs); optionally
            "per_frame_scores" when self.save_per_frame_scores is set.

    Returns:
        A flat list of YT-VIS-format dicts, one per predicted tracklet.
    """
    ytvis_results = []
    for video_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue
        for k in ["boxes", "scores", "labels"]:
            assert (
                k in prediction
            ), f"Expected predictions to have `{k}` key, available keys are {prediction.keys()}"
        if self.save_per_frame_scores:
            assert (
                "per_frame_scores" in prediction
            ), f"Expected predictions to have `per_frame_scores` key, available keys are {prediction.keys()}"
        # exactly one of the two mask representations must be present
        assert xor(
            "masks" in prediction, "masks_rle" in prediction
        ), f"Expected predictions to have either `masks` key or `masks_rle` key, available keys are {prediction.keys()}"
        boxes = prediction["boxes"]
        boxes = convert_to_xywh(boxes).tolist()
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()
        if "masks" in prediction:
            masks = prediction["masks"].squeeze(2)
            assert (
                masks.ndim == 4
            ), "Expected masks to be of shape(N_preds,T_frames,H,W)"
            # per-frame pixel counts, then RLE-encode every masklet
            areas = [mask.flatten(1).sum(1).tolist() for mask in masks]
            rles = [rle_encode(masklet) for masklet in masks]
            # memory clean
            del masks
            del prediction["masks"]
        elif "masks_rle" in prediction:
            rles = prediction.pop("masks_rle")
            # a None RLE marks a frame where the object is absent -> area 0
            areas = [
                [0 if rle is None else rle.pop("area") for rle in rles_per_obj]
                for rles_per_obj in rles
            ]
        else:
            raise ValueError(
                "Expected either `masks` or `masks_rle` key in the predictions."
            )
        new_results = [
            {
                "video_id": video_id,
                "category_id": track_label,
                "bboxes": track_boxes,
                "score": track_score,
                "segmentations": track_masks,
                "areas": track_areas,
            }
            for (
                track_boxes,
                track_masks,
                track_areas,
                track_score,
                track_label,
            ) in zip(boxes, rles, areas, scores, labels)
        ]
        # Optionally, save per-frame scores
        if self.save_per_frame_scores:
            per_frame_scores = prediction["per_frame_scores"].tolist()
            for res, track_per_frame_scores in zip(new_results, per_frame_scores):
                res["per_frame_scores"] = track_per_frame_scores
        ytvis_results.extend(new_results)
    return ytvis_results
def set_sync_device(self, device: torch.device):
    """Record the device to use when synchronizing results across processes."""
    self._sync_device = device
def update(self, *args, **kwargs):
predictions = self.postprocessor.process_results(*args, **kwargs)
results = self.prepare(predictions)
self._dump_vid_preds(results)
def _dump_preds(self):
    """Write the buffered predictions to `self.dump_file` as JSON.

    Only the main process writes; every rank clears its buffer afterwards.

    Returns:
        The dumped file path (str) on the main process, None elsewhere.
    """
    if not dist.is_main_process():
        # non-main ranks only release their buffered predictions
        self.dump = []
        gc.collect()
        return
    dumped_file = Path(self.dump_file)
    logging.info(f"YTVIS evaluator: Dumping predictions to {dumped_file}")
    with g_pathmgr.open(str(dumped_file), "w") as f:
        json.dump(self.dump, f)
    # free the (potentially large) buffer once it is on disk
    self.dump = []
    gc.collect()
    return str(dumped_file)
def synchronize_between_processes(self):
    """Gather predictions from all processes, dropping cross-GPU duplicates.

    See `_dedup_pre_gather` for why duplicates can occur. After this call,
    `self.dump` holds the deduplicated union of every rank's predictions.
    """
    logging.info("YT-VIS evaluator: Synchronizing between processes")
    dump_dict = self._dedup_pre_gather(self.dump)
    if self.gather_pred_via_filesys:
        # presumably avoids collective-op size limits for very large dumps
        # — see the dist helper for the actual mechanism
        dump_dict_all_gpus = dist.gather_to_rank_0_via_filesys(dump_dict)
    else:
        dump_dict_all_gpus = dist.all_gather(dump_dict, force_cpu=True)
    self.dump = self._dedup_post_gather(dump_dict_all_gpus)
    logging.info(f"Gathered all {len(self.dump)} predictions")
def _dedup_pre_gather(self, predictions):
"""
Organize the predictions as a dict-of-list using (video_id, category_id) as keys
for deduplication after gathering them across GPUs.
During evaluation, PyTorch data loader under `drop_last: False` would wrap
around the dataset length to be a multiple of world size (GPU num) and duplicate
the remaining batches. This causes the same test sample to appear simultaneously
in multiple GPUs, resulting in duplicated predictions being saved into prediction
files. These duplicates are then counted as false positives under detection mAP
metrics (since a ground truth can be matched with only one prediction).
For example, if there are 4 GPUs and 6 samples [A1, A2, B1, B2, C1, C2], the data
loader (under `drop_last: False`) would load it by wrapping it around like
`[A1, A2, B1, B2, C1, C2, *A1*, *A2*]` to make a multiple of 4 and then split it as
- GPU 0: A1, C1
- GPU 1: A2, C2
- GPU 3: B1, **A1**
- GPU 4: B2, **A2**
(as in DistributedSampler in https://github.com/pytorch/pytorch/blob/521588519da9f4876d90ddd7a17c10d0eca89dc6/torch/utils/data/distributed.py#L116-L124)
so the predictions on A1 and A2 will occur twice in the final gathered outputs
in the prediction file (and counted as false positives). This also affects our
YT-VIS official val evaluation, but to a lesser extent than YT-VIS dev since
the latter is much smaller and more susceptible to false positives.
So we to deduplicate this. The tricky part is that we cannot deduplicate them
simply using video id, given that we are sharding the classes in each video
across multiple batches (with 20 prompts per batch) in our "orig_cats" eval dbs.
The solution is to deduplicate based on (video_id, category_id) tuple as keys.
We organize the predictions as a dict-of-list using (video_id, category_id) as
keys on each GPU, with the list of masklets under this (video_id, category_id)
on this GPU as values. Then, we all-gather this dict-of-list across GPUs and
if a key (video_id, category_id) appears in multiple GPUs, we only take the
prediction masklet list from one GPU.
"""
prediction_dict = defaultdict(list)
for p in predictions:
prediction_dict[(p["video_id"], p["category_id"])].append(p)
return prediction_dict
def _dedup_post_gather(self, list_of_prediction_dict):
"""
Deduplicate the predictions from all GPUs. See `_dedup_pre_gather` for details.
"""
dedup_prediction_dict = {}
duplication_keys = []
for prediction_dict in list_of_prediction_dict:
for k, v in prediction_dict.items():
if k not in dedup_prediction_dict:
dedup_prediction_dict[k] = v
else:
duplication_keys.append(k)
logging.info(
f"skipped {len(duplication_keys)} duplicated predictions in YTVISResultsWriter "
f"with the following (video_id, category_id) tuples: {duplication_keys}"
)
dedup_predictions = sum(dedup_prediction_dict.values(), [])
return dedup_predictions
def compute_synced(
    self,
):
    """Gather predictions across processes, dump them, run the evaluators.

    Returns:
        dict of metric name -> value on the main process ({"": 0.0} if no
        evaluator produced anything); {"": 0.0} on all other ranks.
    """
    self.synchronize_between_processes()
    dumped_file = self._dump_preds()
    if not dist.is_main_process():
        return {"": 0.0}
    # run evaluation hooks on the prediction file
    meters = {}
    all_video_np_level_results = defaultdict(dict)
    for evaluator in self.pred_file_evaluators:
        gc.collect()
        results, video_np_level_results = evaluator.evaluate(dumped_file)
        meters.update(results)
        # merge per-(video, prompt) metrics contributed by each evaluator
        for (video_id, category_id), res in video_np_level_results.items():
            all_video_np_level_results[(video_id, category_id)].update(res)
    gc.collect()
    if self.write_eval_metrics_file:
        # convert the nested dict of {(video_id, category_id): per_sample_metric_dict}
        # to a list of per-sample metric dicts (with video_id and category_id) for JSON,
        # as JSON doesn't allow using tuples like (video_id, category_id) as dict keys
        video_np_level_metrics = [
            {"video_id": video_id, "category_id": category_id, **res}
            for (video_id, category_id), res in all_video_np_level_results.items()
        ]
        eval_metrics = {
            "dataset_level_metrics": meters,
            "video_np_level_metrics": video_np_level_metrics,
        }
        with g_pathmgr.open(self.eval_metrics_file, "w") as f:
            json.dump(eval_metrics, f)
        logging.info(
            f"YTVIS evaluator: Dumped evaluation metrics to {self.eval_metrics_file}"
        )
    if len(meters) == 0:
        meters = {"": 0.0}
    return meters
def compute(self):
return {"": 0.0}
def reset(self, *args, **kwargs):
self.dump = []