Initial commit
fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
1
sam3/eval/__init__.py
Normal file
1
sam3/eval/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
703
sam3/eval/cgf1_eval.py
Normal file
703
sam3/eval/cgf1_eval.py
Normal file
@@ -0,0 +1,703 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
import contextlib
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
import pycocotools.mask as maskUtils
|
||||
from pycocotools.coco import COCO
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
@dataclass
class Metric:
    """Descriptor for one entry of the cgF1 summary table.

    The order and contents of ``CGF1_METRICS`` (a list of these) define both
    the print order of ``CGF1Eval.summarize`` and the index->name mapping used
    when turning ``stats`` back into a dict.
    """

    # metric key into CGF1Eval.eval, e.g. "cgF1", "precision", "IL_MCC"
    name: str

    # whether the metric is computed at the image level or the box level
    image_level: bool

    # iou threshold (None is used for image level metrics or to indicate averaging over all thresholds in [0.5:0.95])
    iou_threshold: Union[float, None]
|
||||
|
||||
|
||||
# Box-level metric names, reported once per IoU setting.
_BOX_METRIC_NAMES = (
    "cgF1",
    "precision",
    "recall",
    "F1",
    "positive_macro_F1",
    "positive_micro_F1",
    "positive_micro_precision",
)

# Image-level metric names (no IoU threshold applies to these).
_IL_METRIC_NAMES = ("IL_precision", "IL_recall", "IL_F1", "IL_FPR", "IL_MCC")

# Full summary table. ORDER MATTERS: CGF1Eval.summarize() emits its stats in
# exactly this order, and CGF1Evaluator.evaluate() maps stats back to names by
# index. Layout: box metrics averaged over IoU [0.5:0.95] (threshold=None),
# then the image-level metrics, then box metrics at IoU 0.5 and at IoU 0.75.
CGF1_METRICS = (
    [Metric(name=n, image_level=False, iou_threshold=None) for n in _BOX_METRIC_NAMES]
    + [Metric(name=n, image_level=True, iou_threshold=None) for n in _IL_METRIC_NAMES]
    + [
        Metric(name=n, image_level=False, iou_threshold=t)
        for t in (0.5, 0.75)
        for n in _BOX_METRIC_NAMES
    ]
)
|
||||
|
||||
|
||||
class COCOCustom(COCO):
    """COCO class from pycocotools with tiny modifications for speed.

    The deviations from upstream pycocotools are bracketed between
    ``MODIFICATION`` / ``END MODIFICATION`` comments below; everything else is
    intentionally kept byte-for-byte identical to upstream so behavior matches.
    """

    def createIndex(self):
        # Build the lookup tables (anns / imgs / cats plus the two reverse
        # maps) from self.dataset, reusing an already-populated self.imgs.
        # create index
        print("creating index...")
        anns, cats, imgs = {}, {}, {}
        imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
        if "annotations" in self.dataset:
            for ann in self.dataset["annotations"]:
                imgToAnns[ann["image_id"]].append(ann)
                anns[ann["id"]] = ann

        if "images" in self.dataset:
            # MODIFICATION: do not reload imgs if they are already there
            if self.imgs:
                imgs = self.imgs
            else:
                for img in self.dataset["images"]:
                    imgs[img["id"]] = img
            # END MODIFICATION

        if "categories" in self.dataset:
            for cat in self.dataset["categories"]:
                cats[cat["id"]] = cat

        if "annotations" in self.dataset and "categories" in self.dataset:
            for ann in self.dataset["annotations"]:
                catToImgs[ann["category_id"]].append(ann["image_id"])

        print("index created!")

        # create class members
        self.anns = anns
        self.imgToAnns = imgToAnns
        self.catToImgs = catToImgs
        self.imgs = imgs
        self.cats = cats

    def loadRes(self, resFile):
        """
        Load result file and return a result api object.
        :param resFile (str) : file name of result file
        :return: res (obj) : result api object

        Upstream pycocotools logic, except that the images list and the
        image-id set are shared with ``self`` instead of being copied/rebuilt
        (see MODIFICATION blocks). ``resFile`` may also be a numpy array or an
        already-loaded list of annotation dicts.
        """
        res = COCOCustom()
        res.dataset["info"] = copy.deepcopy(self.dataset.get("info", {}))
        # MODIFICATION: no copy
        # res.dataset['images'] = [img for img in self.dataset['images']]
        res.dataset["images"] = self.dataset["images"]
        # END MODIFICATION

        print("Loading and preparing results...")
        tic = time.time()
        if type(resFile) == str:
            with open(resFile) as f:
                anns = json.load(f)
        elif type(resFile) == np.ndarray:
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert type(anns) == list, "results in not an array of objects"
        annsImgIds = [ann["image_id"] for ann in anns]
        # MODIFICATION: faster and cached subset check
        # NOTE(review): img_id_set is cached on first call; assumes the gt
        # image set does not change afterwards.
        if not hasattr(self, "img_id_set"):
            self.img_id_set = set(self.getImgIds())
        assert set(annsImgIds).issubset(
            self.img_id_set
        ), "Results do not correspond to current coco set"
        # END MODIFICATION
        # Dispatch on the kind of result (caption / bbox / segm / keypoints),
        # exactly as upstream pycocotools does.
        if "caption" in anns[0]:
            imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
                [ann["image_id"] for ann in anns]
            )
            res.dataset["images"] = [
                img for img in res.dataset["images"] if img["id"] in imgIds
            ]
            for id, ann in enumerate(anns):
                ann["id"] = id + 1
        elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                bb = ann["bbox"]
                x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
                if not "segmentation" in ann:
                    # synthesize a rectangular polygon from the box
                    ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
                ann["area"] = bb[2] * bb[3]
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "segmentation" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                # now only support compressed RLE format as segmentation results
                ann["area"] = maskUtils.area(ann["segmentation"])
                if not "bbox" in ann:
                    ann["bbox"] = maskUtils.toBbox(ann["segmentation"])
                ann["id"] = id + 1
                ann["iscrowd"] = 0
        elif "keypoints" in anns[0]:
            res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
            for id, ann in enumerate(anns):
                s = ann["keypoints"]
                x = s[0::3]
                y = s[1::3]
                x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
                ann["area"] = (x1 - x0) * (y1 - y0)
                ann["id"] = id + 1
                ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
        print("DONE (t={:0.2f}s)".format(time.time() - tic))

        res.dataset["annotations"] = anns
        # MODIFICATION: inherit images
        res.imgs = self.imgs
        # END MODIFICATION
        res.createIndex()
        return res
|
||||
|
||||
|
||||
class CGF1Eval(COCOeval):
    """
    This evaluator is based upon COCO evaluation, but evaluates the model in a more realistic setting
    for downstream applications.
    See SAM3 paper for the details on the CGF1 metric.

    Do not use this evaluator directly. Prefer the CGF1Evaluator wrapper.

    Notes:
        - This evaluator does not support per-category evaluation (in the way defined by pyCocotools)
        - In open vocabulary settings, we have different noun-phrases for each image. What we call an "image_id" here is actually an (image, noun-phrase) pair. So in every "image_id" there is only one category, implied by the noun-phrase. Thus we can ignore the usual coco "category" field of the predictions
    """

    def __init__(
        self,
        coco_gt=None,
        coco_dt=None,
        iouType="segm",
        threshold=0.5,
    ):
        """
        Args:
            coco_gt (COCO): ground truth COCO API
            coco_dt (COCO): detections COCO API
            iou_type (str): type of IoU to evaluate
            threshold (float): threshold for predictions
        """
        super().__init__(coco_gt, coco_dt, iouType)
        # score cutoff: detections below this confidence are discarded in
        # evaluateImg (unlike standard COCO AP, which sweeps all scores)
        self.threshold = threshold

        # single pseudo-category, single "all" area range, and an effectively
        # unlimited detection budget
        self.params.useCats = False
        self.params.areaRng = [[0**2, 1e5**2]]
        self.params.areaRngLbl = ["all"]
        self.params.maxDets = [1000000]

    def computeIoU(self, imgId, catId):
        # Same as the original COCOeval.computeIoU, but without sorting
        # (sorting by score would break the row alignment with keep_dt that
        # evaluateImg relies on)
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []

        if p.iouType == "segm":
            g = [g["segmentation"] for g in gt]
            d = [d["segmentation"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bbox"] for g in gt]
            d = [d["bbox"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        # compute iou between each dt and gt region
        iscrowd = [int(o["iscrowd"]) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious

    def evaluateImg(self, imgId, catId, aRng, maxDet):
        """
        perform evaluation for single category and image
        :return: dict (single image results)

        Detections are filtered by self.threshold, then matched one-to-one to
        the non-ignored GTs with a Hungarian assignment on IoU. TP/FP/FN
        counts are produced per IoU threshold in p.iouThrs, alongside the
        image-level (IL_*) confusion entries.
        """
        p = self.params
        assert not p.useCats, "This evaluator does not support per-category evaluation."
        assert catId == -1
        all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
        keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
        gt = [g for g in all_gts if not g["ignore"]]
        all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
        dt = [d for d in all_dts if d["score"] >= self.threshold]
        if len(gt) == 0 and len(dt) == 0:
            # This is a "true negative" case, where there are no GTs and no predictions
            # The box-level metrics are ill-defined, so we don't add them to this dict
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 1,
                "IL_FP": 0,
                "IL_FN": 0,
                "num_dt": len(dt),
            }

        if len(gt) > 0 and len(dt) == 0:
            # This is a "false negative" case, where there are GTs but no predictions
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 0,
                "IL_FP": 0,
                "IL_FN": 1,
                "TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
                "local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }

        # Load pre-computed ious
        ious = self.ious[(imgId, catId)]

        # compute matching
        if len(ious) == 0:
            ious = np.zeros((len(dt), len(gt)))
        else:
            # restrict the precomputed (all_dts x all_gts) matrix to the
            # kept detections/GTs; row order matches because computeIoU
            # does not sort
            ious = ious[keep_dt, :][:, keep_gt]
        assert ious.shape == (len(dt), len(gt))

        # Hungarian matching maximizing total IoU (hence the negation)
        matched_dt, matched_gt = linear_sum_assignment(-ious)

        match_scores = ious[matched_dt, matched_gt]

        TPs, FPs, FNs = [], [], []
        IL_perfect = []
        for thresh in p.iouThrs:
            TP = (match_scores >= thresh).sum()
            FP = len(dt) - TP
            FN = len(gt) - TP
            assert (
                FP >= 0 and FN >= 0
            ), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
            TPs.append(TP)
            FPs.append(FP)
            FNs.append(FN)

            if FP == FN and FP == 0:
                IL_perfect.append(1)
            else:
                IL_perfect.append(0)

        TPs = np.array(TPs, dtype=np.int64)
        FPs = np.array(FPs, dtype=np.int64)
        FNs = np.array(FNs, dtype=np.int64)
        IL_perfect = np.array(IL_perfect, dtype=np.int64)

        # compute precision recall and F1
        # (the 1e-4 epsilon guards against division by zero)
        precision = TPs / (TPs + FPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)

        result = {
            "image_id": imgId,
            "TPs": TPs,
            "FPs": FPs,
            "FNs": FNs,
            "local_F1s": F1,
            "IL_TP": (len(gt) > 0) and (len(dt) > 0),
            "IL_FP": (len(gt) == 0) and (len(dt) > 0),
            "IL_TN": (len(gt) == 0) and (len(dt) == 0),
            "IL_FN": (len(gt) > 0) and (len(dt) == 0),
            "num_dt": len(dt),
        }
        if len(gt) > 0 and len(dt) > 0:
            # "positive" F1: only defined when both GTs and predictions exist
            result["local_positive_F1s"] = F1
        return result

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        """
        if self.evalImgs is None or len(self.evalImgs) == 0:
            print("Please run evaluate() first")
        # allows input customized parameters
        if p is None:
            p = self.params

        setImgIds = set(p.imgIds)

        # TPs, FPs, FNs
        # (per-IoU-threshold dataset-wide counters)
        TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)

        # Image level metrics
        IL_TPs = 0
        IL_FPs = 0
        IL_TNs = 0
        IL_FNs = 0

        valid_img_count = 0
        valid_F1_count = 0
        evaledImgIds = set()
        for res in self.evalImgs:
            if res["image_id"] not in setImgIds:
                continue
            evaledImgIds.add(res["image_id"])
            IL_TPs += res["IL_TP"]
            IL_FPs += res["IL_FP"]
            IL_TNs += res["IL_TN"]
            IL_FNs += res["IL_FN"]

            if "TPs" not in res:
                # true-negative image: no box-level entries were recorded
                continue

            TPs += res["TPs"]
            FPs += res["FPs"]
            FNs += res["FNs"]
            valid_img_count += 1

            if "local_positive_F1s" in res:
                # images with both GTs and predictions contribute to the
                # "positive" macro/micro aggregates
                local_F1s += res["local_positive_F1s"]
                pmFPs += res["FPs"]
                if res["num_dt"] > 0:
                    valid_F1_count += 1

        assert len(setImgIds - evaledImgIds) == 0, (
            f"{len(setImgIds - evaledImgIds)} images not evaluated. "
            f"Here are the IDs of the first 3: {list(setImgIds - evaledImgIds)[:3]}"
        )

        # compute precision recall and F1
        precision = TPs / (TPs + FPs + 1e-4)
        positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        positive_micro_F1 = (
            2
            * positive_micro_precision
            * recall
            / (positive_micro_precision + recall + 1e-4)
        )

        # image-level precision / recall / F1 / FPR / Matthews corr. coeff.
        IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
        IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
        IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
        IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
        IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
            (
                float(IL_TPs + IL_FPs)
                * float(IL_TPs + IL_FNs)
                * float(IL_TNs + IL_FPs)
                * float(IL_TNs + IL_FNs)
            )
            ** 0.5
            + 1e-6
        )

        self.eval = {
            "params": p,
            "TPs": TPs,
            "FPs": FPs,
            "positive_micro_FPs": pmFPs,
            "FNs": FNs,
            "precision": precision,
            "positive_micro_precision": positive_micro_precision,
            "recall": recall,
            "F1": F1,
            "positive_micro_F1": positive_micro_F1,
            "positive_macro_F1": local_F1s / valid_F1_count,
            "IL_recall": IL_rec,
            "IL_precision": IL_prec,
            "IL_F1": IL_F1,
            "IL_FPR": IL_FPR,
            "IL_MCC": IL_MCC,
        }
        # headline metric: box-level positive micro F1 gated by image-level MCC
        self.eval["cgF1"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]

    def summarize(self):
        """
        Compute and display summary metrics for evaluation results.
        """
        if not self.eval:
            raise Exception("Please run accumulate() first")

        def _summarize(iouThr=None, metric=""):
            # print + return a box-level metric, optionally restricted to one
            # IoU threshold (otherwise averaged over all thresholds)
            p = self.params
            iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
            titleStr = "Average " + metric
            iouStr = (
                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
                if iouThr is None
                else "{:0.2f}".format(iouThr)
            )

            s = self.eval[metric]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]

            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, iouStr, mean_s))
            return mean_s

        def _summarize_single(metric=""):
            # print + return a scalar (image-level) metric
            titleStr = "Average " + metric
            iStr = " {:<35} = {:0.3f}"
            s = self.eval[metric]
            print(iStr.format(titleStr, s))
            return s

        def _summarizeDets():
            # emit one stat per entry of CGF1_METRICS, in table order
            stats = []

            for metric in CGF1_METRICS:
                if metric.image_level:
                    stats.append(_summarize_single(metric=metric.name))
                else:
                    stats.append(
                        _summarize(iouThr=metric.iou_threshold, metric=metric.name)
                    )
            return np.asarray(stats)

        summarize = _summarizeDets
        self.stats = summarize()
|
||||
|
||||
|
||||
def _evaluate(self):
    """
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs

    NOTE: module-level function, not a method — it is called as
    ``_evaluate(coco_eval)`` with a CGF1Eval instance as ``self`` (a detached
    replacement for COCOeval.evaluate that also returns its results).

    Returns:
        (imgIds, evalImgs) where evalImgs is an ndarray of per-image result
        dicts with shape [numCats, numAreaRngs, numImgs].
    """
    p = self.params
    # add backward compatibility if useSegm is specified in params
    p.imgIds = list(np.unique(p.imgIds))
    p.useCats = False
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    # single sentinel catId since categories are ignored (see CGF1Eval notes)
    catIds = [-1]

    if p.iouType == "segm" or p.iouType == "bbox":
        computeIoU = self.computeIoU
    else:
        raise RuntimeError(f"Unsupported iou {p.iouType}")
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds
    }

    maxDet = p.maxDets[-1]
    evalImgs = [
        self.evaluateImg(imgId, catId, areaRng, maxDet)
        for catId in catIds
        for areaRng in p.areaRng
        for imgId in p.imgIds
    ]
    # this is NOT in the pycocotools code, but could be done outside
    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
    return p.imgIds, evalImgs
|
||||
|
||||
|
||||
class CGF1Evaluator:
    """
    Wrapper class for cgF1 evaluation.
    This supports the oracle setting (when several ground-truths are available per image)
    """

    def __init__(
        self,
        gt_path: Union[str, List[str]],
        iou_type="segm",
        verbose=False,
    ):
        """
        Args:
            gt_path (str or list of str): path(s) to ground truth COCO json file(s)
            iou_type (str): type of IoU to evaluate
            threshold (float): threshold for predictions
        """
        self.gt_paths = gt_path if isinstance(gt_path, list) else [gt_path]
        self.iou_type = iou_type

        # one COCO api (and one evaluator) per ground-truth file
        self.coco_gts = [COCOCustom(gt) for gt in self.gt_paths]

        self.verbose = verbose

        self.coco_evals = []
        for i, coco_gt in enumerate(self.coco_gts):
            self.coco_evals.append(
                CGF1Eval(
                    coco_gt=coco_gt,
                    iouType=iou_type,
                )
            )
            self.coco_evals[i].useCats = False

        exclude_img_ids = set()
        # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
        for coco_gt in self.coco_gts[1:]:
            exclude_img_ids = exclude_img_ids.union(
                {
                    img["id"]
                    for img in coco_gt.dataset["images"]
                    if not img["is_instance_exhaustive"]
                }
            )
        # we only eval on instance exhaustive queries
        self.eval_img_ids = [
            img["id"]
            for img in self.coco_gts[0].dataset["images"]
            if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
        ]

    def evaluate(self, pred_file: str):
        """
        Evaluate the detections using cgF1 metric.

        Args:
            pred_file: path to the predictions COCO json file

        Returns:
            dict mapping "cgF1_eval_{iou_type}_{metric}[@thr]" to float values,
            one entry per item of CGF1_METRICS.
        """
        assert len(self.coco_gts) > 0, "No ground truth provided for evaluation."
        assert len(self.coco_gts) == len(
            self.coco_evals
        ), "Mismatch in number of ground truths and evaluators."

        if self.verbose:
            print(f"Loading predictions from {pred_file}")

        with open(pred_file, "r") as f:
            preds = json.load(f)

        if self.verbose:
            print(f"Loaded {len(preds)} predictions")

        # group predictions by (image, noun-phrase) id
        img2preds = defaultdict(list)
        for pred in preds:
            img2preds[pred["image_id"]].append(pred)

        all_eval_imgs = []
        for img_id in tqdm(self.eval_img_ids, disable=not self.verbose):
            results = img2preds[img_id]
            all_scorings = []
            # score the image against each available ground truth
            for cur_coco_gt, coco_eval in zip(self.coco_gts, self.coco_evals):
                # suppress pycocotools prints
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        coco_dt = (
                            cur_coco_gt.loadRes(results) if results else COCOCustom()
                        )

                        coco_eval.cocoDt = coco_dt
                        coco_eval.params.imgIds = [img_id]
                        coco_eval.params.useCats = False
                        img_ids, eval_imgs = _evaluate(coco_eval)
                        all_scorings.append(eval_imgs)
            selected = self._select_best_scoring(all_scorings)
            all_eval_imgs.append(selected)

        # After this point, we have selected the best scoring per image among several ground truths
        # we can now accumulate and summarize, using only the first coco_eval

        self.coco_evals[0].evalImgs = list(
            np.concatenate(all_eval_imgs, axis=2).flatten()
        )
        self.coco_evals[0].params.imgIds = self.eval_img_ids
        self.coco_evals[0]._paramsEval = copy.deepcopy(self.coco_evals[0].params)

        if self.verbose:
            print(f"Accumulating results")
        self.coco_evals[0].accumulate()
        print("cgF1 metric, IoU type={}".format(self.iou_type))
        self.coco_evals[0].summarize()
        print()

        # map the flat stats vector back to metric names (index-aligned with
        # CGF1_METRICS, the order summarize() used)
        out = {}
        for i, value in enumerate(self.coco_evals[0].stats):
            name = CGF1_METRICS[i].name
            if CGF1_METRICS[i].iou_threshold is not None:
                name = f"{name}@{CGF1_METRICS[i].iou_threshold}"
            out[f"cgF1_eval_{self.iou_type}_{name}"] = float(value)

        return out

    @staticmethod
    def _select_best_scoring(scorings):
        # This function is used for "oracle" type evaluation.
        # It accepts the evaluation results with respect to several ground truths, and picks the best
        if len(scorings) == 1:
            return scorings[0]

        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"

        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"

        selected_imgs = []
        for img_id in range(scorings[0].shape[-1]):
            # candidate to beat: the scoring against the first ground truth
            best = scorings[0][:, :, img_id]

            for scoring in scorings[1:]:
                current = scoring[:, :, img_id]
                if "local_F1s" in best[0, 0] and "local_F1s" in current[0, 0]:
                    # we were able to compute a F1 score for this particular image in both evaluations
                    # best["local_F1s"] contains the results at various IoU thresholds. We simply take the average for comparision
                    best_score = best[0, 0]["local_F1s"].mean()
                    current_score = current[0, 0]["local_F1s"].mean()
                    if current_score > best_score:
                        best = current

                else:
                    # If we're here, it means that in that in some evaluation we were not able to get a valid local F1
                    # This happens when both the predictions and targets are empty. In that case, we can assume it's a perfect prediction
                    if "local_F1s" not in current[0, 0]:
                        best = current
            selected_imgs.append(best)
        result = np.stack(selected_imgs, axis=-1)
        assert result.shape == scorings[0].shape
        return result
|
||||
916
sam3/eval/coco_eval.py
Normal file
916
sam3/eval/coco_eval.py
Normal file
@@ -0,0 +1,916 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
"""
|
||||
COCO evaluator that works in distributed mode.
|
||||
|
||||
Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
|
||||
The difference is that there is less copy-pasting from pycocotools
|
||||
in the end of the file, as python3 can suppress prints with contextlib
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
import pycocotools.mask as mask_utils
|
||||
import torch
|
||||
from iopath.common.file_io import g_pathmgr
|
||||
from pycocotools.coco import COCO
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
|
||||
from sam3.train.masks_ops import rle_encode
|
||||
|
||||
from sam3.train.utils.distributed import (
|
||||
all_gather,
|
||||
gather_to_rank_0_via_filesys,
|
||||
get_rank,
|
||||
is_main_process,
|
||||
)
|
||||
|
||||
# Mapping from the integer rarity bucket id stored in the GT image records
# (the "rarity" field) to its human-readable name.
RARITY_BUCKETS = {0: "frequent", 1: "common", 2: "medium", 3: "rare"}
|
||||
|
||||
|
||||
class CocoEvaluator:
|
||||
    def __init__(
        self,
        coco_gt,
        iou_types: List[str],
        useCats: bool,
        dump_dir: Optional[str],
        postprocessor,
        average_by_rarity=False,
        metrics_dump_dir: Optional[str] = None,
        gather_pred_via_filesys=False,
        use_normalized_areas=True,
        maxdets=[1, 10, 100],  # NOTE(review): mutable default; safe here since it is never mutated, only assigned
        exhaustive_only=False,
        all_exhaustive_only=True,
    ):
        """Online coco evaluator. It will evaluate images as they are generated by the model, then accumulate/summarize at the end

        Args:
            - coco_gt: COCO api object containing the gt
            - iou_types: can be either "bbox" or "segm"
            - useCats: If true, categories will be used for evaluation
            - dump_dir: if non null, then the predictions will be dumped in that directory
            - postprocessor: Module to convert the model's output into the coco format
            - average_by_rarity: if true then we expect the images information in the gt dataset
                to have a "rarity" field. Then the AP will be computed on all rarity buckets
                individually, then averaged
            - gather_pred_via_filesys: if true, we use the filesystem for collective gathers
            - use_normalized_areas: if true, the areas of the objects in the GT are assumed to be
                normalized by the area of the image. In that case, the size buckets are adjusted
            - maxdets: maximal number of detections to be evaluated on each image.
            - exhaustive_only: If true, we restrict eval only to exhaustive annotations
            - all_exhaustive_only: If true, datapoints are restricted only to those with all exhaustive annotations

        """
        # coco_gt = copy.deepcopy(coco_gt)
        # coco_gt may be a single COCO api / path or a list of them (oracle setting)
        self.coco_gts = [coco_gt] if not isinstance(coco_gt, list) else coco_gt
        assert len(maxdets) == 3, f"expecting 3 detection threshold, got {len(maxdets)}"

        self.use_normalized_areas = use_normalized_areas
        self.iou_types = iou_types
        self.useCats = useCats
        self.maxdets = maxdets
        self.dump = None
        self.dump_dir = dump_dir
        if self.dump_dir is not None:
            # accumulate predictions here for later dumping to disk
            self.dump = []
            if is_main_process():
                if not os.path.exists(self.dump_dir):
                    os.makedirs(self.dump_dir, exist_ok=True)
                    logging.info(f"Create the folder: {dump_dir}")

        # heavy setup (loading GT json, building buckets) is deferred to _lazy_init
        self.initialized = False

        # Whether to gather predictions through filesystem (instead of torch
        # collective ops; requiring a shared filesystem across all ranks)
        self.gather_pred_via_filesys = gather_pred_via_filesys
        self.use_self_evaluate = True  # CPP version is disabled
        self.postprocessor = postprocessor
        self.average_by_rarity = average_by_rarity
        self.exhaustive_only = exhaustive_only
        self.all_exhaustive_only = all_exhaustive_only
        self.metrics_dump_dir = metrics_dump_dir
        if self.metrics_dump_dir is not None:
            if is_main_process():
                if not os.path.exists(self.metrics_dump_dir):
                    os.makedirs(self.metrics_dump_dir, exist_ok=True)
                    logging.info(f"Create the folder: {metrics_dump_dir}")
|
||||
|
||||
    def _lazy_init(self, coco_cls=COCO):
        """Deferred setup: load GT json files, reset state, and build the
        exhaustive-only image-id list and rarity buckets. Idempotent."""
        if self.initialized:
            return

        self.initialized = True

        # materialize any GTs that were passed as file paths
        self.coco_gts = [
            coco_cls(g_pathmgr.get_local_path(gt)) if isinstance(gt, str) else gt
            for gt in self.coco_gts
        ]

        self.reset()

        # None means "evaluate on all images"
        self.eval_img_ids = None

        if self.exhaustive_only:
            exclude_img_ids = set()
            # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
            if self.all_exhaustive_only:
                for coco_gt in self.coco_gts[1:]:
                    exclude_img_ids = exclude_img_ids.union(
                        {
                            img["id"]
                            for img in coco_gt.dataset["images"]
                            if not img["is_instance_exhaustive"]
                        }
                    )
            # we only eval on instance exhaustive queries
            self.eval_img_ids = [
                img["id"]
                for img in self.coco_gts[0].dataset["images"]
                if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
            ]

        self.rarity_buckets = None
        if self.average_by_rarity:
            # bucket eval image ids by the "rarity" field of the GT images
            self.rarity_buckets = defaultdict(list)
            eval_img_ids_set = (
                set(self.eval_img_ids) if self.eval_img_ids is not None else None
            )
            for img in self.coco_gts[0].dataset["images"]:
                if self.eval_img_ids is not None and img["id"] not in eval_img_ids_set:
                    continue
                self.rarity_buckets[img["rarity"]].append(img["id"])
            print("Rarity buckets sizes:")
            for k, v in self.rarity_buckets.items():
                print(f"{k}: {len(v)}")
|
||||
|
||||
    def set_sync_device(self, device: torch.device) -> None:
        # Device used for the distributed gather/synchronize operations.
        self._sync_device = device
|
||||
|
||||
    def _evaluate(self, *args, **kwargs):
        # Thin indirection over the module-level evaluate() helper (defined
        # elsewhere in this module) so subclasses can swap the implementation.
        return evaluate(*args, **kwargs)
|
||||
|
||||
    def _loadRes(self, *args, **kwargs):
        # Thin indirection over the module-level loadRes() helper (defined
        # elsewhere in this module) so subclasses can swap the implementation.
        return loadRes(*args, **kwargs)
|
||||
|
||||
    def update(self, *args, **kwargs):
        """Accumulate predictions for a new batch.

        Runs the postprocessor, converts its output to COCO result format for
        each IoU type, evaluates the results against every ground truth, and
        stores the (possibly oracle-selected) per-image eval arrays.
        """
        self._lazy_init()
        predictions = self.postprocessor.process_results(*args, **kwargs)

        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)

        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)
            self._dump(results)

            assert len(self.coco_gts) == len(self.coco_evals)
            all_scorings = []
            for cur_coco_gt, cur_coco_eval in zip(self.coco_gts, self.coco_evals):
                # suppress pycocotools prints
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        # Empty results would make loadRes fail; use a blank COCO.
                        coco_dt = (
                            self._loadRes(cur_coco_gt, results) if results else COCO()
                        )

                        coco_eval = cur_coco_eval[iou_type]

                        coco_eval.cocoDt = coco_dt
                        coco_eval.params.imgIds = list(img_ids)
                        coco_eval.params.useCats = self.useCats
                        coco_eval.params.maxDets = self.maxdets
                        # NOTE(review): this rebinds `img_ids` on every gt
                        # iteration (evaluate() dedupes/sorts the ids) — the
                        # updated list feeds the next iteration; confirm this
                        # is intentional.
                        img_ids, eval_imgs = self._evaluate(coco_eval, self.use_self_evaluate)
                        all_scorings.append(eval_imgs)

            selected = self.select_best_scoring(all_scorings)
            self.eval_imgs[iou_type].append(selected)
|
||||
|
||||
def select_best_scoring(self, scorings):
|
||||
# This function is used for "oracle" type evaluation.
|
||||
# It accepts the evaluation results with respect to several ground truths, and picks the best
|
||||
if len(scorings) == 1:
|
||||
return scorings[0]
|
||||
|
||||
# Currently we don't support Oracle Phrase AP.
|
||||
# To implement it, we likely need to modify the cpp code since the eval_image type is opaque
|
||||
raise RuntimeError("Not implemented")
|
||||
|
||||
def _dump(self, results):
|
||||
if self.dump is not None:
|
||||
dumped_results = copy.deepcopy(results)
|
||||
for r in dumped_results:
|
||||
if "bbox" not in self.iou_types and "bbox" in r:
|
||||
del r["bbox"]
|
||||
elif "bbox" in r:
|
||||
r["bbox"] = [round(coord, 5) for coord in r["bbox"]]
|
||||
r["score"] = round(r["score"], 5)
|
||||
self.dump.extend(dumped_results)
|
||||
|
||||
    def synchronize_between_processes(self):
        """Merge per-process evaluation results onto the main process.

        Concatenates the locally collected per-image eval arrays, gathers them
        across workers into self.coco_evals[0], and optionally dumps this
        rank's raw predictions to disk.
        """
        self._lazy_init()
        logging.info("Coco evaluator: Synchronizing between processes")
        for iou_type in self.iou_types:
            if len(self.eval_imgs[iou_type]) > 0:
                # Per-image results are stacked along axis 2 (images).
                self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            else:
                # This rank saw no images: build an empty (1, areas, 0) array.
                num_areas = len(self.coco_evals[0][iou_type].params.areaRng)
                # assuming 1 class
                assert not self.useCats
                self.eval_imgs[iou_type] = np.empty((1, num_areas, 0))
            create_common_coco_eval(
                self.coco_evals[0][iou_type],
                self.img_ids,
                self.eval_imgs[iou_type],
                use_self_evaluate=self.use_self_evaluate,
                gather_pred_via_filesys=self.gather_pred_via_filesys,
                metrics_dump_dir=self.metrics_dump_dir,
            )
        if self.dump is not None:
            # Each rank dumps its own prediction file (suffixed by rank).
            dumped_file = Path(self.dump_dir) / f"coco_predictions_{get_rank()}.json"
            logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
            with g_pathmgr.open(str(dumped_file), "w") as f:
                json.dump(self.dump, f)

            # if self.gather_pred_via_filesys:
            #     dump = gather_to_rank_0_via_filesys(self.dump)
            # else:
            #     dump = all_gather(self.dump, force_cpu=True)
            # self.dump = sum(dump, [])
|
||||
|
||||
    def accumulate(self, imgIds=None):
        """Accumulate per-image evaluation results, optionally on an id subset.

        Args:
            imgIds: if given, restrict accumulation to these image ids; the
                evaluator's full image list and eval arrays are restored
                afterwards so the method can be called again with a different
                subset. Main process only; other ranks return immediately.
        """
        self._lazy_init()
        logging.info(
            f"Coco evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
        )
        if not is_main_process():
            return

        if imgIds is None:
            for coco_eval in self.coco_evals[0].values():
                accumulate(coco_eval, use_self_eval=self.use_self_evaluate)

        if imgIds is not None:
            imgIds = set(imgIds)
            for coco_eval in self.coco_evals[0].values():
                p = coco_eval.params
                # Boolean mask over the stored image ids marking the subset.
                id_mask = np.array([(i in imgIds) for i in p.imgIds], dtype=bool)
                old_img_ids = p.imgIds
                coco_eval.params.imgIds = np.asarray(p.imgIds)[id_mask]
                old_img_evals = coco_eval.evalImgs
                catIds = p.catIds if p.useCats else [-1]
                # evalImgs is stored flat as (cats * areaRngs * imgs); reshape,
                # select the image subset along the last axis, flatten back.
                coco_eval.evalImgs = list(
                    np.asarray(coco_eval.evalImgs)
                    .reshape(len(catIds), len(p.areaRng), len(old_img_ids))[
                        ..., id_mask
                    ]
                    .flatten()
                )
                accumulate(coco_eval, use_self_eval=self.use_self_evaluate)
                # Restore the full state so later calls see all images again.
                coco_eval.evalImgs = old_img_evals
                coco_eval.params.imgIds = old_img_ids
|
||||
|
||||
    def summarize(self):
        """Accumulate and print final metrics; return a flat name->value dict.

        When rarity buckets are enabled, metrics are computed per bucket and
        additionally averaged across buckets for the headline numbers.
        Main process only; other ranks return an empty dict.
        """
        self._lazy_init()
        logging.info("Coco evaluator: Summarizing")
        if not is_main_process():
            return {}

        outs = {}
        if self.rarity_buckets is None:
            self.accumulate(self.eval_img_ids)
            for iou_type, coco_eval in self.coco_evals[0].items():
                print("IoU metric: {}".format(iou_type))
                summarize(coco_eval)

            # coco_eval.stats is a (keys, values) pair; zip(*...) walks both.
            if "bbox" in self.coco_evals[0]:
                for key, value in zip(*self.coco_evals[0]["bbox"].stats):
                    outs[f"coco_eval_bbox_{key}"] = value
            if "segm" in self.coco_evals[0]:
                for key, value in zip(*self.coco_evals[0]["segm"].stats):
                    outs[f"coco_eval_masks_{key}"] = value
        else:
            total_stats = {}
            all_keys = {}
            for bucket, img_list in self.rarity_buckets.items():
                # Accumulate on this bucket's images only.
                self.accumulate(imgIds=img_list)
                bucket_name = RARITY_BUCKETS[bucket]
                for iou_type, coco_eval in self.coco_evals[0].items():
                    print(f"IoU metric: {iou_type}. Rarity bucket: {bucket_name}")
                    summarize(coco_eval)

                if "bbox" in self.coco_evals[0]:
                    if "bbox" not in total_stats:
                        total_stats["bbox"] = np.zeros_like(
                            self.coco_evals[0]["bbox"].stats[1]
                        )
                        all_keys["bbox"] = self.coco_evals[0]["bbox"].stats[0]
                    total_stats["bbox"] += self.coco_evals[0]["bbox"].stats[1]
                    for key, value in zip(*self.coco_evals[0]["bbox"].stats):
                        outs[f"coco_eval_bbox_{bucket_name}_{key}"] = value
                if "segm" in self.coco_evals[0]:
                    if "segm" not in total_stats:
                        total_stats["segm"] = np.zeros_like(
                            self.coco_evals[0]["segm"].stats[1]
                        )
                        all_keys["segm"] = self.coco_evals[0]["segm"].stats[0]
                    total_stats["segm"] += self.coco_evals[0]["segm"].stats[1]
                    for key, value in zip(*self.coco_evals[0]["segm"].stats):
                        outs[f"coco_eval_masks_{bucket_name}_{key}"] = value

            # Average across buckets to produce the headline metrics.
            if "bbox" in total_stats:
                total_stats["bbox"] /= len(self.rarity_buckets)
                for key, value in zip(all_keys["bbox"], total_stats["bbox"]):
                    outs[f"coco_eval_bbox_{key}"] = value
            if "segm" in total_stats:
                total_stats["segm"] /= len(self.rarity_buckets)
                for key, value in zip(all_keys["segm"], total_stats["segm"]):
                    outs[f"coco_eval_masks_{key}"] = value

        # if self.dump is not None:
        #     assert self.dump_dir is not None
        #     logging.info("Coco evaluator: Dumping the global result file to disk")
        #     with g_pathmgr.open(str(Path(self.dump_dir) / "coco_eval.json"), "w") as f:
        #         json.dump(self.dump, f)
        return outs
|
||||
|
||||
def compute_synced(self):
|
||||
self._lazy_init()
|
||||
self.synchronize_between_processes()
|
||||
return self.summarize()
|
||||
|
||||
def compute(self):
|
||||
self._lazy_init()
|
||||
return {"": 0.0}
|
||||
|
||||
    def reset(self, cocoeval_cls=COCOeval):
        """Re-create fresh COCOeval objects and clear all accumulated state.

        Args:
            cocoeval_cls: COCOeval class to instantiate (injectable for testing).
        """
        self.coco_evals = [{} for _ in range(len(self.coco_gts))]
        for i, coco_gt in enumerate(self.coco_gts):
            for iou_type in self.iou_types:
                self.coco_evals[i][iou_type] = cocoeval_cls(coco_gt, iouType=iou_type)
                self.coco_evals[i][iou_type].params.useCats = self.useCats
                self.coco_evals[i][iou_type].params.maxDets = self.maxdets
                if self.use_normalized_areas:
                    # Area ranges expressed as fractions of the image area
                    # (rather than absolute pixel counts).
                    self.coco_evals[i][iou_type].params.areaRng = [
                        [0, 1e5],
                        [0, 0.001],
                        [0.001, 0.01],
                        [0.01, 0.1],
                        [0.1, 0.5],
                        [0.5, 0.95],
                        [0.95, 1e5],
                    ]
                    self.coco_evals[i][iou_type].params.areaRngLbl = [
                        "all",
                        "tiny",
                        "small",
                        "medium",
                        "large",
                        "huge",
                        "whole_image",
                    ]

        self.img_ids = []
        self.eval_imgs = {k: [] for k in self.iou_types}
        if self.dump is not None:
            self.dump = []
|
||||
|
||||
def write(self, stats):
|
||||
self._lazy_init()
|
||||
"""Write the results in the stats dict"""
|
||||
if "bbox" in self.coco_evals[0]:
|
||||
stats["coco_eval_bbox"] = self.coco_evals[0]["bbox"].stats.tolist()
|
||||
if "segm" in self.coco_evals[0]:
|
||||
stats["coco_eval_masks"] = self.coco_evals[0]["segm"].stats.tolist()
|
||||
return stats
|
||||
|
||||
def prepare(self, predictions, iou_type):
|
||||
self._lazy_init()
|
||||
if iou_type == "bbox":
|
||||
return self.prepare_for_coco_detection(predictions)
|
||||
elif iou_type == "segm":
|
||||
return self.prepare_for_coco_segmentation(predictions)
|
||||
elif iou_type == "keypoints":
|
||||
return self.prepare_for_coco_keypoint(predictions)
|
||||
else:
|
||||
raise ValueError("Unknown iou type {}".format(iou_type))
|
||||
|
||||
def prepare_for_coco_detection(self, predictions):
|
||||
self._lazy_init()
|
||||
coco_results = []
|
||||
for original_id, prediction in predictions.items():
|
||||
if len(prediction) == 0:
|
||||
continue
|
||||
|
||||
boxes = prediction["boxes"]
|
||||
boxes = convert_to_xywh(boxes).tolist()
|
||||
scores = prediction["scores"].tolist()
|
||||
labels = prediction["labels"].tolist()
|
||||
|
||||
coco_results.extend(
|
||||
[
|
||||
{
|
||||
"image_id": original_id,
|
||||
"category_id": labels[k],
|
||||
"bbox": box,
|
||||
"score": scores[k],
|
||||
}
|
||||
for k, box in enumerate(boxes)
|
||||
]
|
||||
)
|
||||
return coco_results
|
||||
|
||||
    @torch.no_grad()
    def prepare_for_coco_segmentation(self, predictions):
        """Convert mask predictions into COCO segmentation-result records.

        Masks may arrive pre-encoded ("masks_rle") or as dense masks
        ("masks"), which are thresholded at 0.5 and RLE-encoded here. The
        stored "area" is normalized by the image area. Optional boundary
        outputs are carried through when present.
        """
        self._lazy_init()
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            boundaries, dilated_boundaries = None, None
            if "boundaries" in prediction:
                # Optional boundary predictions; must align 1:1 with scores.
                boundaries = prediction["boundaries"]
                dilated_boundaries = prediction["dilated_boundaries"]
                assert dilated_boundaries is not None
                assert len(scores) == len(boundaries)

            if "masks_rle" in prediction:
                # Masks already RLE-encoded: only compute normalized areas.
                rles = prediction["masks_rle"]
                areas = []
                for rle in rles:
                    cur_area = mask_utils.area(rle)
                    h, w = rle["size"]
                    areas.append(cur_area / (h * w))
            else:
                masks = prediction["masks"]

                # Binarize soft masks at 0.5.
                masks = masks > 0.5
                h, w = masks.shape[-2:]

                # Normalized (fraction-of-image) areas.
                areas = masks.flatten(1).sum(1) / (h * w)
                areas = areas.tolist()

                rles = rle_encode(masks.squeeze(1))

                # memory clean
                del masks
                del prediction["masks"]

            assert len(areas) == len(rles) == len(scores)
            for k, rle in enumerate(rles):
                payload = {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                    "area": areas[k],
                }
                if boundaries is not None:
                    payload["boundary"] = boundaries[k]
                    payload["dilated_boundary"] = dilated_boundaries[k]

                coco_results.append(payload)

        return coco_results
|
||||
|
||||
def prepare_for_coco_keypoint(self, predictions):
|
||||
self._lazy_init()
|
||||
coco_results = []
|
||||
for original_id, prediction in predictions.items():
|
||||
if len(prediction) == 0:
|
||||
continue
|
||||
|
||||
boxes = prediction["boxes"]
|
||||
boxes = convert_to_xywh(boxes).tolist()
|
||||
scores = prediction["scores"].tolist()
|
||||
labels = prediction["labels"].tolist()
|
||||
keypoints = prediction["keypoints"]
|
||||
keypoints = keypoints.flatten(start_dim=1).tolist()
|
||||
|
||||
coco_results.extend(
|
||||
[
|
||||
{
|
||||
"image_id": original_id,
|
||||
"category_id": labels[k],
|
||||
"keypoints": keypoint,
|
||||
"score": scores[k],
|
||||
}
|
||||
for k, keypoint in enumerate(keypoints)
|
||||
]
|
||||
)
|
||||
return coco_results
|
||||
|
||||
|
||||
def convert_to_xywh(boxes):
    """Convert boxes from [x0, y0, x1, y1] to COCO [x, y, w, h] format."""
    x0, y0, x1, y1 = boxes.unbind(-1)
    return torch.stack((x0, y0, x1 - x0, y1 - y0), dim=-1)
|
||||
|
||||
|
||||
def merge(img_ids, eval_imgs, gather_pred_via_filesys=False):
    """Gather per-process image ids and per-image eval arrays onto rank 0.

    Returns:
        (merged_img_ids, merged_eval_imgs) on the main process, deduplicated
        and sorted by image id; (None, None) on every other rank.
    """
    if gather_pred_via_filesys:
        # only gather the predictions to rank 0 (other ranks will receive empty
        # lists for `all_img_ids` and `all_eval_imgs`, which should be OK as
        # merging and evaluation are only done on rank 0)
        all_img_ids = gather_to_rank_0_via_filesys(img_ids)
        all_eval_imgs = gather_to_rank_0_via_filesys(eval_imgs)
    else:
        all_img_ids = all_gather(img_ids, force_cpu=True)
        all_eval_imgs = all_gather(eval_imgs, force_cpu=True)
    if not is_main_process():
        return None, None

    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)

    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)

    merged_img_ids = np.array(merged_img_ids)
    # Per-image results are stacked along axis 2 (images).
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)

    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]

    return merged_img_ids, merged_eval_imgs
|
||||
|
||||
|
||||
def create_common_coco_eval(
    coco_eval,
    img_ids,
    eval_imgs,
    use_self_evaluate,
    gather_pred_via_filesys=False,
    metrics_dump_dir=None,
):
    """Install globally-merged per-image results into `coco_eval` (rank 0 only).

    Gathers results from all ranks, optionally dumps the merged per-image
    metrics to disk, evaluates dummy (empty) detections for ground-truth
    images that received no predictions, and finally writes the merged state
    into `coco_eval` so accumulate()/summarize() can run on it.
    """
    img_ids, eval_imgs = merge(img_ids, eval_imgs, gather_pred_via_filesys)
    if not is_main_process():
        return
    if metrics_dump_dir is not None:
        # NOTE(review): only rank 0 reaches this point, so get_rank() in the
        # filename is presumably always 0 — confirm the suffix is intended.
        dumped_file = (
            Path(metrics_dump_dir) / f"coco_eval_img_metrics_{get_rank()}.json"
        )
        logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(eval_imgs.squeeze(), f, default=lambda x: x.tolist())
    img_ids = list(img_ids)

    # If some images were not predicted, we need to create dummy detections for them
    missing_img_ids = set(coco_eval.cocoGt.getImgIds()) - set(img_ids)
    if len(missing_img_ids) > 0:
        print(f"WARNING: {len(missing_img_ids)} images were not predicted!")
        # Empty detections: evaluate() fills in the corresponding eval slots.
        coco_eval.cocoDt = COCO()
        coco_eval.params.imgIds = list(missing_img_ids)
        new_img_ids, new_eval_imgs = evaluate(coco_eval, use_self_evaluate)
        img_ids.extend(new_img_ids)
        eval_imgs = np.concatenate((eval_imgs, new_eval_imgs), axis=2)

    eval_imgs = list(eval_imgs.flatten())
    assert len(img_ids) == len(coco_eval.cocoGt.getImgIds())

    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
|
||||
|
||||
|
||||
#################################################################
|
||||
# From pycocotools, just removed the prints and fixed
|
||||
# a Python3 bug about unicode not defined
|
||||
#################################################################
|
||||
|
||||
|
||||
# Copy of COCO prepare, but doesn't convert anntoRLE
def segmentation_prepare(self):
    """
    Prepare ._gts and ._dts for evaluation based on params
    (variant of COCOeval._prepare that skips the annotation->RLE conversion).
    :return: None
    """
    p = self.params
    if p.useCats:
        gts = self.cocoGt.loadAnns(
            self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
        dts = self.cocoDt.loadAnns(
            self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
    else:
        gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
        dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

    for gt in gts:
        # NOTE: the first assignment is immediately overwritten by the second;
        # this mirrors the upstream pycocotools code verbatim.
        gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
        gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
        if p.iouType == "keypoints":
            gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
    self._gts = defaultdict(list)  # gt for evaluation
    self._dts = defaultdict(list)  # dt for evaluation
    for gt in gts:
        self._gts[gt["image_id"], gt["category_id"]].append(gt)
    for dt in dts:
        self._dts[dt["image_id"], dt["category_id"]].append(dt)
    self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
    self.eval = {}  # accumulated evaluation results
|
||||
|
||||
|
||||
def evaluate(self, use_self_evaluate):
    """
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
    :return: (p.imgIds, evalImgs) where evalImgs has shape
        (num_categories, num_area_ranges, num_images)
    """
    # tic = time.time()
    # print('Running per image evaluation...', use_self_evaluate)
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = "segm" if p.useSegm == 1 else "bbox"
        print(
            "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType)
        )
    # print('Evaluate annotation type *{}*'.format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    if p.iouType == "segm" or p.iouType == "bbox":
        computeIoU = self.computeIoU
    elif p.iouType == "keypoints":
        computeIoU = self.computeOks
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds
    }

    maxDet = p.maxDets[-1]
    if use_self_evaluate:
        evalImgs = [
            self.evaluateImg(imgId, catId, areaRng, maxDet)
            for catId in catIds
            for areaRng in p.areaRng
            for imgId in p.imgIds
        ]
        # this is NOT in the pycocotools code, but could be done outside
        evalImgs = np.asarray(evalImgs).reshape(
            len(catIds), len(p.areaRng), len(p.imgIds)
        )
    # NOTE(review): when use_self_evaluate is False, `evalImgs` is never bound
    # (the C++ fast path below is commented out) and this return raises
    # NameError — confirm callers always pass use_self_evaluate=True.
    return p.imgIds, evalImgs
|
||||
|
||||
# <<<< Beginning of code differences with original COCO API
|
||||
# def convert_instances_to_cpp(instances, is_det=False):
|
||||
# # Convert annotations for a list of instances in an image to a format that's fast
|
||||
# # to access in C++
|
||||
# instances_cpp = []
|
||||
# for instance in instances:
|
||||
# instance_cpp = _CPP.InstanceAnnotation(
|
||||
# int(instance["id"]),
|
||||
# instance["score"] if is_det else instance.get("score", 0.0),
|
||||
# instance["area"],
|
||||
# bool(instance.get("iscrowd", 0)),
|
||||
# bool(instance.get("ignore", 0)),
|
||||
# )
|
||||
# instances_cpp.append(instance_cpp)
|
||||
# return instances_cpp
|
||||
|
||||
# # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
|
||||
# ground_truth_instances = [
|
||||
# [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
|
||||
# for imgId in p.imgIds
|
||||
# ]
|
||||
# detected_instances = [
|
||||
# [
|
||||
# convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
|
||||
# for catId in p.catIds
|
||||
# ]
|
||||
# for imgId in p.imgIds
|
||||
# ]
|
||||
# ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
|
||||
|
||||
# if not p.useCats:
|
||||
# # For each image, flatten per-category lists into a single list
|
||||
# ground_truth_instances = [
|
||||
# [[o for c in i for o in c]] for i in ground_truth_instances
|
||||
# ]
|
||||
# detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
|
||||
|
||||
# # Call C++ implementation of self.evaluateImgs()
|
||||
# _evalImgs_cpp = _CPP.COCOevalEvaluateImages(
|
||||
# p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
|
||||
# )
|
||||
|
||||
# self._paramsEval = copy.deepcopy(self.params)
|
||||
# evalImgs = np.asarray(_evalImgs_cpp).reshape(
|
||||
# len(catIds), len(p.areaRng), len(p.imgIds)
|
||||
# )
|
||||
# return p.imgIds, evalImgs
|
||||
|
||||
|
||||
#################################################################
|
||||
# end of straight copy from pycocotools, just removing the prints
|
||||
#################################################################
|
||||
|
||||
|
||||
#################################################################
|
||||
# From pycocotools, but disabled mask->box conversion which is
|
||||
# pointless
|
||||
#################################################################
|
||||
def loadRes(self, resFile):
    """
    Load result file and return a result api object.
    (Copy of COCO.loadRes with the pointless mask->box conversion disabled.)
    :param resFile (str) : file name of result file
    :return: res (obj) : result api object
    """
    res = COCO()
    res.dataset["images"] = [img for img in self.dataset["images"]]

    # Results may be given as a json path, a numpy array, or a list of dicts.
    if type(resFile) == str:
        anns = json.load(open(resFile))
    elif type(resFile) == np.ndarray:
        anns = self.loadNumpyAnnotations(resFile)
    else:
        anns = resFile
    assert type(anns) == list, "results in not an array of objects"
    annsImgIds = [ann["image_id"] for ann in anns]
    assert set(annsImgIds) == (
        set(annsImgIds) & set(self.getImgIds())
    ), "Results do not correspond to current coco set"
    if "caption" in anns[0]:
        # Captioning results: keep only images that actually have predictions.
        imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
            [ann["image_id"] for ann in anns]
        )
        res.dataset["images"] = [
            img for img in res.dataset["images"] if img["id"] in imgIds
        ]
        for id, ann in enumerate(anns):
            ann["id"] = id + 1
    elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            bb = ann["bbox"]
            x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
            if "segmentation" not in ann:
                # Synthesize a rectangular polygon from the box.
                ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
            ann["area"] = bb[2] * bb[3]
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "segmentation" in anns[0]:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # now only support compressed RLE format as segmentation results
            # ann["area"] = mask_util.area(ann["segmentation"])
            # The following lines are disabled because they are pointless
            # if not 'bbox' in ann:
            #     ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "keypoints" in anns[0]:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # keypoints are stored flat as [x1, y1, v1, x2, y2, v2, ...].
            s = ann["keypoints"]
            x = s[0::3]
            y = s[1::3]
            x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
            ann["area"] = (x1 - x0) * (y1 - y0)
            ann["id"] = id + 1
            ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]

    res.dataset["annotations"] = anns
    res.createIndex()
    return res
|
||||
|
||||
|
||||
#################################################################
|
||||
# end of straight copy from pycocotools
|
||||
#################################################################
|
||||
|
||||
|
||||
#################################################################
|
||||
# From pycocotools, but added handling of custom area rngs, and returns stat keys
|
||||
#################################################################
|
||||
def summarize(self):
    """
    Compute and display summary metrics for evaluation results.
    Note this function can *only* be applied on the default parameter setting.
    Unlike the upstream pycocotools version, custom area ranges are supported
    and self.stats is set to a (keys, values) pair rather than a bare array.
    """

    def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
        # Mean precision/recall over the selected IoU threshold(s), area
        # range and max-detections setting; -1 means "no valid entries".
        p = self.params
        iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
        titleStr = "Average Precision" if ap == 1 else "Average Recall"
        typeStr = "(AP)" if ap == 1 else "(AR)"
        iouStr = (
            "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
            if iouThr is None
            else "{:0.2f}".format(iouThr)
        )

        aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
        mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
        if ap == 1:
            # dimension of precision: [TxRxKxAxM]
            s = self.eval["precision"]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, :, aind, mind]
        else:
            # dimension of recall: [TxKxAxM]
            s = self.eval["recall"]
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, aind, mind]
        if len(s[s > -1]) == 0:
            mean_s = -1
        else:
            mean_s = np.mean(s[s > -1])
        print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
        return mean_s

    def _summarizeDets():
        # 3 AP entries + one AP per non-"all" area range, then the same for AR.
        nb_results = 6 + (len(self.params.areaRng) - 1) * 2
        assert len(self.params.areaRng) == len(self.params.areaRngLbl)
        stats = np.zeros((nb_results,))
        keys = ["AP", "AP_50", "AP_75"]
        stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
        stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
        stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
        cur_id = 3
        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(1, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AP_{area}")
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[0])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[1])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[2])
        cur_id += 1
        keys += ["AR", "AR_50", "AR_75"]

        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(0, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AR_{area}")
        assert len(stats) == len(keys)
        return keys, stats

    if not self.eval:
        raise Exception("Please run accumulate() first")
    self.stats = _summarizeDets()
|
||||
|
||||
|
||||
#################################################################
|
||||
# end of straight copy from pycocotools
|
||||
#################################################################
|
||||
|
||||
|
||||
#################################################################
|
||||
# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py
|
||||
# with slight adjustments
|
||||
#################################################################
|
||||
def accumulate(self, use_self_eval=False):
    """
    Accumulate per image evaluation results and store the result in self.eval. Does not
    support changing parameter settings from those used by self.evaluate()

    When use_self_eval is True this defers to COCOeval.accumulate; otherwise
    this function currently does nothing, since the C++ fast path below is
    disabled (commented out).
    """
    if use_self_eval:
        # Stock (pure-Python) pycocotools accumulation.
        self.accumulate()
        return
    # CPP code is disabled
    # self.eval = _CPP.COCOevalAccumulate(self.params, self.evalImgs)

    # # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
    # self.eval["recall"] = np.array(self.eval["recall"]).reshape(
    #     self.eval["counts"][:1] + self.eval["counts"][2:]
    # )

    # # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
    # # num_area_ranges X num_max_detections
    # self.eval["precision"] = np.array(self.eval["precision"]).reshape(
    #     self.eval["counts"]
    # )
    # self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
|
||||
181
sam3/eval/coco_eval_offline.py
Normal file
181
sam3/eval/coco_eval_offline.py
Normal file
@@ -0,0 +1,181 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
"""
|
||||
This evaluator is meant for regular COCO mAP evaluation, for example on the COCO val set.
|
||||
|
||||
For Category mAP, we need the model to make predictions for all the categories on every single image.
|
||||
In general, since the number of classes can be big, and the API model makes predictions individually for each pair (image, class),
|
||||
we may need to split the inference process for a given image in several chunks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
import torch
|
||||
from pycocotools.coco import COCO
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
from sam3.train.utils.distributed import is_main_process
|
||||
|
||||
try:
|
||||
from tidecv import datasets, TIDE
|
||||
|
||||
HAS_TIDE = True
|
||||
except ImportError:
|
||||
HAS_TIDE = False
|
||||
print("WARNING: TIDE not installed. Detailed analysis will not be available.")
|
||||
|
||||
|
||||
# the COCO detection metrics (https://github.com/cocodataset/cocoapi/blob/8c9bcc3cf640524c4c20a9c40e89cb6a2f2fa0e9/PythonAPI/pycocotools/cocoeval.py#L460-L471)
# Names for the 12 entries of COCOeval.stats, in order.
COCO_METRICS = [
    "AP",
    "AP_50",
    "AP_75",
    "AP_small",
    "AP_medium",
    "AP_large",
    "AR_maxDets@1",
    "AR_maxDets@10",
    "AR_maxDets@100",
    "AR_small",
    "AR_medium",
    "AR_large",
]
|
||||
|
||||
|
||||
def convert_to_xywh(boxes):
    """Convert bounding boxes from xyxy format to xywh format."""
    x_min, y_min, x_max, y_max = boxes.unbind(-1)
    widths = x_max - x_min
    heights = y_max - y_min
    return torch.stack((x_min, y_min, widths, heights), dim=-1)
|
||||
|
||||
|
||||
class HeapElement:
    """Wrapper that orders heap entries by their "score" field."""

    def __init__(self, val):
        # `val` is a mapping carrying a numeric "score" entry.
        self.val = val

    def __lt__(self, other):
        lhs = self.val["score"]
        rhs = other.val["score"]
        return lhs < rhs
|
||||
|
||||
|
||||
class COCOevalCustom(COCOeval):
    """
    This is a slightly modified version of the original COCO API with added support for positive split evaluation.
    """

    def __init__(
        self, cocoGt=None, cocoDt=None, iouType="segm", dt_only_positive=False
    ):
        """
        :param dt_only_positive: if True, drop detections whose category has
            no ground-truth instance in the same image ("positive split").
        """
        super().__init__(cocoGt, cocoDt, iouType)
        self.dt_only_positive = dt_only_positive

    def _prepare(self):
        """
        Prepare ._gts and ._dts for evaluation based on params
        :return: None
        """

        def _toMask(anns, coco):
            # modify ann['segmentation'] by reference
            for ann in anns:
                rle = coco.annToRLE(ann)
                ann["segmentation"] = rle

        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # convert ground truth to mask if iouType == 'segm'
        if p.iouType == "segm":
            _toMask(gts, self.cocoGt)
            _toMask(dts, self.cocoDt)
        # set ignore flag
        for gt in gts:
            # NOTE: the first assignment is immediately overwritten by the
            # second; kept to mirror the upstream pycocotools code verbatim.
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation

        _gts_cat_ids = defaultdict(set)  # gt for evaluation on positive split
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
            _gts_cat_ids[gt["image_id"]].add(gt["category_id"])

        #### BEGIN MODIFICATION ####
        for dt in dts:
            # In positive-split mode, skip detections for categories that are
            # absent from this image's ground truth.
            if (
                self.dt_only_positive
                and dt["category_id"] not in _gts_cat_ids[dt["image_id"]]
            ):
                continue
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        #### END MODIFICATION ####
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results
|
||||
|
||||
|
||||
class CocoEvaluatorOfflineWithPredFileEvaluators:
    """Offline COCO evaluator that scores a dumped prediction file.

    Runs the standard COCO metrics via ``COCOevalCustom`` (optionally
    restricted to the positive split) and, when TIDE is importable and
    requested, appends TIDE error-analysis metrics.
    """

    def __init__(
        self,
        gt_path,
        tide: bool = True,
        iou_type: str = "bbox",
        positive_split=False,
    ):
        self.gt_path = gt_path
        # TIDE is only used when both requested and actually importable.
        self.tide_enabled = HAS_TIDE and tide
        self.positive_split = positive_split
        self.iou_type = iou_type

    def evaluate(self, dumped_file):
        """Evaluate the predictions in ``dumped_file``.

        Returns a flat metric-name -> value dict (empty on non-main ranks).
        """
        if not is_main_process():
            return {}

        logging.info("OfflineCoco evaluator: Loading groundtruth")
        self.gt = COCO(self.gt_path)

        logging.info("Coco evaluator: Creating the result file")
        cocoDt = self.gt.loadRes(str(dumped_file))

        logging.info("Coco evaluator: Running evaluation")
        coco_eval = COCOevalCustom(
            self.gt, cocoDt, iouType=self.iou_type, dt_only_positive=self.positive_split
        )
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

        # One entry per summarized COCO statistic.
        outs = {
            f"coco_eval_{self.iou_type}_{COCO_METRICS[i]}": value
            for i, value in enumerate(coco_eval.stats)
        }

        if self.tide_enabled:
            logging.info("Coco evaluator: Loading TIDE")
            self.tide_gt = datasets.COCO(self.gt_path)
            self.tide = TIDE(mode="mask" if self.iou_type == "segm" else "bbox")

            logging.info("Coco evaluator: Running TIDE")
            self.tide.evaluate(
                self.tide_gt, datasets.COCOResult(str(dumped_file)), name="coco_eval"
            )
            self.tide.summarize()

            # Main errors first, then special errors (same key prefix).
            error_groups = (
                self.tide.get_main_errors()["coco_eval"],
                self.tide.get_special_errors()["coco_eval"],
            )
            for errors in error_groups:
                for k, v in errors.items():
                    outs[f"coco_eval_{self.iou_type}_TIDE_{k}"] = v

        return outs
|
||||
230
sam3/eval/coco_reindex.py
Normal file
230
sam3/eval/coco_reindex.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
"""
|
||||
Self-contained COCO JSON re-indexing function that creates temporary files.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def reindex_coco_to_temp(input_json_path: str) -> str:
    """
    Convert a 0-indexed COCO JSON file to 1-indexed and save it to a temporary location.

    Image, category, and annotation ids are shifted by +1 when any id of that
    kind equals 0, and annotation references (``image_id``, ``category_id``)
    are remapped consistently. If nothing is 0-indexed, the data is written
    out unchanged.

    Args:
        input_json_path: Path to the input COCO JSON file

    Returns:
        Path to the (possibly re-indexed) JSON file in a fresh temporary
        directory. The caller is responsible for deleting the file and its
        directory. (Note: a path is always returned, even when no conversion
        was needed.)

    Raises:
        FileNotFoundError: If input file doesn't exist
        json.JSONDecodeError: If input file is not valid JSON
        ValueError: If input file is not a valid COCO format
    """

    def is_coco_json(data: Dict[str, Any]) -> bool:
        """Check if data appears to be a COCO format file."""
        if not isinstance(data, dict):
            return False
        # A COCO file should have at least one of these keys
        coco_keys = {"images", "annotations", "categories"}
        return any(key in data for key in coco_keys)

    def check_zero_indexed(data: Dict[str, Any]) -> Tuple[bool, bool, bool]:
        """
        Check whether annotations, images, or categories contain an id == 0.

        Returns:
            Tuple of (annotations_zero_indexed, images_zero_indexed, categories_zero_indexed)
        """

        def has_zero_id(entries: List[Dict[str, Any]]) -> bool:
            return any(entry.get("id", -1) == 0 for entry in entries)

        return (
            has_zero_id(data.get("annotations", [])),
            has_zero_id(data.get("images", [])),
            has_zero_id(data.get("categories", [])),
        )

    def reindex_coco_data(data: Dict[str, Any]) -> Dict[str, Any]:
        """Convert 0-indexed COCO data to 1-indexed.

        Note: this makes a shallow copy; the nested image/category/annotation
        dicts inside ``data`` are modified in place.
        """
        modified_data = data.copy()

        annotations_zero, images_zero, categories_zero = check_zero_indexed(data)

        # Old-id -> new-id mappings so annotation references stay consistent.
        image_id_mapping = {}
        category_id_mapping = {}

        # Process images first (since annotations reference image IDs)
        if images_zero and "images" in modified_data:
            for img in modified_data["images"]:
                old_id = img["id"]
                new_id = old_id + 1
                image_id_mapping[old_id] = new_id
                img["id"] = new_id

        # Process categories (since annotations reference category IDs)
        if categories_zero and "categories" in modified_data:
            for cat in modified_data["categories"]:
                old_id = cat["id"]
                new_id = old_id + 1
                category_id_mapping[old_id] = new_id
                cat["id"] = new_id

        # Process annotations and fix their cross-references
        if "annotations" in modified_data:
            for ann in modified_data["annotations"]:
                if annotations_zero:
                    ann["id"] = ann["id"] + 1

                if images_zero and ann.get("image_id") is not None:
                    old_image_id = ann["image_id"]
                    if old_image_id in image_id_mapping:
                        ann["image_id"] = image_id_mapping[old_image_id]

                if categories_zero and ann.get("category_id") is not None:
                    old_category_id = ann["category_id"]
                    if old_category_id in category_id_mapping:
                        ann["category_id"] = category_id_mapping[old_category_id]

        return modified_data

    def write_to_temp(data: Dict[str, Any]) -> str:
        """Write ``data`` to '<stem>_1_indexed<suffix>' in a new temp dir."""
        input_path = Path(input_json_path)
        temp_dir = tempfile.mkdtemp()
        temp_filename = f"{input_path.stem}_1_indexed{input_path.suffix}"
        temp_path = os.path.join(temp_dir, temp_filename)
        with open(temp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        return temp_path

    # Validate input path
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input file not found: {input_json_path}")

    # Load and validate JSON data
    try:
        with open(input_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # BUGFIX: json.JSONDecodeError requires (msg, doc, pos); the previous
        # single-argument call raised a TypeError instead of the intended error.
        raise json.JSONDecodeError(
            f"Invalid JSON in {input_json_path}: {e.msg}", e.doc, e.pos
        ) from e

    # Validate COCO format
    if not is_coco_json(data):
        raise ValueError(
            f"File does not appear to be in COCO format: {input_json_path}"
        )

    # Check if reindexing is needed
    annotations_zero, images_zero, categories_zero = check_zero_indexed(data)
    if not (annotations_zero or images_zero or categories_zero):
        # No conversion needed - just copy to temp location
        return write_to_temp(data)

    return write_to_temp(reindex_coco_data(data))
|
||||
|
||||
|
||||
# Example usage and test function
|
||||
def test_reindex_function():
    """Smoke-test ``reindex_coco_to_temp`` against a tiny 0-indexed dataset."""

    def _image(idx, name):
        return {"id": idx, "width": 640, "height": 480, "file_name": name}

    def _annotation(idx, bbox, area):
        # image_id / category_id intentionally mirror the annotation id.
        return {
            "id": idx,
            "image_id": idx,
            "category_id": idx,
            "bbox": bbox,
            "area": area,
            "iscrowd": 0,
        }

    test_data = {
        "info": {"description": "Test COCO dataset", "version": "1.0", "year": 2023},
        "images": [_image(0, "test1.jpg"), _image(1, "test2.jpg")],
        "categories": [
            {"id": 0, "name": "person", "supercategory": "person"},
            {"id": 1, "name": "car", "supercategory": "vehicle"},
        ],
        "annotations": [
            _annotation(0, [100, 100, 50, 75], 3750),
            _annotation(1, [200, 150, 120, 80], 9600),
        ],
    }

    # Write the fixture to a temporary JSON file.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as fh:
        json.dump(test_data, fh, indent=2)
        fixture_path = fh.name

    try:
        # Run the conversion and show where the output landed.
        result_path = reindex_coco_to_temp(fixture_path)
        print(f"Original file: {fixture_path}")
        print(f"Converted file: {result_path}")

        with open(result_path, "r") as fh:
            result_data = json.load(fh)

        # Display a sample of the converted ids (should all be 1-indexed).
        print("\nConverted data sample:")
        print(f"First image ID: {result_data['images'][0]['id']}")
        print(f"First category ID: {result_data['categories'][0]['id']}")
        print(f"First annotation ID: {result_data['annotations'][0]['id']}")
        print(f"First annotation image_id: {result_data['annotations'][0]['image_id']}")
        print(
            f"First annotation category_id: {result_data['annotations'][0]['category_id']}"
        )

        # Remove the converted file and its temporary directory.
        os.unlink(result_path)
        os.rmdir(os.path.dirname(result_path))
    finally:
        # Always remove the fixture file.
        os.unlink(fixture_path)
|
||||
|
||||
|
||||
# Allow running this module directly as a manual smoke test.
if __name__ == "__main__":
    test_reindex_function()
|
||||
352
sam3/eval/coco_writer.py
Normal file
352
sam3/eval/coco_writer.py
Normal file
@@ -0,0 +1,352 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
"""
|
||||
COCO prediction dumper for distributed training.
|
||||
|
||||
Handles collection and dumping of COCO-format predictions from models.
|
||||
Supports distributed processing with multiple GPUs/processes.
|
||||
"""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
import heapq
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import pycocotools.mask as mask_utils
|
||||
import torch
|
||||
from iopath.common.file_io import g_pathmgr
|
||||
from sam3.eval.coco_eval_offline import convert_to_xywh
|
||||
from sam3.train.masks_ops import rle_encode
|
||||
from sam3.train.utils.distributed import (
|
||||
all_gather,
|
||||
gather_to_rank_0_via_filesys,
|
||||
get_rank,
|
||||
is_main_process,
|
||||
)
|
||||
|
||||
|
||||
### Helper functions and classes
|
||||
|
||||
|
||||
class HeapElement:
    """Wrapper ordering heap entries by their prediction ``score``.

    ``heapq`` builds min-heaps using ``<``; wrapping each prediction dict in
    this class makes the heap root the lowest-scoring kept prediction, so
    ``heappushpop`` evicts the worst one first.
    """

    def __init__(self, val):
        self.val = val

    def __lt__(self, other):
        # Compare solely on the prediction score.
        mine, theirs = self.val["score"], other.val["score"]
        return mine < theirs
|
||||
|
||||
|
||||
class PredictionDumper:
    """
    Handles collection and dumping of COCO-format predictions from a model.

    This class processes model outputs through a postprocessor, converts them to COCO format,
    and saves them to disk. It supports distributed processing with multiple GPUs/processes.
    """

    def __init__(
        self,
        dump_dir: str,
        postprocessor,
        maxdets: int,
        iou_type: str,
        gather_pred_via_filesys: bool = False,
        merge_predictions: bool = False,
        pred_file_evaluators: Optional[Any] = None,
    ):
        """
        Initialize the PredictionDumper.

        Args:
            dump_dir: Directory to dump predictions.
            postprocessor: Module to convert the model's output into COCO format.
            maxdets: Maximum number of detections per image.
            iou_type: IoU type to evaluate. Can include "bbox", "segm"
            gather_pred_via_filesys: If True, use the filesystem for collective gathers across
                processes (requires a shared filesystem). Otherwise, use torch collective ops.
            merge_predictions: If True, merge predictions from all processes and dump to a single file.
            pred_file_evaluators: Optional evaluators run on the merged prediction file
                (requires ``merge_predictions=True`` and a ``dump_dir``).
        """
        self.iou_type = iou_type
        self.maxdets = maxdets
        self.dump_dir = dump_dir
        self.postprocessor = postprocessor
        self.gather_pred_via_filesys = gather_pred_via_filesys
        self.merge_predictions = merge_predictions
        self.pred_file_evaluators = pred_file_evaluators
        if self.pred_file_evaluators is not None:
            # File-based evaluators consume a single merged file, hence both checks.
            assert (
                merge_predictions
            ), "merge_predictions must be True if pred_file_evaluators are provided"
            assert self.dump_dir is not None, "dump_dir must be provided"

        if is_main_process():
            os.makedirs(self.dump_dir, exist_ok=True)
            logging.info(f"Created prediction dump directory: {self.dump_dir}")

        # Initialize state
        self.reset()

    def update(self, *args, **kwargs):
        """
        Process and accumulate predictions from model outputs.

        Args:
            *args, **kwargs: Arguments passed to postprocessor.process_results()
        """
        predictions = self.postprocessor.process_results(*args, **kwargs)
        results = self.prepare(predictions, self.iou_type)
        self._dump(results)

    def _dump(self, results):
        """
        Add results to the dump list with precision rounding.

        Args:
            results: List of prediction dictionaries in COCO format.
        """
        # Deep-copy so the caller's result dicts are not mutated; round floats
        # to 5 decimals to keep the dumped JSON compact.
        dumped_results = copy.deepcopy(results)
        for r in dumped_results:
            if "bbox" in r:
                r["bbox"] = [round(coord, 5) for coord in r["bbox"]]
            r["score"] = round(r["score"], 5)
        self.dump.extend(dumped_results)

    def synchronize_between_processes(self):
        """
        Synchronize predictions across all processes and save to disk.

        If gather_pred_via_filesys is True, uses filesystem for gathering.
        Otherwise, uses torch distributed collective operations.
        Saves per-rank predictions to separate JSON files.

        Returns:
            Path of the dumped file: per-rank file when ``merge_predictions``
            is False, otherwise the single merged file (written on rank 0 only).
        """
        logging.info("Prediction Dumper: Synchronizing between processes")

        if not self.merge_predictions:
            # Each rank writes its own file, suffixed with its rank id.
            dumped_file = (
                Path(self.dump_dir)
                / f"coco_predictions_{self.iou_type}_{get_rank()}.json"
            )
            logging.info(
                f"Prediction Dumper: Dumping local predictions to {dumped_file}"
            )
            with g_pathmgr.open(str(dumped_file), "w") as f:
                json.dump(self.dump, f)
        else:
            # Merge across ranks; only the main process writes the result.
            self.dump = self.gather_and_merge_predictions()
            dumped_file = Path(self.dump_dir) / f"coco_predictions_{self.iou_type}.json"
            if is_main_process():
                logging.info(
                    f"Prediction Dumper: Dumping merged predictions to {dumped_file}"
                )
                with g_pathmgr.open(str(dumped_file), "w") as f:
                    json.dump(self.dump, f)

        self.reset()
        return dumped_file

    def gather_and_merge_predictions(self):
        """
        Gather predictions from all processes and merge them, keeping top predictions per image.

        This method collects predictions from all processes, then keeps only the top maxdets
        predictions per image based on score. It also deduplicates predictions by (image_id, category_id).

        Returns:
            List of merged prediction dictionaries.
        """
        logging.info("Prediction Dumper: Gathering predictions from all processes")
        gc.collect()

        if self.gather_pred_via_filesys:
            dump = gather_to_rank_0_via_filesys(self.dump)
        else:
            dump = all_gather(self.dump, force_cpu=True)

        # Combine predictions, keeping only top maxdets per image
        preds_by_image = defaultdict(list)
        seen_img_cat = set()

        for cur_dump in dump:
            # Dedup policy: the first rank to contribute a given
            # (image_id, category_id) pair wins; later ranks' entries for
            # that pair are dropped, but duplicates within one rank's dump
            # are all kept.
            cur_seen_img_cat = set()
            for p in cur_dump:
                image_id = p["image_id"]
                cat_id = p["category_id"]

                # Skip if we've already seen this image/category pair in a previous dump
                if (image_id, cat_id) in seen_img_cat:
                    continue

                cur_seen_img_cat.add((image_id, cat_id))

                # Use a min-heap to keep top predictions (root = lowest score,
                # evicted first once the heap holds maxdets entries)
                if len(preds_by_image[image_id]) < self.maxdets:
                    heapq.heappush(preds_by_image[image_id], HeapElement(p))
                else:
                    heapq.heappushpop(preds_by_image[image_id], HeapElement(p))

            seen_img_cat.update(cur_seen_img_cat)

        # Flatten the heap elements back to a list
        merged_dump = sum(
            [[h.val for h in cur_preds] for cur_preds in preds_by_image.values()], []
        )

        return merged_dump

    def compute_synced(self):
        """
        Synchronize predictions across processes and compute summary.

        Returns:
            Summary dictionary from the pred-file evaluators (main process
            only); a dummy ``{"": 0.0}`` on other ranks or when no evaluators
            are configured.
        """
        dumped_file = self.synchronize_between_processes()
        if not is_main_process():
            return {"": 0.0}

        meters = {}
        if self.pred_file_evaluators is not None:
            for evaluator in self.pred_file_evaluators:
                results = evaluator.evaluate(dumped_file)
                meters.update(results)

        if len(meters) == 0:
            meters = {"": 0.0}
        return meters

    def compute(self):
        """
        Compute without synchronization.

        Returns:
            Empty metric dictionary.
        """
        return {"": 0.0}

    def reset(self):
        """Reset internal state for a new evaluation round."""
        # Accumulated COCO-format prediction dicts since the last sync.
        self.dump = []

    def prepare(self, predictions, iou_type):
        """
        Route predictions to the appropriate preparation method based on iou_type.

        Args:
            predictions: Dictionary mapping image IDs to prediction dictionaries.
            iou_type: Type of evaluation ("bbox", "segm").

        Returns:
            List of COCO-format prediction dictionaries.

        Raises:
            ValueError: If ``iou_type`` is not "bbox" or "segm".
        """
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        elif iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        else:
            raise ValueError(f"Unknown iou type: {iou_type}")

    def prepare_for_coco_detection(self, predictions):
        """
        Convert predictions to COCO detection format.

        Args:
            predictions: Dictionary mapping image IDs to prediction dictionaries
                containing "boxes", "scores", and "labels".

        Returns:
            List of COCO-format detection dictionaries.
        """
        coco_results = []
        for original_id, prediction in predictions.items():
            # Skip images with no predictions.
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                    for k, box in enumerate(boxes)
                ]
            )
        return coco_results

    @torch.no_grad()
    def prepare_for_coco_segmentation(self, predictions):
        """
        Convert predictions to COCO segmentation format.

        Args:
            predictions: Dictionary mapping image IDs to prediction dictionaries
                containing "masks" or "masks_rle", "scores", and "labels".
                Optionally includes "boundaries" and "dilated_boundaries".

        Returns:
            List of COCO-format segmentation dictionaries with RLE-encoded masks.
        """
        coco_results = []
        for original_id, prediction in predictions.items():
            # Skip images with no predictions.
            if len(prediction) == 0:
                continue

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            boxes = None
            if "boxes" in prediction:
                boxes = prediction["boxes"]
                boxes = convert_to_xywh(boxes).tolist()
                assert len(boxes) == len(scores)

            if "masks_rle" in prediction:
                # Masks already RLE-encoded: only compute normalized areas.
                rles = prediction["masks_rle"]
                areas = []
                for rle in rles:
                    cur_area = mask_utils.area(rle)
                    h, w = rle["size"]
                    areas.append(cur_area / (h * w))
            else:
                # Binarize soft masks at 0.5 before RLE-encoding.
                masks = prediction["masks"]
                masks = masks > 0.5
                h, w = masks.shape[-2:]

                # NOTE(review): "area" here is the mask area normalized by the
                # image area (a fraction in [0, 1]), not the absolute pixel
                # count that standard COCO annotations use — confirm that
                # downstream consumers expect this convention.
                areas = masks.flatten(1).sum(1) / (h * w)
                areas = areas.tolist()

                rles = rle_encode(masks.squeeze(1))

                # Memory cleanup
                del masks
                del prediction["masks"]

            assert len(areas) == len(rles) == len(scores)

            for k, rle in enumerate(rles):
                payload = {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                    "area": areas[k],
                }
                if boxes is not None:
                    payload["bbox"] = boxes[k]

                coco_results.append(payload)

        return coco_results
|
||||
211
sam3/eval/conversion_util.py
Normal file
211
sam3/eval/conversion_util.py
Normal file
@@ -0,0 +1,211 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS dataset to COCO-style video instance segmentation format.

    Args:
        ann_json (str): Path to YouTube VIS annotation JSON file
        save_path (str, optional): Path to save the converted COCO-style JSON.
            When None, the converted dict is returned without writing to disk.

    Returns:
        dict: The converted COCO-style annotation dictionary.
    """
    # Initialize COCO structure
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": [],
        "licenses": [],
    }

    # Load original annotations (context manager so the handle is closed;
    # the original json.load(open(...)) leaked the file object)
    with open(ann_json) as f:
        official_anns = json.load(f)
    VIS["categories"] = official_anns["categories"]  # Direct copy categories

    # Counters for globally unique, 1-indexed image / annotation ids
    records = dict(img_id=1, ann_id=1)

    # Create video-to-annotations mapping
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)

    # Each YTVIS annotation becomes one track (annotation id reused as track id)
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]

    # Process videos
    for video_info in tqdm(official_anns["videos"]):
        # Create video entry
        video = {
            "id": video_info["id"],
            "name": os.path.dirname(video_info["file_names"][0]),
            "width": video_info["width"],
            "height": video_info["height"],
            "length": video_info["length"],
            "neg_category_ids": [],
            "not_exhaustive_category_ids": [],
        }
        VIS["videos"].append(video)

        # Process frames
        num_frames = len(video_info["file_names"])
        for frame_idx in range(num_frames):
            # Create image entry
            image = {
                "id": records["img_id"],
                "video_id": video_info["id"],
                "file_name": video_info["file_names"][frame_idx],
                "width": video_info["width"],
                "height": video_info["height"],
                "frame_index": frame_idx,
                "frame_id": frame_idx,
            }
            VIS["images"].append(image)

            # Process annotations for this frame
            if video_info["id"] in vid_to_anns:
                for ann in vid_to_anns[video_info["id"]]:
                    bbox = ann["bboxes"][frame_idx]
                    # A None bbox means the object is absent in this frame.
                    if bbox is None:
                        continue

                    # Create annotation entry
                    annotation = {
                        "id": records["ann_id"],
                        "video_id": video_info["id"],
                        "image_id": records["img_id"],
                        "track_id": ann["id"],
                        "category_id": ann["category_id"],
                        "bbox": bbox,
                        "area": ann["areas"][frame_idx],
                        "segmentation": ann["segmentations"][frame_idx],
                        "iscrowd": ann["iscrowd"],
                    }
                    VIS["annotations"].append(annotation)
                    records["ann_id"] += 1

            records["img_id"] += 1

    # Print summary
    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")

    if save_path is None:
        return VIS

    # Save output. BUGFIX: only create the parent directory when there is
    # one — os.makedirs("") raises when save_path is a bare filename.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # Write via a context manager (the original leaked the file handle and
    # did not guarantee a flush/close on error).
    with open(save_path, "w") as f:
        json.dump(VIS, f)

    return VIS
|
||||
|
||||
|
||||
def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """
    Convert YouTubeVIS predictions to COCO format with video_id preservation

    Args:
        youtubevis_pred_path: Path to YouTubeVIS prediction JSON
        converted_dataset_path: Path to converted COCO dataset JSON
        output_path: Path to save COCO format predictions
    """

    # Load YouTubeVIS predictions and the converted dataset (for id mapping)
    with open(youtubevis_pred_path) as f:
        ytv_predictions = json.load(f)
    with open(converted_dataset_path) as f:
        coco_dataset = json.load(f)

    # (video_id, frame_idx) -> image_id lookup built from the converted dataset
    image_id_map = {
        (img["video_id"], img["frame_index"]): img["id"]
        for img in coco_dataset["images"]
    }

    coco_annotations = []

    # Every track-level prediction gets its own unique 1-indexed track id,
    # whether or not it contributes any frame annotation.
    for track_id, pred in enumerate(tqdm(ytv_predictions), start=1):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        score = pred["score"]

        # Per-frame segmentations / areas are optional in the prediction file;
        # pad with None so the zip below always covers every frame.
        segmentations = pred.get("segmentations", [])
        areas = pred.get("areas", [])
        if len(segmentations) == 0:
            segmentations = [None] * len(bboxes)
        if len(areas) == 0:
            areas = [None] * len(bboxes)

        for frame_idx, (bbox, segmentation, area_from_pred) in enumerate(
            zip(bboxes, segmentations, areas)
        ):
            # Skip frames with missing objects (None or zero bbox)
            if bbox is None or all(x == 0 for x in bbox):
                continue

            # Map (video, frame) back to the image id of the converted dataset
            image_id = image_id_map.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
                )

            x, y, w, h = bbox

            # Prefer the predicted area when present and positive; otherwise
            # fall back to the bbox area.
            area = (
                area_from_pred
                if area_from_pred is not None and area_from_pred > 0
                else w * h
            )

            coco_annotation = {
                "image_id": int(image_id),
                "video_id": video_id,  # Preserve the source video id
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }
            if segmentation is not None:
                coco_annotation["segmentation"] = segmentation

            coco_annotations.append(coco_annotation)

    # Save output
    with open(output_path, "w") as f:
        json.dump(coco_annotations, f)

    print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")
|
||||
658
sam3/eval/demo_eval.py
Normal file
658
sam3/eval/demo_eval.py
Normal file
@@ -0,0 +1,658 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
"""
|
||||
This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
|
||||
This means that the model's predictions are thresholded and evaluated as "hard" predictions.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import pycocotools.mask as maskUtils
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
|
||||
from sam3.eval.coco_eval import CocoEvaluator
|
||||
from sam3.train.masks_ops import compute_F_measure
|
||||
from sam3.train.utils.distributed import is_main_process
|
||||
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
|
||||
class DemoEval(COCOeval):
|
||||
"""
|
||||
This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
|
||||
This means that the model's predictions are thresholded and evaluated as "hard" predictions.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    coco_gt=None,
    coco_dt=None,
    iouType="bbox",
    threshold=0.5,
    compute_JnF=False,
):
    """
    Args:
        coco_gt (COCO): ground truth COCO API
        coco_dt (COCO): detections COCO API
        iouType (str): type of IoU to evaluate ("bbox" or "segm")
        threshold (float): score threshold; predictions at or above it are
            kept as "hard" detections, the rest are discarded
        compute_JnF (bool): whether to additionally compute the F (boundary)
            measure during evaluation — presumably the J&F protocol; confirm
            against the evaluation code that consumes this flag
    """
    super().__init__(coco_gt, coco_dt, iouType)
    self.threshold = threshold

    # Demo-style evaluation is class-agnostic: predictions and GTs from all
    # categories are pooled together.
    self.params.useCats = False
    # Single "all" area range and an effectively unlimited detection cap,
    # since hard-thresholded predictions should not be truncated by rank.
    self.params.areaRng = [[0**2, 1e5**2]]
    self.params.areaRngLbl = ["all"]
    self.params.maxDets = [100000]
    self.compute_JnF = compute_JnF
|
||||
|
||||
def computeIoU(self, imgId, catId):
    """
    Compute IoUs between all detections and ground truths of one image.

    Same as the original ``COCOeval.computeIoU``, but without sorting the
    detections by score and without truncating to ``maxDets`` — the demo
    setting keeps all thresholded predictions, and the dt ordering must stay
    aligned with ``self._dts``.

    Returns the IoU matrix from ``maskUtils.iou`` (one row per dt, one
    column per gt), or ``[]`` when the image has neither dts nor gts.
    """
    p = self.params
    if p.useCats:
        gt = self._gts[imgId, catId]
        dt = self._dts[imgId, catId]
    else:
        # Pool annotations across all categories (class-agnostic evaluation).
        gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
        dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
    if len(gt) == 0 and len(dt) == 0:
        return []

    if p.iouType == "segm":
        g = [g["segmentation"] for g in gt]
        d = [d["segmentation"] for d in dt]
    elif p.iouType == "bbox":
        g = [g["bbox"] for g in gt]
        d = [d["bbox"] for d in dt]
    else:
        raise Exception("unknown iouType for iou computation")

    # compute iou between each dt and gt region; crowd GTs get pycocotools'
    # special "ignore" IoU semantics via the iscrowd flags
    iscrowd = [int(o["iscrowd"]) for o in gt]
    ious = maskUtils.iou(d, g, iscrowd)
    return ious
|
||||
|
||||
def evaluateImg(self, imgId, catId, aRng, maxDet):
|
||||
"""
|
||||
perform evaluation for single category and image
|
||||
:return: dict (single image results)
|
||||
"""
|
||||
p = self.params
|
||||
assert not p.useCats, "This evaluator does not support per-category evaluation."
|
||||
assert catId == -1
|
||||
all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
|
||||
keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
|
||||
gt = [g for g in all_gts if not g["ignore"]]
|
||||
all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
|
||||
keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
|
||||
dt = [d for d in all_dts if d["score"] >= self.threshold]
|
||||
if len(gt) == 0 and len(dt) == 0:
|
||||
# This is a "true negative" case, where there are no GTs and no predictions
|
||||
# The box-level metrics are ill-defined, so we don't add them to this dict
|
||||
return {
|
||||
"image_id": imgId,
|
||||
"IL_TP": 0,
|
||||
"IL_TN": 1,
|
||||
"IL_FP": 0,
|
||||
"IL_FN": 0,
|
||||
"IL_perfect_neg": np.ones((len(p.iouThrs),), dtype=np.int64),
|
||||
"num_dt": len(dt),
|
||||
}
|
||||
|
||||
if len(gt) > 0 and len(dt) == 0:
|
||||
# This is a "false negative" case, where there are GTs but no predictions
|
||||
return {
|
||||
"image_id": imgId,
|
||||
"IL_TP": 0,
|
||||
"IL_TN": 0,
|
||||
"IL_FP": 0,
|
||||
"IL_FN": 1,
|
||||
"TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
|
||||
"FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
|
||||
"FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
|
||||
"local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
|
||||
"local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
|
||||
"IL_perfect_pos": np.zeros((len(p.iouThrs),), dtype=np.int64),
|
||||
"num_dt": len(dt),
|
||||
}
|
||||
|
||||
# Load pre-computed ious
|
||||
ious = self.ious[(imgId, catId)]
|
||||
|
||||
# compute matching
|
||||
if len(ious) == 0:
|
||||
ious = np.zeros((len(dt), len(gt)))
|
||||
else:
|
||||
ious = ious[keep_dt, :][:, keep_gt]
|
||||
assert ious.shape == (len(dt), len(gt))
|
||||
|
||||
matched_dt, matched_gt = linear_sum_assignment(-ious)
|
||||
|
||||
match_scores = ious[matched_dt, matched_gt]
|
||||
|
||||
if self.compute_JnF and len(match_scores) > 0:
|
||||
j_score = match_scores.mean()
|
||||
f_measure = 0
|
||||
for dt_id, gt_id in zip(matched_dt, matched_gt):
|
||||
f_measure += compute_F_measure(
|
||||
gt_boundary_rle=gt[gt_id]["boundary"],
|
||||
gt_dilated_boundary_rle=gt[gt_id]["dilated_boundary"],
|
||||
dt_boundary_rle=dt[dt_id]["boundary"],
|
||||
dt_dilated_boundary_rle=dt[dt_id]["dilated_boundary"],
|
||||
)
|
||||
f_measure /= len(match_scores) + 1e-9
|
||||
JnF = (j_score + f_measure) * 0.5
|
||||
else:
|
||||
j_score = f_measure = JnF = -1
|
||||
|
||||
TPs, FPs, FNs = [], [], []
|
||||
IL_perfect = []
|
||||
for thresh in p.iouThrs:
|
||||
TP = (match_scores >= thresh).sum()
|
||||
FP = len(dt) - TP
|
||||
FN = len(gt) - TP
|
||||
assert (
|
||||
FP >= 0 and FN >= 0
|
||||
), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
|
||||
TPs.append(TP)
|
||||
FPs.append(FP)
|
||||
FNs.append(FN)
|
||||
|
||||
if FP == FN and FP == 0:
|
||||
IL_perfect.append(1)
|
||||
else:
|
||||
IL_perfect.append(0)
|
||||
|
||||
TPs = np.array(TPs, dtype=np.int64)
|
||||
FPs = np.array(FPs, dtype=np.int64)
|
||||
FNs = np.array(FNs, dtype=np.int64)
|
||||
IL_perfect = np.array(IL_perfect, dtype=np.int64)
|
||||
|
||||
# compute precision recall and F1
|
||||
precision = TPs / (TPs + FPs + 1e-4)
|
||||
assert np.all(precision <= 1)
|
||||
recall = TPs / (TPs + FNs + 1e-4)
|
||||
assert np.all(recall <= 1)
|
||||
F1 = 2 * precision * recall / (precision + recall + 1e-4)
|
||||
|
||||
result = {
|
||||
"image_id": imgId,
|
||||
"TPs": TPs,
|
||||
"FPs": FPs,
|
||||
"FNs": FNs,
|
||||
"local_F1s": F1,
|
||||
"IL_TP": (len(gt) > 0) and (len(dt) > 0),
|
||||
"IL_FP": (len(gt) == 0) and (len(dt) > 0),
|
||||
"IL_TN": (len(gt) == 0) and (len(dt) == 0),
|
||||
"IL_FN": (len(gt) > 0) and (len(dt) == 0),
|
||||
("IL_perfect_pos" if len(gt) > 0 else "IL_perfect_neg"): IL_perfect,
|
||||
"F": f_measure,
|
||||
"J": j_score,
|
||||
"J&F": JnF,
|
||||
"num_dt": len(dt),
|
||||
}
|
||||
if len(gt) > 0 and len(dt) > 0:
|
||||
result["local_positive_F1s"] = F1
|
||||
return result
|
||||
|
||||
def accumulate(self, p=None):
|
||||
"""
|
||||
Accumulate per image evaluation results and store the result in self.eval
|
||||
:param p: input params for evaluation
|
||||
:return: None
|
||||
"""
|
||||
if not self.evalImgs:
|
||||
print("Please run evaluate() first")
|
||||
# allows input customized parameters
|
||||
if p is None:
|
||||
p = self.params
|
||||
|
||||
setImgIds = set(p.imgIds)
|
||||
|
||||
# TPs, FPs, FNs
|
||||
TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
|
||||
FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
|
||||
pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
|
||||
FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
|
||||
local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)
|
||||
|
||||
# Image level metrics
|
||||
IL_TPs = 0
|
||||
IL_FPs = 0
|
||||
IL_TNs = 0
|
||||
IL_FNs = 0
|
||||
IL_perfects_neg = np.zeros((len(p.iouThrs),), dtype=np.int64)
|
||||
IL_perfects_pos = np.zeros((len(p.iouThrs),), dtype=np.int64)
|
||||
|
||||
# JnF metric
|
||||
total_J = 0
|
||||
total_F = 0
|
||||
total_JnF = 0
|
||||
|
||||
valid_img_count = 0
|
||||
total_pos_count = 0
|
||||
total_neg_count = 0
|
||||
valid_J_count = 0
|
||||
valid_F1_count = 0
|
||||
valid_F1_count_w0dt = 0
|
||||
for res in self.evalImgs:
|
||||
if res["image_id"] not in setImgIds:
|
||||
continue
|
||||
IL_TPs += res["IL_TP"]
|
||||
IL_FPs += res["IL_FP"]
|
||||
IL_TNs += res["IL_TN"]
|
||||
IL_FNs += res["IL_FN"]
|
||||
if "IL_perfect_neg" in res:
|
||||
IL_perfects_neg += res["IL_perfect_neg"]
|
||||
total_neg_count += 1
|
||||
else:
|
||||
assert "IL_perfect_pos" in res
|
||||
IL_perfects_pos += res["IL_perfect_pos"]
|
||||
total_pos_count += 1
|
||||
|
||||
if "TPs" not in res:
|
||||
continue
|
||||
|
||||
TPs += res["TPs"]
|
||||
FPs += res["FPs"]
|
||||
FNs += res["FNs"]
|
||||
valid_img_count += 1
|
||||
|
||||
if "local_positive_F1s" in res:
|
||||
local_F1s += res["local_positive_F1s"]
|
||||
pmFPs += res["FPs"]
|
||||
valid_F1_count_w0dt += 1
|
||||
if res["num_dt"] > 0:
|
||||
valid_F1_count += 1
|
||||
|
||||
if "J" in res and res["J"] > -1e-9:
|
||||
total_J += res["J"]
|
||||
total_F += res["F"]
|
||||
total_JnF += res["J&F"]
|
||||
valid_J_count += 1
|
||||
|
||||
# compute precision recall and F1
|
||||
precision = TPs / (TPs + FPs + 1e-4)
|
||||
positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
|
||||
assert np.all(precision <= 1)
|
||||
recall = TPs / (TPs + FNs + 1e-4)
|
||||
assert np.all(recall <= 1)
|
||||
F1 = 2 * precision * recall / (precision + recall + 1e-4)
|
||||
positive_micro_F1 = (
|
||||
2
|
||||
* positive_micro_precision
|
||||
* recall
|
||||
/ (positive_micro_precision + recall + 1e-4)
|
||||
)
|
||||
|
||||
IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
|
||||
IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
|
||||
IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
|
||||
IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
|
||||
IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
|
||||
(
|
||||
float(IL_TPs + IL_FPs)
|
||||
* float(IL_TPs + IL_FNs)
|
||||
* float(IL_TNs + IL_FPs)
|
||||
* float(IL_TNs + IL_FNs)
|
||||
)
|
||||
** 0.5
|
||||
+ 1e-6
|
||||
)
|
||||
IL_perfect_pos = IL_perfects_pos / (total_pos_count + 1e-9)
|
||||
IL_perfect_neg = IL_perfects_neg / (total_neg_count + 1e-9)
|
||||
|
||||
total_J = total_J / (valid_J_count + 1e-9)
|
||||
total_F = total_F / (valid_J_count + 1e-9)
|
||||
total_JnF = total_JnF / (valid_J_count + 1e-9)
|
||||
|
||||
self.eval = {
|
||||
"params": p,
|
||||
"TPs": TPs,
|
||||
"FPs": FPs,
|
||||
"positive_micro_FPs": pmFPs,
|
||||
"FNs": FNs,
|
||||
"precision": precision,
|
||||
"positive_micro_precision": positive_micro_precision,
|
||||
"recall": recall,
|
||||
"F1": F1,
|
||||
"positive_micro_F1": positive_micro_F1,
|
||||
"positive_macro_F1": local_F1s / valid_F1_count,
|
||||
"positive_w0dt_macro_F1": local_F1s / valid_F1_count_w0dt,
|
||||
"IL_recall": IL_rec,
|
||||
"IL_precision": IL_prec,
|
||||
"IL_F1": IL_F1,
|
||||
"IL_FPR": IL_FPR,
|
||||
"IL_MCC": IL_MCC,
|
||||
"IL_perfect_pos": IL_perfect_pos,
|
||||
"IL_perfect_neg": IL_perfect_neg,
|
||||
"J": total_J,
|
||||
"F": total_F,
|
||||
"J&F": total_JnF,
|
||||
}
|
||||
self.eval["CGF1"] = self.eval["positive_macro_F1"] * self.eval["IL_MCC"]
|
||||
self.eval["CGF1_w0dt"] = (
|
||||
self.eval["positive_w0dt_macro_F1"] * self.eval["IL_MCC"]
|
||||
)
|
||||
self.eval["CGF1_micro"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]
|
||||
|
||||
def summarize(self):
|
||||
"""
|
||||
Compute and display summary metrics for evaluation results.
|
||||
Note this functin can *only* be applied on the default parameter setting
|
||||
"""
|
||||
if not self.eval:
|
||||
raise Exception("Please run accumulate() first")
|
||||
|
||||
def _summarize(iouThr=None, metric=""):
|
||||
p = self.params
|
||||
iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
|
||||
titleStr = "Average " + metric
|
||||
iouStr = (
|
||||
"{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
|
||||
if iouThr is None
|
||||
else "{:0.2f}".format(iouThr)
|
||||
)
|
||||
|
||||
s = self.eval[metric]
|
||||
# IoU
|
||||
if iouThr is not None:
|
||||
t = np.where(iouThr == p.iouThrs)[0]
|
||||
s = s[t]
|
||||
|
||||
if len(s[s > -1]) == 0:
|
||||
mean_s = -1
|
||||
else:
|
||||
mean_s = np.mean(s[s > -1])
|
||||
print(iStr.format(titleStr, iouStr, mean_s))
|
||||
return mean_s
|
||||
|
||||
def _summarize_single(metric=""):
|
||||
titleStr = "Average " + metric
|
||||
iStr = " {:<35} = {:0.3f}"
|
||||
s = self.eval[metric]
|
||||
print(iStr.format(titleStr, s))
|
||||
return s
|
||||
|
||||
def _summarizeDets():
|
||||
# note: the index of these metrics are also used in video Demo F1 evaluation
|
||||
# when adding new metrics, please update the index in video Demo F1 evaluation
|
||||
# in "evaluate" method of the "VideoDemoF1Evaluator" class
|
||||
stats = np.zeros((len(DEMO_METRICS),))
|
||||
stats[0] = _summarize(metric="CGF1")
|
||||
stats[1] = _summarize(metric="precision")
|
||||
stats[2] = _summarize(metric="recall")
|
||||
stats[3] = _summarize(metric="F1")
|
||||
stats[4] = _summarize(metric="positive_macro_F1")
|
||||
stats[5] = _summarize_single(metric="IL_precision")
|
||||
stats[6] = _summarize_single(metric="IL_recall")
|
||||
stats[7] = _summarize_single(metric="IL_F1")
|
||||
stats[8] = _summarize_single(metric="IL_FPR")
|
||||
stats[9] = _summarize_single(metric="IL_MCC")
|
||||
stats[10] = _summarize(metric="IL_perfect_pos")
|
||||
stats[11] = _summarize(metric="IL_perfect_neg")
|
||||
stats[12] = _summarize(iouThr=0.5, metric="CGF1")
|
||||
stats[13] = _summarize(iouThr=0.5, metric="precision")
|
||||
stats[14] = _summarize(iouThr=0.5, metric="recall")
|
||||
stats[15] = _summarize(iouThr=0.5, metric="F1")
|
||||
stats[16] = _summarize(iouThr=0.5, metric="positive_macro_F1")
|
||||
stats[17] = _summarize(iouThr=0.5, metric="IL_perfect_pos")
|
||||
stats[18] = _summarize(iouThr=0.5, metric="IL_perfect_neg")
|
||||
stats[19] = _summarize(iouThr=0.75, metric="CGF1")
|
||||
stats[20] = _summarize(iouThr=0.75, metric="precision")
|
||||
stats[21] = _summarize(iouThr=0.75, metric="recall")
|
||||
stats[22] = _summarize(iouThr=0.75, metric="F1")
|
||||
stats[23] = _summarize(iouThr=0.75, metric="positive_macro_F1")
|
||||
stats[24] = _summarize(iouThr=0.75, metric="IL_perfect_pos")
|
||||
stats[25] = _summarize(iouThr=0.75, metric="IL_perfect_neg")
|
||||
stats[26] = _summarize_single(metric="J")
|
||||
stats[27] = _summarize_single(metric="F")
|
||||
stats[28] = _summarize_single(metric="J&F")
|
||||
stats[29] = _summarize(metric="CGF1_micro")
|
||||
stats[30] = _summarize(metric="positive_micro_precision")
|
||||
stats[31] = _summarize(metric="positive_micro_F1")
|
||||
stats[32] = _summarize(iouThr=0.5, metric="CGF1_micro")
|
||||
stats[33] = _summarize(iouThr=0.5, metric="positive_micro_precision")
|
||||
stats[34] = _summarize(iouThr=0.5, metric="positive_micro_F1")
|
||||
stats[35] = _summarize(iouThr=0.75, metric="CGF1_micro")
|
||||
stats[36] = _summarize(iouThr=0.75, metric="positive_micro_precision")
|
||||
stats[37] = _summarize(iouThr=0.75, metric="positive_micro_F1")
|
||||
stats[38] = _summarize(metric="CGF1_w0dt")
|
||||
stats[39] = _summarize(metric="positive_w0dt_macro_F1")
|
||||
stats[40] = _summarize(iouThr=0.5, metric="CGF1_w0dt")
|
||||
stats[41] = _summarize(iouThr=0.5, metric="positive_w0dt_macro_F1")
|
||||
stats[42] = _summarize(iouThr=0.75, metric="CGF1_w0dt")
|
||||
stats[43] = _summarize(iouThr=0.75, metric="positive_w0dt_macro_F1")
|
||||
return stats
|
||||
|
||||
summarize = _summarizeDets
|
||||
self.stats = summarize()
|
||||
|
||||
|
||||
# Human-readable names for the entries of DemoEval.stats, in the exact order
# produced by DemoEval.summarize()/_summarizeDets(). Keep the two in sync:
# index i here labels stats[i].
DEMO_METRICS = [
    "CGF1",
    "Precision",
    "Recall",
    "F1",
    "Macro_F1",
    "IL_Precision",
    "IL_Recall",
    "IL_F1",
    "IL_FPR",
    "IL_MCC",
    "IL_perfect_pos",
    "IL_perfect_neg",
    "CGF1@0.5",
    "Precision@0.5",
    "Recall@0.5",
    "F1@0.5",
    "Macro_F1@0.5",
    "IL_perfect_pos@0.5",
    "IL_perfect_neg@0.5",
    "CGF1@0.75",
    "Precision@0.75",
    "Recall@0.75",
    "F1@0.75",
    "Macro_F1@0.75",
    "IL_perfect_pos@0.75",
    "IL_perfect_neg@0.75",
    "J",
    "F",
    "J&F",
    "CGF1_micro",
    "positive_micro_Precision",
    "positive_micro_F1",
    "CGF1_micro@0.5",
    "positive_micro_Precision@0.5",
    "positive_micro_F1@0.5",
    "CGF1_micro@0.75",
    "positive_micro_Precision@0.75",
    "positive_micro_F1@0.75",
    "CGF1_w0dt",
    "positive_w0dt_macro_F1",
    "CGF1_w0dt@0.5",
    "positive_w0dt_macro_F1@0.5",
    "CGF1_w0dt@0.75",
    "positive_w0dt_macro_F1@0.75",
]
|
||||
|
||||
|
||||
class DemoEvaluator(CocoEvaluator):
    """CocoEvaluator variant that aggregates results with DemoEval.

    Predictions are thresholded into hard detections (see DemoEval); one
    DemoEval instance is kept per (ground-truth, IoU-type) pair, which also
    enables "oracle" evaluation against multiple ground truths.
    """

    def __init__(
        self,
        coco_gt,
        iou_types,
        dump_dir: Optional[str],
        postprocessor,
        threshold=0.5,
        average_by_rarity=False,
        gather_pred_via_filesys=False,
        exhaustive_only=False,
        all_exhaustive_only=True,
        compute_JnF=False,
        metrics_dump_dir: Optional[str] = None,
    ):
        # threshold is the score cutoff applied by DemoEval when turning
        # soft predictions into hard detections.
        self.iou_types = iou_types
        self.threshold = threshold
        super().__init__(
            coco_gt=coco_gt,
            iou_types=iou_types,
            useCats=False,
            dump_dir=dump_dir,
            postprocessor=postprocessor,
            # average_by_rarity=average_by_rarity,
            gather_pred_via_filesys=gather_pred_via_filesys,
            exhaustive_only=exhaustive_only,
            all_exhaustive_only=all_exhaustive_only,
            metrics_dump_dir=metrics_dump_dir,
        )

        self.use_self_evaluate = True
        self.compute_JnF = compute_JnF

    def _lazy_init(self):
        # Idempotent: safe to call from summarize()/accumulate() repeatedly.
        if self.initialized:
            return
        super()._lazy_init()
        self.use_self_evaluate = True
        self.reset()

    def select_best_scoring(self, scorings):
        # This function is used for "oracle" type evaluation.
        # It accepts the evaluation results with respect to several ground truths, and picks the best
        if len(scorings) == 1:
            return scorings[0]

        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"

        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"

        # Pick, per image, the per-GT evaluation result with the best F1.
        selected_imgs = []
        for img_id in range(scorings[0].shape[-1]):
            best = scorings[0][:, :, img_id]

            for scoring in scorings[1:]:
                current = scoring[:, :, img_id]
                if "local_F1s" in best[0, 0] and "local_F1s" in current[0, 0]:
                    # we were able to compute a F1 score for this particular image in both evaluations
                    # best["local_F1s"] contains the results at various IoU thresholds. We simply take the average for comparison
                    best_score = best[0, 0]["local_F1s"].mean()
                    current_score = current[0, 0]["local_F1s"].mean()
                    if current_score > best_score:
                        best = current

                else:
                    # If we're here, it means that in some evaluation we were not able to get a valid local F1
                    # This happens when both the predictions and targets are empty. In that case, we can assume it's a perfect prediction
                    if "local_F1s" not in current[0, 0]:
                        best = current
            selected_imgs.append(best)
        result = np.stack(selected_imgs, axis=-1)
        assert result.shape == scorings[0].shape
        return result

    def summarize(self):
        """Accumulate and print metrics; returns a flat name->value dict.

        Only the main process returns results (others return an empty dict).
        """
        self._lazy_init()
        logging.info("Demo evaluator: Summarizing")
        if not is_main_process():
            return {}
        outs = {}
        # Multiple GTs means oracle-style evaluation; reflect that in key names.
        prefix = "oracle_" if len(self.coco_evals) > 1 else ""
        # if self.rarity_buckets is None:
        self.accumulate(self.eval_img_ids)
        for iou_type, coco_eval in self.coco_evals[0].items():
            print("Demo metric, IoU type={}".format(iou_type))
            coco_eval.summarize()

        # stats indices correspond to DEMO_METRICS (see DemoEval.summarize).
        if "bbox" in self.coco_evals[0]:
            for i, value in enumerate(self.coco_evals[0]["bbox"].stats):
                outs[f"coco_eval_bbox_{prefix}{DEMO_METRICS[i]}"] = value
        if "segm" in self.coco_evals[0]:
            for i, value in enumerate(self.coco_evals[0]["segm"].stats):
                outs[f"coco_eval_masks_{prefix}{DEMO_METRICS[i]}"] = value
        # else:
        #     total_stats = {}
        #     for bucket, img_list in self.rarity_buckets.items():
        #         self.accumulate(imgIds=img_list)
        #         bucket_name = RARITY_BUCKETS[bucket]
        #         for iou_type, coco_eval in self.coco_evals[0].items():
        #             print(
        #                 "Demo metric, IoU type={}, Rarity bucket={}".format(
        #                     iou_type, bucket_name
        #                 )
        #             )
        #             coco_eval.summarize()

        #         if "bbox" in self.coco_evals[0]:
        #             if "bbox" not in total_stats:
        #                 total_stats["bbox"] = np.zeros_like(
        #                     self.coco_evals[0]["bbox"].stats
        #                 )
        #             total_stats["bbox"] += self.coco_evals[0]["bbox"].stats
        #             for i, value in enumerate(self.coco_evals[0]["bbox"].stats):
        #                 outs[
        #                     f"coco_eval_bbox_{bucket_name}_{prefix}{DEMO_METRICS[i]}"
        #                 ] = value
        #         if "segm" in self.coco_evals[0]:
        #             if "segm" not in total_stats:
        #                 total_stats["segm"] = np.zeros_like(
        #                     self.coco_evals[0]["segm"].stats
        #                 )
        #             total_stats["segm"] += self.coco_evals[0]["segm"].stats
        #             for i, value in enumerate(self.coco_evals[0]["segm"].stats):
        #                 outs[
        #                     f"coco_eval_masks_{bucket_name}_{prefix}{DEMO_METRICS[i]}"
        #                 ] = value

        #     if "bbox" in total_stats:
        #         total_stats["bbox"] /= len(self.rarity_buckets)
        #         for i, value in enumerate(total_stats["bbox"]):
        #             outs[f"coco_eval_bbox_{prefix}{DEMO_METRICS[i]}"] = value
        #     if "segm" in total_stats:
        #         total_stats["segm"] /= len(self.rarity_buckets)
        #         for i, value in enumerate(total_stats["segm"]):
        #             outs[f"coco_eval_masks_{prefix}{DEMO_METRICS[i]}"] = value

        return outs

    def accumulate(self, imgIds=None):
        """Run DemoEval.accumulate on each IoU type, optionally restricted
        to a subset of image ids (main process only)."""
        self._lazy_init()
        logging.info(
            f"demo evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
        )
        if not is_main_process():
            return

        if imgIds is not None:
            for coco_eval in self.coco_evals[0].values():
                coco_eval.params.imgIds = list(imgIds)

        for coco_eval in self.coco_evals[0].values():
            coco_eval.accumulate()

    def reset(self):
        # Rebuild one DemoEval per (ground truth, IoU type) and clear all
        # per-run accumulation state.
        self.coco_evals = [{} for _ in range(len(self.coco_gts))]
        for i, coco_gt in enumerate(self.coco_gts):
            for iou_type in self.iou_types:
                self.coco_evals[i][iou_type] = DemoEval(
                    coco_gt=coco_gt,
                    iouType=iou_type,
                    threshold=self.threshold,
                    compute_JnF=self.compute_JnF,
                )
                self.coco_evals[i][iou_type].useCats = False
        self.img_ids = []
        self.eval_imgs = {k: [] for k in self.iou_types}
        if self.dump is not None:
            self.dump = []
|
||||
1
sam3/eval/hota_eval_toolkit/__init__.py
Normal file
1
sam3/eval/hota_eval_toolkit/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# flake8: noqa
|
||||
114
sam3/eval/hota_eval_toolkit/run_ytvis_eval.py
Normal file
114
sam3/eval/hota_eval_toolkit/run_ytvis_eval.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# flake8: noqa
|
||||
|
||||
"""run_youtube_vis.py
|
||||
Run example:
|
||||
run_youtube_vis.py --USE_PARALLEL False --METRICS HOTA --TRACKERS_TO_EVAL STEm_Seg
|
||||
Command Line Arguments: Defaults, # Comments
|
||||
Eval arguments:
|
||||
'USE_PARALLEL': False,
|
||||
'NUM_PARALLEL_CORES': 8,
|
||||
'BREAK_ON_ERROR': True, # Raises exception and exits with error
|
||||
'RETURN_ON_ERROR': False, # if not BREAK_ON_ERROR, then returns from function on error
|
||||
'LOG_ON_ERROR': os.path.join(code_path, 'error_log.txt'), # if not None, save any errors into a log file.
|
||||
'PRINT_RESULTS': True,
|
||||
'PRINT_ONLY_COMBINED': False,
|
||||
'PRINT_CONFIG': True,
|
||||
'TIME_PROGRESS': True,
|
||||
'DISPLAY_LESS_PROGRESS': True,
|
||||
'OUTPUT_SUMMARY': True,
|
||||
'OUTPUT_EMPTY_CLASSES': True, # If False, summary files are not output for classes with no detections
|
||||
'OUTPUT_DETAILED': True,
|
||||
'PLOT_CURVES': True,
|
||||
Dataset arguments:
|
||||
'GT_FOLDER': os.path.join(code_path, 'data/gt/youtube_vis/youtube_vis_training'), # Location of GT data
|
||||
'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/youtube_vis/youtube_vis_training'),
|
||||
# Trackers location
|
||||
'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
|
||||
'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder)
|
||||
'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes)
|
||||
'SPLIT_TO_EVAL': 'training', # Valid: 'training', 'val'
|
||||
'PRINT_CONFIG': True, # Whether to print current config
|
||||
'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
|
||||
'TRACKER_SUB_FOLDER': 'data', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
|
||||
'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
|
||||
Metric arguments:
|
||||
'METRICS': ['TrackMAP', 'HOTA', 'CLEAR', 'Identity']
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from multiprocessing import freeze_support
|
||||
|
||||
from . import trackeval
|
||||
|
||||
|
||||
def run_ytvis_eval(args=None, gt_json=None, dt_json=None):
    """Run YouTube-VIS evaluation (HOTA) with the bundled trackeval toolkit.

    Args:
        args: optional list of CLI-style arguments (e.g. ``sys.argv[1:]``)
            overriding the default eval/dataset/metrics configs.
        gt_json: optional ground-truth annotations as an already-parsed JSON
            object, bypassing file loading.
        dt_json: optional tracker results as an already-parsed JSON object,
            bypassing file loading.

    Returns:
        (output_res, output_msg) as produced by ``trackeval.Evaluator.evaluate``.

    Raises:
        Exception: if a boolean CLI override is not "True"/"False", or if no
            metric listed in the METRICS config is available.
    """
    # Command line interface:
    default_eval_config = trackeval.Evaluator.get_default_eval_config()
    # print only combined since TrackMAP is undefined for per sequence breakdowns
    default_eval_config["PRINT_ONLY_COMBINED"] = True
    default_dataset_config = trackeval.datasets.YouTubeVIS.get_default_dataset_config()
    default_metrics_config = {"METRICS": ["HOTA"]}
    config = {
        **default_eval_config,
        **default_dataset_config,
        **default_metrics_config,
    }  # Merge default configs
    parser = argparse.ArgumentParser()
    for setting in config.keys():
        # List-valued (or unset) defaults accept multiple values on the CLI.
        if isinstance(config[setting], list) or config[setting] is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args(args).__dict__
    for setting in args.keys():
        if args[setting] is not None:
            # argparse yields strings; coerce back to the default's type.
            if isinstance(config[setting], bool):
                if args[setting] == "True":
                    x = True
                elif args[setting] == "False":
                    x = False
                else:
                    raise Exception(
                        "Command line parameter " + setting + " must be True or False"
                    )
            elif isinstance(config[setting], int):
                x = int(args[setting])
            elif args[setting] is None:
                x = None
            else:
                x = args[setting]
            config[setting] = x
    # Split the merged config back into the three consumers' sub-configs.
    eval_config = {k: v for k, v in config.items() if k in default_eval_config.keys()}
    dataset_config = {
        k: v for k, v in config.items() if k in default_dataset_config.keys()
    }
    metrics_config = {
        k: v for k, v in config.items() if k in default_metrics_config.keys()
    }

    # Run code
    evaluator = trackeval.Evaluator(eval_config)
    # allow directly specifying the GT JSON data and Tracker (result)
    # JSON data as Python objects, without reading from files.
    dataset_config["GT_JSON_OBJECT"] = gt_json
    dataset_config["TRACKER_JSON_OBJECT"] = dt_json
    dataset_list = [trackeval.datasets.YouTubeVIS(dataset_config)]
    metrics_list = []
    # for metric in [trackeval.metrics.TrackMAP, trackeval.metrics.HOTA, trackeval.metrics.CLEAR,
    #                trackeval.metrics.Identity]:
    for metric in [trackeval.metrics.HOTA]:
        if metric.get_name() in metrics_config["METRICS"]:
            metrics_list.append(metric())
    if len(metrics_list) == 0:
        raise Exception("No metrics selected for evaluation")
    output_res, output_msg = evaluator.evaluate(dataset_list, metrics_list)
    return output_res, output_msg
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Required for multiprocessing support when the script is frozen into a
    # Windows executable; a no-op otherwise.
    freeze_support()
    # Forward CLI arguments (minus the program name) to the evaluator.
    run_ytvis_eval(sys.argv[1:])
|
||||
4
sam3/eval/hota_eval_toolkit/trackeval/__init__.py
Normal file
4
sam3/eval/hota_eval_toolkit/trackeval/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# flake8: noqa
|
||||
|
||||
from . import datasets, metrics, utils
|
||||
from .eval import Evaluator
|
||||
68
sam3/eval/hota_eval_toolkit/trackeval/_timing.py
Normal file
68
sam3/eval/hota_eval_toolkit/trackeval/_timing.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# flake8: noqa
|
||||
|
||||
import inspect
|
||||
from functools import wraps
|
||||
from time import perf_counter
|
||||
|
||||
# Global switches controlling whether decorated functions are timed and how
# verbose the per-call reporting is.
DO_TIMING = False
DISPLAY_LESS_PROGRESS = False
# Accumulated wall-clock time per function name, plus a call counter for
# free functions (used only when DO_TIMING is True).
timer_dict = {}
counter = 0


def time(f):
    """Decorator that optionally times ``f`` with ``perf_counter``.

    When the module-level ``DO_TIMING`` flag is False (the default), the
    wrapper simply calls ``f`` with no overhead beyond the function call.
    When True, per-call wall-clock time is printed and accumulated in
    ``timer_dict``; a summary is printed when ``Evaluator.evaluate`` returns.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if DO_TIMING:
            # Run function with timing
            ts = perf_counter()
            result = f(*args, **kw)
            te = perf_counter()
            tt = te - ts

            # Get function name (guard against zero-argument functions, for
            # which arg_names is empty and arg_names[0] would raise).
            arg_names = inspect.getfullargspec(f)[0]
            is_method = bool(arg_names) and arg_names[0] == "self"
            if is_method and DISPLAY_LESS_PROGRESS:
                return result
            elif is_method:
                method_name = type(args[0]).__name__ + "." + f.__name__
            else:
                method_name = f.__name__

            # Record accumulative time in each function for analysis
            if method_name in timer_dict.keys():
                timer_dict[method_name] += tt
            else:
                timer_dict[method_name] = tt

            # If code is finished, display timing summary
            if method_name == "Evaluator.evaluate":
                print("")
                print("Timing analysis:")
                for key, value in timer_dict.items():
                    print("%-70s %2.4f sec" % (key, value))
            else:
                # Get function argument values for printing special arguments of interest.
                # Bound-check against args (the argument may have been passed
                # by keyword) and stringify so join() never fails on non-str
                # values.
                arg_titles = ["tracker", "seq", "cls"]
                arg_vals = []
                for i, a in enumerate(arg_names):
                    if a in arg_titles and i < len(args):
                        arg_vals.append(str(args[i]))
                arg_text = "(" + ", ".join(arg_vals) + ")"

                # Display methods and functions with different indentation.
                if is_method:
                    print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
                elif arg_names and arg_names[0] == "test":
                    pass
                else:
                    global counter
                    counter += 1
                    print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))

            return result
        else:
            # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is true, run functions normally without timing.
            return f(*args, **kw)

    return wrap
|
||||
@@ -0,0 +1,4 @@
|
||||
# flake8: noqa
|
||||
|
||||
from .tao_ow import TAO_OW
|
||||
from .youtube_vis import YouTubeVIS
|
||||
379
sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py
Normal file
379
sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py
Normal file
@@ -0,0 +1,379 @@
|
||||
# flake8: noqa
|
||||
|
||||
import csv
|
||||
import io
|
||||
import os
|
||||
import traceback
|
||||
import zipfile
|
||||
from abc import ABC, abstractmethod
|
||||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing
|
||||
from ..utils import TrackEvalException
|
||||
|
||||
|
||||
class _BaseDataset(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self):
|
||||
self.tracker_list = None
|
||||
self.seq_list = None
|
||||
self.class_list = None
|
||||
self.output_fol = None
|
||||
self.output_sub_fol = None
|
||||
self.should_classes_combine = True
|
||||
self.use_super_categories = False
|
||||
|
||||
# Functions to implement:
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_default_dataset_config(): ...
|
||||
|
||||
@abstractmethod
|
||||
def _load_raw_file(self, tracker, seq, is_gt): ...
|
||||
|
||||
@_timing.time
|
||||
@abstractmethod
|
||||
def get_preprocessed_seq_data(self, raw_data, cls): ...
|
||||
|
||||
@abstractmethod
|
||||
def _calculate_similarities(self, gt_dets_t, tracker_dets_t): ...
|
||||
|
||||
# Helper functions for all datasets:
|
||||
|
||||
@classmethod
|
||||
def get_class_name(cls):
|
||||
return cls.__name__
|
||||
|
||||
def get_name(self):
|
||||
return self.get_class_name()
|
||||
|
||||
def get_output_fol(self, tracker):
|
||||
return os.path.join(self.output_fol, tracker, self.output_sub_fol)
|
||||
|
||||
def get_display_name(self, tracker):
|
||||
"""Can be overwritten if the trackers name (in files) is different to how it should be displayed.
|
||||
By default this method just returns the trackers name as is.
|
||||
"""
|
||||
return tracker
|
||||
|
||||
def get_eval_info(self):
|
||||
"""Return info about the dataset needed for the Evaluator"""
|
||||
return self.tracker_list, self.seq_list, self.class_list
|
||||
|
||||
@_timing.time
|
||||
def get_raw_seq_data(self, tracker, seq):
|
||||
"""Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
|
||||
Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
|
||||
A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for
|
||||
the evaluation of each class.
|
||||
|
||||
This returns a dict which contains the fields:
|
||||
[num_timesteps]: integer
|
||||
[gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
|
||||
list (for each timestep) of 1D NDArrays (for each det).
|
||||
[gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
|
||||
[similarity_scores]: list (for each timestep) of 2D NDArrays.
|
||||
[gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).
|
||||
|
||||
gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.
|
||||
|
||||
Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
|
||||
independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
|
||||
masks vs 2D boxes vs 3D boxes).
|
||||
We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
|
||||
we don't wish to calculate this twice.
|
||||
We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
|
||||
calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
|
||||
"""
|
||||
# Load raw data.
|
||||
raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
|
||||
raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
|
||||
raw_data = {**raw_tracker_data, **raw_gt_data} # Merges dictionaries
|
||||
|
||||
# Calculate similarities for each timestep.
|
||||
similarity_scores = []
|
||||
for t, (gt_dets_t, tracker_dets_t) in enumerate(
|
||||
zip(raw_data["gt_dets"], raw_data["tracker_dets"])
|
||||
):
|
||||
ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
|
||||
similarity_scores.append(ious)
|
||||
raw_data["similarity_scores"] = similarity_scores
|
||||
return raw_data
|
||||
|
||||
@staticmethod
|
||||
def _load_simple_text_file(
|
||||
file,
|
||||
time_col=0,
|
||||
id_col=None,
|
||||
remove_negative_ids=False,
|
||||
valid_filter=None,
|
||||
crowd_ignore_filter=None,
|
||||
convert_filter=None,
|
||||
is_zipped=False,
|
||||
zip_file=None,
|
||||
force_delimiters=None,
|
||||
):
|
||||
"""Function that loads data which is in a commonly used text file format.
|
||||
Assumes each det is given by one row of a text file.
|
||||
There is no limit to the number or meaning of each column,
|
||||
however one column needs to give the timestep of each det (time_col) which is default col 0.
|
||||
|
||||
The file dialect (deliminator, num cols, etc) is determined automatically.
|
||||
This function automatically separates dets by timestep,
|
||||
and is much faster than alternatives such as np.loadtext or pandas.
|
||||
|
||||
If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
|
||||
These are not excluded from ignore data.
|
||||
|
||||
valid_filter can be used to only include certain classes.
|
||||
It is a dict with ints as keys, and lists as values,
|
||||
such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
|
||||
If None, all classes are included.
|
||||
|
||||
crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
|
||||
|
||||
convert_filter can be used to convert value read to another format.
|
||||
This is used most commonly to convert classes given as string to a class id.
|
||||
This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
|
||||
|
||||
Optionally, input files could be a zip of multiple text files for storage efficiency.
|
||||
|
||||
Returns read_data and ignore_data.
|
||||
Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
|
||||
Note that all data is returned as strings, and must be converted to float/int later if needed.
|
||||
Note that timesteps will not be present in the returned dict keys if there are no dets for them
|
||||
"""
|
||||
|
||||
if remove_negative_ids and id_col is None:
|
||||
raise TrackEvalException(
|
||||
"remove_negative_ids is True, but id_col is not given."
|
||||
)
|
||||
if crowd_ignore_filter is None:
|
||||
crowd_ignore_filter = {}
|
||||
if convert_filter is None:
|
||||
convert_filter = {}
|
||||
try:
|
||||
if is_zipped: # Either open file directly or within a zip.
|
||||
if zip_file is None:
|
||||
raise TrackEvalException(
|
||||
"is_zipped set to True, but no zip_file is given."
|
||||
)
|
||||
archive = zipfile.ZipFile(os.path.join(zip_file), "r")
|
||||
fp = io.TextIOWrapper(archive.open(file, "r"))
|
||||
else:
|
||||
fp = open(file)
|
||||
read_data = {}
|
||||
crowd_ignore_data = {}
|
||||
fp.seek(0, os.SEEK_END)
|
||||
# check if file is empty
|
||||
if fp.tell():
|
||||
fp.seek(0)
|
||||
dialect = csv.Sniffer().sniff(
|
||||
fp.readline(), delimiters=force_delimiters
|
||||
) # Auto determine structure.
|
||||
dialect.skipinitialspace = (
|
||||
True # Deal with extra spaces between columns
|
||||
)
|
||||
fp.seek(0)
|
||||
reader = csv.reader(fp, dialect)
|
||||
for row in reader:
|
||||
try:
|
||||
# Deal with extra trailing spaces at the end of rows
|
||||
if row[-1] in "":
|
||||
row = row[:-1]
|
||||
timestep = str(int(float(row[time_col])))
|
||||
# Read ignore regions separately.
|
||||
is_ignored = False
|
||||
for ignore_key, ignore_value in crowd_ignore_filter.items():
|
||||
if row[ignore_key].lower() in ignore_value:
|
||||
# Convert values in one column (e.g. string to id)
|
||||
for (
|
||||
convert_key,
|
||||
convert_value,
|
||||
) in convert_filter.items():
|
||||
row[convert_key] = convert_value[
|
||||
row[convert_key].lower()
|
||||
]
|
||||
# Save data separated by timestep.
|
||||
if timestep in crowd_ignore_data.keys():
|
||||
crowd_ignore_data[timestep].append(row)
|
||||
else:
|
||||
crowd_ignore_data[timestep] = [row]
|
||||
is_ignored = True
|
||||
if (
|
||||
is_ignored
|
||||
): # if det is an ignore region, it cannot be a normal det.
|
||||
continue
|
||||
# Exclude some dets if not valid.
|
||||
if valid_filter is not None:
|
||||
for key, value in valid_filter.items():
|
||||
if row[key].lower() not in value:
|
||||
continue
|
||||
if remove_negative_ids:
|
||||
if int(float(row[id_col])) < 0:
|
||||
continue
|
||||
# Convert values in one column (e.g. string to id)
|
||||
for convert_key, convert_value in convert_filter.items():
|
||||
row[convert_key] = convert_value[row[convert_key].lower()]
|
||||
# Save data separated by timestep.
|
||||
if timestep in read_data.keys():
|
||||
read_data[timestep].append(row)
|
||||
else:
|
||||
read_data[timestep] = [row]
|
||||
except Exception:
|
||||
exc_str_init = (
|
||||
"In file %s the following line cannot be read correctly: \n"
|
||||
% os.path.basename(file)
|
||||
)
|
||||
exc_str = " ".join([exc_str_init] + row)
|
||||
raise TrackEvalException(exc_str)
|
||||
fp.close()
|
||||
except Exception:
|
||||
print("Error loading file: %s, printing traceback." % file)
|
||||
traceback.print_exc()
|
||||
raise TrackEvalException(
|
||||
"File %s cannot be read because it is either not present or invalidly formatted"
|
||||
% os.path.basename(file)
|
||||
)
|
||||
return read_data, crowd_ignore_data
|
||||
|
||||
@staticmethod
|
||||
def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
|
||||
"""Calculates the IOU (intersection over union) between two arrays of segmentation masks.
|
||||
If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
|
||||
arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
|
||||
If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly
|
||||
used to determine if detections are within crowd ignore region.
|
||||
:param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
|
||||
else pycocotools rle encoded format)
|
||||
:param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
|
||||
else pycocotools rle encoded format)
|
||||
:param is_encoded: whether the input is in pycocotools rle encoded format
|
||||
:param do_ioa: whether to perform IoA computation
|
||||
:return: the IoU/IoA scores
|
||||
"""
|
||||
|
||||
# Only loaded when run to reduce minimum requirements
|
||||
from pycocotools import mask as mask_utils
|
||||
|
||||
# use pycocotools for run length encoding of masks
|
||||
if not is_encoded:
|
||||
masks1 = mask_utils.encode(
|
||||
np.array(np.transpose(masks1, (1, 2, 0)), order="F")
|
||||
)
|
||||
masks2 = mask_utils.encode(
|
||||
np.array(np.transpose(masks2, (1, 2, 0)), order="F")
|
||||
)
|
||||
|
||||
# use pycocotools for iou computation of rle encoded masks
|
||||
ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
|
||||
if len(masks1) == 0 or len(masks2) == 0:
|
||||
ious = np.asarray(ious).reshape(len(masks1), len(masks2))
|
||||
assert (ious >= 0 - np.finfo("float").eps).all()
|
||||
assert (ious <= 1 + np.finfo("float").eps).all()
|
||||
|
||||
return ious
|
||||
|
||||
@staticmethod
|
||||
def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
|
||||
"""Calculates the IOU (intersection over union) between two arrays of boxes.
|
||||
Allows variable box formats ('xywh' and 'x0y0x1y1').
|
||||
If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
|
||||
used to determine if detections are within crowd ignore region.
|
||||
"""
|
||||
if box_format in "xywh":
|
||||
# layout: (x0, y0, w, h)
|
||||
bboxes1 = deepcopy(bboxes1)
|
||||
bboxes2 = deepcopy(bboxes2)
|
||||
|
||||
bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
|
||||
bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
|
||||
bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
|
||||
bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
|
||||
elif box_format not in "x0y0x1y1":
|
||||
raise (TrackEvalException("box_format %s is not implemented" % box_format))
|
||||
|
||||
# layout: (x0, y0, x1, y1)
|
||||
min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
|
||||
max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
|
||||
intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
|
||||
min_[..., 3] - max_[..., 1], 0
|
||||
)
|
||||
area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
|
||||
bboxes1[..., 3] - bboxes1[..., 1]
|
||||
)
|
||||
|
||||
if do_ioa:
|
||||
ioas = np.zeros_like(intersection)
|
||||
valid_mask = area1 > 0 + np.finfo("float").eps
|
||||
ioas[valid_mask, :] = (
|
||||
intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
|
||||
)
|
||||
|
||||
return ioas
|
||||
else:
|
||||
area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
|
||||
bboxes2[..., 3] - bboxes2[..., 1]
|
||||
)
|
||||
union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
|
||||
intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
|
||||
intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
|
||||
intersection[union <= 0 + np.finfo("float").eps] = 0
|
||||
union[union <= 0 + np.finfo("float").eps] = 1
|
||||
ious = intersection / union
|
||||
return ious
|
||||
|
||||
@staticmethod
|
||||
def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
|
||||
"""Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
|
||||
measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
|
||||
The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
|
||||
threshold corresponds to a 1m distance threshold for TPs.
|
||||
"""
|
||||
dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
|
||||
sim = np.maximum(0, 1 - dist / zero_distance)
|
||||
return sim
|
||||
|
||||
@staticmethod
|
||||
def _check_unique_ids(data, after_preproc=False):
|
||||
"""Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
|
||||
gt_ids = data["gt_ids"]
|
||||
tracker_ids = data["tracker_ids"]
|
||||
for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
|
||||
if len(tracker_ids_t) > 0:
|
||||
unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
|
||||
if np.max(counts) != 1:
|
||||
duplicate_ids = unique_ids[counts > 1]
|
||||
exc_str_init = (
|
||||
"Tracker predicts the same ID more than once in a single timestep "
|
||||
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
|
||||
)
|
||||
exc_str = (
|
||||
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
|
||||
)
|
||||
if after_preproc:
|
||||
exc_str_init += (
|
||||
"\n Note that this error occurred after preprocessing (but not before), "
|
||||
"so ids may not be as in file, and something seems wrong with preproc."
|
||||
)
|
||||
raise TrackEvalException(exc_str)
|
||||
if len(gt_ids_t) > 0:
|
||||
unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
|
||||
if np.max(counts) != 1:
|
||||
duplicate_ids = unique_ids[counts > 1]
|
||||
exc_str_init = (
|
||||
"Ground-truth has the same ID more than once in a single timestep "
|
||||
"(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
|
||||
)
|
||||
exc_str = (
|
||||
" ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
|
||||
)
|
||||
if after_preproc:
|
||||
exc_str_init += (
|
||||
"\n Note that this error occurred after preprocessing (but not before), "
|
||||
"so ids may not be as in file, and something seems wrong with preproc."
|
||||
)
|
||||
raise TrackEvalException(exc_str)
|
||||
891
sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py
Normal file
891
sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py
Normal file
@@ -0,0 +1,891 @@
|
||||
# flake8: noqa
|
||||
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from .. import _timing, utils
|
||||
from ..utils import TrackEvalException
|
||||
from ._base_dataset import _BaseDataset
|
||||
|
||||
|
||||
class TAO_OW(_BaseDataset):
    """Dataset class for TAO tracking (open-world, class-agnostic variant)."""

    @staticmethod
    def get_default_dataset_config():
        """Default class config values"""
        code_path = utils.get_code_path()
        # NOTE: key order is preserved as-is; utils.init_config merges user
        # config over these defaults.
        default_config = {
            "GT_FOLDER": os.path.join(
                code_path, "data/gt/tao/tao_training"
            ),  # Location of GT data
            "TRACKERS_FOLDER": os.path.join(
                code_path, "data/trackers/tao/tao_training"
            ),  # Trackers location
            "OUTPUT_FOLDER": None,  # Where to save eval results (if None, same as TRACKERS_FOLDER)
            "TRACKERS_TO_EVAL": None,  # Filenames of trackers to eval (if None, all in folder)
            "CLASSES_TO_EVAL": None,  # Classes to eval (if None, all classes)
            "SPLIT_TO_EVAL": "training",  # Valid: 'training', 'val'
            "PRINT_CONFIG": True,  # Whether to print current config
            "TRACKER_SUB_FOLDER": "data",  # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
            "OUTPUT_SUB_FOLDER": "",  # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
            "TRACKER_DISPLAY_NAMES": None,  # Names of trackers to display, if None: TRACKERS_TO_EVAL
            "MAX_DETECTIONS": 300,  # Number of maximal allowed detections per image (0 for unlimited)
            "SUBSET": "all",  # 'all', or a GT subset: `known`, `unknown`, `distractor`
        }
        return default_config
|
||||
|
||||
    def __init__(self, config=None):
        """Initialise dataset, checking that all required files are present"""
        super().__init__()
        # Fill non-given config values with defaults
        self.config = utils.init_config(
            config, self.get_default_dataset_config(), self.get_name()
        )
        self.gt_fol = self.config["GT_FOLDER"]
        self.tracker_fol = self.config["TRACKERS_FOLDER"]
        self.should_classes_combine = True
        self.use_super_categories = False

        self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
        self.output_fol = self.config["OUTPUT_FOLDER"]
        if self.output_fol is None:
            self.output_fol = self.tracker_fol
        self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]

        # GT folder must contain exactly one json file (the TAO annotations).
        gt_dir_files = [
            file for file in os.listdir(self.gt_fol) if file.endswith(".json")
        ]
        if len(gt_dir_files) != 1:
            raise TrackEvalException(
                self.gt_fol + " does not contain exactly one json file."
            )

        with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
            self.gt_data = json.load(f)

        self.subset = self.config["SUBSET"]
        if self.subset != "all":
            # Split GT data into `known`, `unknown` or `distractor`
            self._split_known_unknown_distractor()
            self.gt_data = self._filter_gt_data(self.gt_data)

        # merge categories marked with a merged tag in TAO dataset
        self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])

        # Get sequences to eval and sequence information.
        # Sequence names use '-' instead of '/' so they are filesystem-safe.
        self.seq_list = [
            vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
        ]
        self.seq_name_to_seq_id = {
            vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
        }
        # compute mappings from videos to annotation data
        self.videos_to_gt_tracks, self.videos_to_gt_images = self._compute_vid_mappings(
            self.gt_data["annotations"]
        )
        # compute sequence lengths (number of images per video)
        self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
        for img in self.gt_data["images"]:
            self.seq_lengths[img["video_id"]] += 1
        self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings()
        # Per-video class info: positive (present in GT tracks), negative, and
        # not-exhaustively-labeled category ids.
        self.seq_to_classes = {
            vid["id"]: {
                "pos_cat_ids": list(
                    {
                        track["category_id"]
                        for track in self.videos_to_gt_tracks[vid["id"]]
                    }
                ),
                "neg_cat_ids": vid["neg_category_ids"],
                "not_exhaustively_labeled_cat_ids": vid["not_exhaustive_category_ids"],
            }
            for vid in self.gt_data["videos"]
        }

        # Get classes to eval
        considered_vid_ids = [self.seq_name_to_seq_id[vid] for vid in self.seq_list]
        seen_cats = set(
            [
                cat_id
                for vid_id in considered_vid_ids
                for cat_id in self.seq_to_classes[vid_id]["pos_cat_ids"]
            ]
        )
        # only classes with ground truth are evaluated in TAO
        self.valid_classes = [
            cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
        ]
        # cls_name_to_cls_id_map = {cls['name']: cls['id'] for cls in self.gt_data['categories']}

        # Open-world evaluation is class-agnostic: regardless of config, the
        # single pseudo-class 'object' (id 1) is evaluated.
        if self.config["CLASSES_TO_EVAL"]:
            # self.class_list = [cls.lower() if cls.lower() in self.valid_classes else None
            #                    for cls in self.config['CLASSES_TO_EVAL']]
            self.class_list = ["object"]  # class-agnostic
            if not all(self.class_list):
                raise TrackEvalException(
                    "Attempted to evaluate an invalid class. Only classes "
                    + ", ".join(self.valid_classes)
                    + " are valid (classes present in ground truth data)."
                )
        else:
            # self.class_list = [cls for cls in self.valid_classes]
            self.class_list = ["object"]  # class-agnostic
        # self.class_name_to_class_id = {k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list}
        self.class_name_to_class_id = {"object": 1}  # class-agnostic

        # Get trackers to eval
        if self.config["TRACKERS_TO_EVAL"] is None:
            self.tracker_list = os.listdir(self.tracker_fol)
        else:
            self.tracker_list = self.config["TRACKERS_TO_EVAL"]

        if self.config["TRACKER_DISPLAY_NAMES"] is None:
            self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
        elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
            len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
        ):
            self.tracker_to_disp = dict(
                zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
            )
        else:
            raise TrackEvalException(
                "List of tracker files and tracker display names do not match."
            )

        self.tracker_data = {tracker: dict() for tracker in self.tracker_list}

        # Load, limit, normalise and index each tracker's result file.
        for tracker in self.tracker_list:
            tr_dir_files = [
                file
                for file in os.listdir(
                    os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
                )
                if file.endswith(".json")
            ]
            if len(tr_dir_files) != 1:
                raise TrackEvalException(
                    os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
                    + " does not contain exactly one json file."
                )
            with open(
                os.path.join(
                    self.tracker_fol, tracker, self.tracker_sub_fol, tr_dir_files[0]
                )
            ) as f:
                curr_data = json.load(f)

            # limit detections if MAX_DETECTIONS > 0
            if self.config["MAX_DETECTIONS"]:
                curr_data = self._limit_dets_per_image(curr_data)

            # fill missing video ids
            self._fill_video_ids_inplace(curr_data)

            # make track ids unique over whole evaluation set
            self._make_track_ids_unique(curr_data)

            # merge categories marked with a merged tag in TAO dataset
            self._merge_categories(curr_data)

            # get tracker sequence information
            curr_videos_to_tracker_tracks, curr_videos_to_tracker_images = (
                self._compute_vid_mappings(curr_data)
            )
            self.tracker_data[tracker]["vids_to_tracks"] = curr_videos_to_tracker_tracks
            self.tracker_data[tracker]["vids_to_images"] = curr_videos_to_tracker_images
|
||||
|
||||
def get_display_name(self, tracker):
|
||||
return self.tracker_to_disp[tracker]
|
||||
|
||||
    def _load_raw_file(self, tracker, seq, is_gt):
        """Load a file (gt or tracker) in the TAO format

        If is_gt, this returns a dict which contains the fields:
        [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets]: list (for each timestep) of lists of detections.
        [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
        keys and corresponding segmentations as values) for each track
        [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values
        as keys and lists (for each track) as values

        if not is_gt, this returns a dict which contains the fields:
        [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
        [tracker_dets]: list (for each timestep) of lists of detections.
        [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
        keys and corresponding segmentations as values) for each track
        [classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values
        as keys and lists as values
        [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
        """
        seq_id = self.seq_name_to_seq_id[seq]
        # File location: per-video image lists precomputed in __init__.
        if is_gt:
            imgs = self.videos_to_gt_images[seq_id]
        else:
            imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]

        # Convert data to required format: one slot per timestep, filled below.
        num_timesteps = self.seq_lengths[seq_id]
        img_to_timestep = self.seq_to_images_to_timestep[seq_id]
        data_keys = ["ids", "classes", "dets"]
        if not is_gt:
            data_keys += ["tracker_confidences"]
        raw_data = {key: [None] * num_timesteps for key in data_keys}
        for img in imgs:
            # some tracker data contains images without any ground truth information, these are ignored
            try:
                t = img_to_timestep[img["id"]]
            except KeyError:
                continue
            annotations = img["annotations"]
            raw_data["dets"][t] = np.atleast_2d(
                [ann["bbox"] for ann in annotations]
            ).astype(float)
            raw_data["ids"][t] = np.atleast_1d(
                [ann["track_id"] for ann in annotations]
            ).astype(int)
            # All dets get pseudo-class 1 ('object') — class-agnostic eval.
            raw_data["classes"][t] = np.atleast_1d([1 for _ in annotations]).astype(
                int
            )  # class-agnostic
            if not is_gt:
                raw_data["tracker_confidences"][t] = np.atleast_1d(
                    [ann["score"] for ann in annotations]
                ).astype(float)

        # Timesteps with no dets get empty arrays of the right shape/dtype.
        for t, d in enumerate(raw_data["dets"]):
            if d is None:
                raw_data["dets"][t] = np.empty((0, 4)).astype(float)
                raw_data["ids"][t] = np.empty(0).astype(int)
                raw_data["classes"][t] = np.empty(0).astype(int)
                if not is_gt:
                    raw_data["tracker_confidences"][t] = np.empty(0)

        # Rename generic keys to gt_*/tracker_* variants expected downstream.
        if is_gt:
            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
        else:
            key_map = {
                "ids": "tracker_ids",
                "classes": "tracker_classes",
                "dets": "tracker_dets",
            }
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)

        # all_classes = [self.class_name_to_class_id[cls] for cls in self.class_list]
        all_classes = [1]  # class-agnostic

        if is_gt:
            classes_to_consider = all_classes
            all_tracks = self.videos_to_gt_tracks[seq_id]
        else:
            # classes_to_consider = self.seq_to_classes[seq_id]['pos_cat_ids'] \
            #     + self.seq_to_classes[seq_id]['neg_cat_ids']
            classes_to_consider = all_classes  # class-agnostic
            all_tracks = self.tracker_data[tracker]["vids_to_tracks"][seq_id]

        # classes_to_tracks = {cls: [track for track in all_tracks if track['category_id'] == cls]
        #                      if cls in classes_to_consider else [] for cls in all_classes}
        classes_to_tracks = {
            cls: [track for track in all_tracks] if cls in classes_to_consider else []
            for cls in all_classes
        }  # class-agnostic

        # mapping from classes to track information
        raw_data["classes_to_tracks"] = {
            cls: [
                {
                    det["image_id"]: np.atleast_1d(det["bbox"])
                    for det in track["annotations"]
                }
                for track in tracks
            ]
            for cls, tracks in classes_to_tracks.items()
        }
        raw_data["classes_to_track_ids"] = {
            cls: [track["id"] for track in tracks]
            for cls, tracks in classes_to_tracks.items()
        }
        raw_data["classes_to_track_areas"] = {
            cls: [track["area"] for track in tracks]
            for cls, tracks in classes_to_tracks.items()
        }
        raw_data["classes_to_track_lengths"] = {
            cls: [len(track["annotations"]) for track in tracks]
            for cls, tracks in classes_to_tracks.items()
        }

        if not is_gt:
            # A track's score is the mean of its per-det scores.
            raw_data["classes_to_dt_track_scores"] = {
                cls: np.array(
                    [
                        np.mean([float(x["score"]) for x in track["annotations"]])
                        for track in tracks
                    ]
                )
                for cls, tracks in classes_to_tracks.items()
            }

        # Rename the class-to-track keys to their gt_/dt_ variants.
        if is_gt:
            key_map = {
                "classes_to_tracks": "classes_to_gt_tracks",
                "classes_to_track_ids": "classes_to_gt_track_ids",
                "classes_to_track_lengths": "classes_to_gt_track_lengths",
                "classes_to_track_areas": "classes_to_gt_track_areas",
            }
        else:
            key_map = {
                "classes_to_tracks": "classes_to_dt_tracks",
                "classes_to_track_ids": "classes_to_dt_track_ids",
                "classes_to_track_lengths": "classes_to_dt_track_lengths",
                "classes_to_track_areas": "classes_to_dt_track_areas",
            }
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)

        raw_data["num_timesteps"] = num_timesteps
        raw_data["neg_cat_ids"] = self.seq_to_classes[seq_id]["neg_cat_ids"]
        raw_data["not_exhaustively_labeled_cls"] = self.seq_to_classes[seq_id][
            "not_exhaustively_labeled_cat_ids"
        ]
        raw_data["seq"] = seq
        return raw_data
|
||||
|
||||
@_timing.time
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess data for a single sequence for a single class ready for evaluation.
    Inputs:
        - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
        - cls is the class to be evaluated.
    Outputs:
        - data is a dict containing all of the information that metrics need to perform evaluation.
            It contains the following fields:
                [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
                [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                [similarity_scores]: list (for each timestep) of 2D NDArrays.
    Notes:
        General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
            1) Extract only detections relevant for the class to be evaluated (including distractor detections).
            2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
                distractor class, or otherwise marked as to be removed.
            3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
                other criteria (e.g. are too small).
            4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
        After the above preprocessing steps, this function also calculates the number of gt and tracker detections
            and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
            unique within each timestep.
    TAO:
        In TAO, the 4 preproc steps are as follow:
            1) All classes present in the ground truth data are evaluated separately.
            2) No matched tracker detections are removed.
            3) Unmatched tracker detections are removed if there is not ground truth data and the class does not
                belong to the categories marked as negative for this sequence. Additionally, unmatched tracker
                detections for classes which are marked as not exhaustively labeled are removed.
            4) No gt detections are removed.
        Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
        and the tracks from the tracker data are sorted according to the tracker confidence.
    """
    cls_id = self.class_name_to_class_id[cls]
    is_not_exhaustively_labeled = cls_id in raw_data["not_exhaustively_labeled_cls"]
    is_neg_category = cls_id in raw_data["neg_cat_ids"]

    data_keys = [
        "gt_ids",
        "tracker_ids",
        "gt_dets",
        "tracker_dets",
        "tracker_confidences",
        "similarity_scores",
    ]
    data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
    unique_gt_ids = []
    unique_tracker_ids = []
    num_gt_dets = 0
    num_tracker_dets = 0
    for t in range(raw_data["num_timesteps"]):
        # Only extract relevant dets for this class for preproc and eval (cls)
        gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
        gt_class_mask = gt_class_mask.astype(bool)
        gt_ids = raw_data["gt_ids"][t][gt_class_mask]
        gt_dets = raw_data["gt_dets"][t][gt_class_mask]

        tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
        tracker_class_mask = tracker_class_mask.astype(bool)
        tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
        tracker_dets = raw_data["tracker_dets"][t][tracker_class_mask]
        tracker_confidences = raw_data["tracker_confidences"][t][tracker_class_mask]
        # restrict the GT x tracker similarity matrix to this class on both axes
        similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
            :, tracker_class_mask
        ]

        # Match tracker and gt dets (with hungarian algorithm).
        unmatched_indices = np.arange(tracker_ids.shape[0])
        if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0:
            matching_scores = similarity_scores.copy()
            # zero out pairs below the 0.5 IoU threshold so they cannot be matched
            matching_scores[matching_scores < 0.5 - np.finfo("float").eps] = 0
            match_rows, match_cols = linear_sum_assignment(-matching_scores)
            # keep only assignments with a strictly positive (i.e. above-threshold) score
            actually_matched_mask = (
                matching_scores[match_rows, match_cols] > 0 + np.finfo("float").eps
            )
            match_cols = match_cols[actually_matched_mask]
            unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0)

        # TAO preproc step 3: drop unmatched tracker dets when there is no GT for this
        # class (unless it is a negative category) or the class is not exhaustively labeled
        if gt_ids.shape[0] == 0 and not is_neg_category:
            to_remove_tracker = unmatched_indices
        elif is_not_exhaustively_labeled:
            to_remove_tracker = unmatched_indices
        else:
            to_remove_tracker = np.array([], dtype=int)

        # remove all unwanted unmatched tracker detections
        data["tracker_ids"][t] = np.delete(tracker_ids, to_remove_tracker, axis=0)
        data["tracker_dets"][t] = np.delete(tracker_dets, to_remove_tracker, axis=0)
        data["tracker_confidences"][t] = np.delete(
            tracker_confidences, to_remove_tracker, axis=0
        )
        similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1)

        data["gt_ids"][t] = gt_ids
        data["gt_dets"][t] = gt_dets
        data["similarity_scores"][t] = similarity_scores

        unique_gt_ids += list(np.unique(data["gt_ids"][t]))
        unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
        num_tracker_dets += len(data["tracker_ids"][t])
        num_gt_dets += len(data["gt_ids"][t])

    # Re-label IDs such that there are no empty IDs
    if len(unique_gt_ids) > 0:
        unique_gt_ids = np.unique(unique_gt_ids)
        # sparse old-id -> contiguous new-id lookup; NaN marks unused slots
        gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
        gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["gt_ids"][t]) > 0:
                data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
    if len(unique_tracker_ids) > 0:
        unique_tracker_ids = np.unique(unique_tracker_ids)
        tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
        tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["tracker_ids"][t]) > 0:
                data["tracker_ids"][t] = tracker_id_map[
                    data["tracker_ids"][t]
                ].astype(int)

    # Record overview statistics.
    data["num_tracker_dets"] = num_tracker_dets
    data["num_gt_dets"] = num_gt_dets
    data["num_tracker_ids"] = len(unique_tracker_ids)
    data["num_gt_ids"] = len(unique_gt_ids)
    data["num_timesteps"] = raw_data["num_timesteps"]
    data["seq"] = raw_data["seq"]

    # get track representations (precomputed per class by _load_raw_file)
    data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
    data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
    data["gt_track_lengths"] = raw_data["classes_to_gt_track_lengths"][cls_id]
    data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
    data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
    data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
    data["dt_track_lengths"] = raw_data["classes_to_dt_track_lengths"][cls_id]
    data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
    data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
    data["not_exhaustively_labeled"] = is_not_exhaustively_labeled
    data["iou_type"] = "bbox"

    # sort tracker data tracks by tracker confidence scores
    # (mergesort keeps the original order stable among equal scores)
    if data["dt_tracks"]:
        idx = np.argsort(
            [-score for score in data["dt_track_scores"]], kind="mergesort"
        )
        data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
        data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
        data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
        data["dt_track_lengths"] = [data["dt_track_lengths"][i] for i in idx]
        data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
    # Ensure that ids are unique per timestep.
    self._check_unique_ids(data)

    return data
|
||||
|
||||
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
    """Compute the GT-vs-tracker similarity matrix for one timestep as box IoU."""
    return self._calculate_box_ious(gt_dets_t, tracker_dets_t)
|
||||
|
||||
def _merge_categories(self, annotations):
|
||||
"""
|
||||
Merges categories with a merged tag. Adapted from https://github.com/TAO-Dataset
|
||||
:param annotations: the annotations in which the classes should be merged
|
||||
:return: None
|
||||
"""
|
||||
merge_map = {}
|
||||
for category in self.gt_data["categories"]:
|
||||
if "merged" in category:
|
||||
for to_merge in category["merged"]:
|
||||
merge_map[to_merge["id"]] = category["id"]
|
||||
|
||||
for ann in annotations:
|
||||
ann["category_id"] = merge_map.get(ann["category_id"], ann["category_id"])
|
||||
|
||||
def _compute_vid_mappings(self, annotations):
|
||||
"""
|
||||
Computes mappings from Videos to corresponding tracks and images.
|
||||
:param annotations: the annotations for which the mapping should be generated
|
||||
:return: the video-to-track-mapping, the video-to-image-mapping
|
||||
"""
|
||||
vids_to_tracks = {}
|
||||
vids_to_imgs = {}
|
||||
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
|
||||
|
||||
# compute an mapping from image IDs to images
|
||||
images = {}
|
||||
for image in self.gt_data["images"]:
|
||||
images[image["id"]] = image
|
||||
|
||||
for ann in annotations:
|
||||
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
|
||||
|
||||
vid = ann["video_id"]
|
||||
if ann["video_id"] not in vids_to_tracks.keys():
|
||||
vids_to_tracks[ann["video_id"]] = list()
|
||||
if ann["video_id"] not in vids_to_imgs.keys():
|
||||
vids_to_imgs[ann["video_id"]] = list()
|
||||
|
||||
# Fill in vids_to_tracks
|
||||
tid = ann["track_id"]
|
||||
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
|
||||
try:
|
||||
index1 = exist_tids.index(tid)
|
||||
except ValueError:
|
||||
index1 = -1
|
||||
if tid not in exist_tids:
|
||||
curr_track = {
|
||||
"id": tid,
|
||||
"category_id": ann["category_id"],
|
||||
"video_id": vid,
|
||||
"annotations": [ann],
|
||||
}
|
||||
vids_to_tracks[vid].append(curr_track)
|
||||
else:
|
||||
vids_to_tracks[vid][index1]["annotations"].append(ann)
|
||||
|
||||
# Fill in vids_to_imgs
|
||||
img_id = ann["image_id"]
|
||||
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
|
||||
try:
|
||||
index2 = exist_img_ids.index(img_id)
|
||||
except ValueError:
|
||||
index2 = -1
|
||||
if index2 == -1:
|
||||
curr_img = {"id": img_id, "annotations": [ann]}
|
||||
vids_to_imgs[vid].append(curr_img)
|
||||
else:
|
||||
vids_to_imgs[vid][index2]["annotations"].append(ann)
|
||||
|
||||
# sort annotations by frame index and compute track area
|
||||
for vid, tracks in vids_to_tracks.items():
|
||||
for track in tracks:
|
||||
track["annotations"] = sorted(
|
||||
track["annotations"],
|
||||
key=lambda x: images[x["image_id"]]["frame_index"],
|
||||
)
|
||||
# Computer average area
|
||||
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
|
||||
track["annotations"]
|
||||
)
|
||||
|
||||
# Ensure all videos are present
|
||||
for vid_id in vid_ids:
|
||||
if vid_id not in vids_to_tracks.keys():
|
||||
vids_to_tracks[vid_id] = []
|
||||
if vid_id not in vids_to_imgs.keys():
|
||||
vids_to_imgs[vid_id] = []
|
||||
|
||||
return vids_to_tracks, vids_to_imgs
|
||||
|
||||
def _compute_image_to_timestep_mappings(self):
|
||||
"""
|
||||
Computes a mapping from images to the corresponding timestep in the sequence.
|
||||
:return: the image-to-timestep-mapping
|
||||
"""
|
||||
images = {}
|
||||
for image in self.gt_data["images"]:
|
||||
images[image["id"]] = image
|
||||
|
||||
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
|
||||
for vid in seq_to_imgs_to_timestep:
|
||||
curr_imgs = [img["id"] for img in self.videos_to_gt_images[vid]]
|
||||
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_index"])
|
||||
seq_to_imgs_to_timestep[vid] = {
|
||||
curr_imgs[i]: i for i in range(len(curr_imgs))
|
||||
}
|
||||
|
||||
return seq_to_imgs_to_timestep
|
||||
|
||||
def _limit_dets_per_image(self, annotations):
|
||||
"""
|
||||
Limits the number of detections for each image to config['MAX_DETECTIONS']. Adapted from
|
||||
https://github.com/TAO-Dataset/
|
||||
:param annotations: the annotations in which the detections should be limited
|
||||
:return: the annotations with limited detections
|
||||
"""
|
||||
max_dets = self.config["MAX_DETECTIONS"]
|
||||
img_ann = defaultdict(list)
|
||||
for ann in annotations:
|
||||
img_ann[ann["image_id"]].append(ann)
|
||||
|
||||
for img_id, _anns in img_ann.items():
|
||||
if len(_anns) <= max_dets:
|
||||
continue
|
||||
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
|
||||
img_ann[img_id] = _anns[:max_dets]
|
||||
|
||||
return [ann for anns in img_ann.values() for ann in anns]
|
||||
|
||||
def _fill_video_ids_inplace(self, annotations):
|
||||
"""
|
||||
Fills in missing video IDs inplace. Adapted from https://github.com/TAO-Dataset/
|
||||
:param annotations: the annotations for which the videos IDs should be filled inplace
|
||||
:return: None
|
||||
"""
|
||||
missing_video_id = [x for x in annotations if "video_id" not in x]
|
||||
if missing_video_id:
|
||||
image_id_to_video_id = {
|
||||
x["id"]: x["video_id"] for x in self.gt_data["images"]
|
||||
}
|
||||
for x in missing_video_id:
|
||||
x["video_id"] = image_id_to_video_id[x["image_id"]]
|
||||
|
||||
@staticmethod
|
||||
def _make_track_ids_unique(annotations):
|
||||
"""
|
||||
Makes the track IDs unqiue over the whole annotation set. Adapted from https://github.com/TAO-Dataset/
|
||||
:param annotations: the annotation set
|
||||
:return: the number of updated IDs
|
||||
"""
|
||||
track_id_videos = {}
|
||||
track_ids_to_update = set()
|
||||
max_track_id = 0
|
||||
for ann in annotations:
|
||||
t = ann["track_id"]
|
||||
if t not in track_id_videos:
|
||||
track_id_videos[t] = ann["video_id"]
|
||||
|
||||
if ann["video_id"] != track_id_videos[t]:
|
||||
# Track id is assigned to multiple videos
|
||||
track_ids_to_update.add(t)
|
||||
max_track_id = max(max_track_id, t)
|
||||
|
||||
if track_ids_to_update:
|
||||
print("true")
|
||||
next_id = itertools.count(max_track_id + 1)
|
||||
new_track_ids = defaultdict(lambda: next(next_id))
|
||||
for ann in annotations:
|
||||
t = ann["track_id"]
|
||||
v = ann["video_id"]
|
||||
if t in track_ids_to_update:
|
||||
ann["track_id"] = new_track_ids[t, v]
|
||||
return len(track_ids_to_update)
|
||||
|
||||
def _split_known_unknown_distractor(self):
    """Partition all TAO-OW category IDs into known / distractor / unknown sets.

    Populates self.knowns, self.distractors and self.unknowns; returns None.
    """
    all_ids = set(
        [i for i in range(1, 2000)]
    )  # 2000 is larger than the max category id in TAO-OW.
    # `knowns` includes 78 TAO_category_ids that corresponds to 78 COCO classes.
    # (The other 2 COCO classes do not have corresponding classes in TAO).
    self.knowns = {
        4, 13, 1038, 544, 1057, 34, 35, 36, 41, 45, 58, 60, 579, 1091, 1097,
        1099, 78, 79, 81, 91, 1115, 1117, 95, 1122, 99, 1132, 621, 1135, 625,
        118, 1144, 126, 642, 1155, 133, 1162, 139, 154, 174, 185, 699, 1215,
        714, 717, 1229, 211, 729, 221, 229, 747, 235, 237, 779, 276, 805, 299,
        829, 852, 347, 371, 382, 896, 392, 926, 937, 428, 429, 961, 452, 979,
        980, 982, 475, 480, 993, 1001, 502, 1018,
    }
    # `distractors` is defined as in the paper "Opening up Open-World Tracking"
    self.distractors = {
        20, 63, 108, 180, 188, 204, 212, 247, 303, 403, 407, 415, 490, 504,
        507, 513, 529, 567, 569, 588, 672, 691, 702, 708, 711, 720, 736, 737,
        798, 813, 815, 827, 831, 851, 877, 883, 912, 971, 976, 1130, 1133,
        1134, 1169, 1184, 1220,
    }
    # every remaining id (neither known nor distractor) is treated as unknown
    self.unknowns = all_ids.difference(self.knowns.union(self.distractors))
|
||||
|
||||
def _filter_gt_data(self, raw_gt_data):
|
||||
"""
|
||||
Filter out irrelevant data in the raw_gt_data
|
||||
Args:
|
||||
raw_gt_data: directly loaded from json.
|
||||
|
||||
Returns:
|
||||
filtered gt_data
|
||||
"""
|
||||
valid_cat_ids = list()
|
||||
if self.subset == "known":
|
||||
valid_cat_ids = self.knowns
|
||||
elif self.subset == "distractor":
|
||||
valid_cat_ids = self.distractors
|
||||
elif self.subset == "unknown":
|
||||
valid_cat_ids = self.unknowns
|
||||
# elif self.subset == "test_only_unknowns":
|
||||
# valid_cat_ids = test_only_unknowns
|
||||
else:
|
||||
raise Exception("The parameter `SUBSET` is incorrect")
|
||||
|
||||
filtered = dict()
|
||||
filtered["videos"] = raw_gt_data["videos"]
|
||||
# filtered["videos"] = list()
|
||||
unwanted_vid = set()
|
||||
# for video in raw_gt_data["videos"]:
|
||||
# datasrc = video["name"].split('/')[1]
|
||||
# if datasrc in data_srcs:
|
||||
# filtered["videos"].append(video)
|
||||
# else:
|
||||
# unwanted_vid.add(video["id"])
|
||||
|
||||
filtered["annotations"] = list()
|
||||
for ann in raw_gt_data["annotations"]:
|
||||
if (ann["video_id"] not in unwanted_vid) and (
|
||||
ann["category_id"] in valid_cat_ids
|
||||
):
|
||||
filtered["annotations"].append(ann)
|
||||
|
||||
filtered["tracks"] = list()
|
||||
for track in raw_gt_data["tracks"]:
|
||||
if (track["video_id"] not in unwanted_vid) and (
|
||||
track["category_id"] in valid_cat_ids
|
||||
):
|
||||
filtered["tracks"].append(track)
|
||||
|
||||
filtered["images"] = list()
|
||||
for image in raw_gt_data["images"]:
|
||||
if image["video_id"] not in unwanted_vid:
|
||||
filtered["images"].append(image)
|
||||
|
||||
filtered["categories"] = list()
|
||||
for cat in raw_gt_data["categories"]:
|
||||
if cat["id"] in valid_cat_ids:
|
||||
filtered["categories"].append(cat)
|
||||
|
||||
filtered["info"] = raw_gt_data["info"]
|
||||
filtered["licenses"] = raw_gt_data["licenses"]
|
||||
|
||||
return filtered
|
||||
524
sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py
Normal file
524
sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py
Normal file
@@ -0,0 +1,524 @@
|
||||
# flake8: noqa
|
||||
|
||||
# note: this file has been modified from its original version in TrackEval in
|
||||
# https://github.com/JonathonLuiten/TrackEval/blob/master/trackeval/datasets/youtube_vis.py
|
||||
# to support the following:
|
||||
# 1) bbox evaluation (via `IOU_TYPE`)
|
||||
# 2) passing GT and prediction data as Python objects (via `GT_JSON_OBJECT` and `TRACKER_JSON_OBJECT`)
|
||||
# 3) specifying a custom dataset name (via `DATASET_NAME`)
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing, utils
|
||||
from ..utils import TrackEvalException
|
||||
from ._base_dataset import _BaseDataset
|
||||
|
||||
|
||||
class YouTubeVIS(_BaseDataset):
|
||||
"""Dataset class for YouTubeVIS tracking"""
|
||||
|
||||
@staticmethod
def get_default_dataset_config():
    """Return the default configuration dict for this dataset class."""
    code_path = utils.get_code_path()
    return {
        # Location of GT data
        "GT_FOLDER": os.path.join(code_path, "data/gt/youtube_vis/"),
        # Trackers location
        "TRACKERS_FOLDER": os.path.join(code_path, "data/trackers/youtube_vis/"),
        # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "OUTPUT_FOLDER": None,
        # Filenames of trackers to eval (if None, all in folder)
        "TRACKERS_TO_EVAL": None,
        # Classes to eval (if None, all classes)
        "CLASSES_TO_EVAL": None,
        # Valid: 'train', 'val', 'train_sub_split'
        "SPLIT_TO_EVAL": "train_sub_split",
        # Whether to print current config
        "PRINT_CONFIG": True,
        # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",
        # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "TRACKER_SUB_FOLDER": "data",
        # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "TRACKER_DISPLAY_NAMES": None,
        # Added for video phrase AP evaluation -- allow directly specifying the
        # GT JSON data and Tracker (result) JSON data as Python objects,
        # without reading from files.
        "GT_JSON_OBJECT": None,
        "TRACKER_JSON_OBJECT": None,
        "IOU_TYPE": "segm",
        "DATASET_NAME": "video",
    }
|
||||
|
||||
def __init__(self, config=None):
    """Initialise dataset, checking that all required files are present.

    :param config: optional dict of config overrides; missing keys are filled
        from get_default_dataset_config().
    :raises TrackEvalException: if GT/tracker folders or their single JSON file
        are missing, or tracker display names do not match the tracker list.
    """
    super().__init__()
    # Fill non-given config values with defaults
    self.config = utils.init_config(config, self.get_default_dataset_config())
    self.gt_fol = (
        self.config["GT_FOLDER"] + "youtube_vis_" + self.config["SPLIT_TO_EVAL"]
    )
    self.tracker_fol = (
        self.config["TRACKERS_FOLDER"]
        + "youtube_vis_"
        + self.config["SPLIT_TO_EVAL"]
    )
    self.use_super_categories = False
    self.should_classes_combine = True
    # either mask-based ("segm") or box-based ("bbox") evaluation
    assert self.config["IOU_TYPE"] in ["segm", "bbox"]
    self.iou_type = self.config["IOU_TYPE"]
    print("=" * 100)
    print(f"Evaluate annotation type *{self.iou_type}*")
    self.dataset_name = self.config["DATASET_NAME"]

    self.output_fol = self.config["OUTPUT_FOLDER"]
    if self.output_fol is None:
        self.output_fol = self.tracker_fol
    self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
    self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]

    if self.config["GT_JSON_OBJECT"] is not None:
        # allow directly specifying the GT JSON data without reading from files
        gt_json = self.config["GT_JSON_OBJECT"]
        assert isinstance(gt_json, dict)
        assert "videos" in gt_json
        assert "categories" in gt_json
        assert "annotations" in gt_json
        self.gt_data = gt_json
    else:
        # load GT from the single JSON file expected in the GT folder
        if not os.path.exists(self.gt_fol):
            print("GT folder not found: " + self.gt_fol)
            raise TrackEvalException(
                "GT folder not found: " + os.path.basename(self.gt_fol)
            )
        gt_dir_files = [
            file for file in os.listdir(self.gt_fol) if file.endswith(".json")
        ]
        if len(gt_dir_files) != 1:
            raise TrackEvalException(
                self.gt_fol + " does not contain exactly one json file."
            )

        with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
            self.gt_data = json.load(f)

    # Get classes to eval
    self.valid_classes = [cls["name"] for cls in self.gt_data["categories"]]
    cls_name_to_cls_id_map = {
        cls["name"]: cls["id"] for cls in self.gt_data["categories"]
    }

    if self.config["CLASSES_TO_EVAL"]:
        # invalid class names map to None, which the all() check below catches
        self.class_list = [
            cls.lower() if cls.lower() in self.valid_classes else None
            for cls in self.config["CLASSES_TO_EVAL"]
        ]
        if not all(self.class_list):
            raise TrackEvalException(
                "Attempted to evaluate an invalid class. Only classes "
                + ", ".join(self.valid_classes)
                + " are valid."
            )
    else:
        self.class_list = [cls["name"] for cls in self.gt_data["categories"]]
    self.class_name_to_class_id = {
        k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list
    }

    # Get sequences to eval and check gt files exist
    # NOTE(review): a sequence is identified by the directory component of the
    # video's first file name — presumably one directory per video; confirm
    # against the dataset layout.
    self.seq_list = [
        vid["file_names"][0].split("/")[0] for vid in self.gt_data["videos"]
    ]
    self.seq_name_to_seq_id = {
        vid["file_names"][0].split("/")[0]: vid["id"]
        for vid in self.gt_data["videos"]
    }
    self.seq_lengths = {
        vid["id"]: len(vid["file_names"]) for vid in self.gt_data["videos"]
    }

    # encode masks and compute track areas
    self._prepare_gt_annotations()

    # Get trackers to eval
    if self.config["TRACKER_JSON_OBJECT"] is not None:
        # allow directly specifying the tracker JSON data without reading from files
        tracker_json = self.config["TRACKER_JSON_OBJECT"]
        assert isinstance(tracker_json, list)
        self.tracker_list = ["tracker"]
    elif self.config["TRACKERS_TO_EVAL"] is None:
        self.tracker_list = os.listdir(self.tracker_fol)
    else:
        self.tracker_list = self.config["TRACKERS_TO_EVAL"]

    if self.config["TRACKER_DISPLAY_NAMES"] is None:
        self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
    elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
        len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
    ):
        self.tracker_to_disp = dict(
            zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
        )
    else:
        raise TrackEvalException(
            "List of tracker files and tracker display names do not match."
        )

    # counter for globally unique track IDs
    self.global_tid_counter = 0

    self.tracker_data = dict()
    if self.config["TRACKER_JSON_OBJECT"] is not None:
        # allow directly specifying the tracker JSON data without reading from files
        tracker = self.tracker_list[0]
        self.tracker_data[tracker] = tracker_json
    else:
        # load each tracker's single result JSON from its sub-folder
        for tracker in self.tracker_list:
            tracker_dir_path = os.path.join(
                self.tracker_fol, tracker, self.tracker_sub_fol
            )
            tr_dir_files = [
                file
                for file in os.listdir(tracker_dir_path)
                if file.endswith(".json")
            ]
            if len(tr_dir_files) != 1:
                raise TrackEvalException(
                    tracker_dir_path + " does not contain exactly one json file."
                )

            with open(os.path.join(tracker_dir_path, tr_dir_files[0])) as f:
                curr_data = json.load(f)

            self.tracker_data[tracker] = curr_data
||||
|
||||
def get_display_name(self, tracker):
    """Return the display name configured for *tracker*."""
    display_names = self.tracker_to_disp
    return display_names[tracker]
|
||||
|
||||
def _load_raw_file(self, tracker, seq, is_gt):
    """Load a file (gt or tracker) in the YouTubeVIS format
    If is_gt, this returns a dict which contains the fields:
    [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets]: list (for each timestep) of lists of detections.
    [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
    keys and corresponding segmentations as values) for each track
    [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_iscrowd]: dictionary with class values
    as keys and lists (for each track) as values

    if not is_gt, this returns a dict which contains the fields:
    [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
    [tracker_dets]: list (for each timestep) of lists of detections.
    [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
    keys and corresponding segmentations as values) for each track
    [classes_to_dt_track_ids, classes_to_dt_track_areas]: dictionary with class values as keys and lists as values
    [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
    """
    # select sequence tracks
    seq_id = self.seq_name_to_seq_id[seq]
    if is_gt:
        tracks = [
            ann for ann in self.gt_data["annotations"] if ann["video_id"] == seq_id
        ]
    else:
        tracks = self._get_tracker_seq_tracks(tracker, seq_id)

    # Convert data to required format
    num_timesteps = self.seq_lengths[seq_id]
    data_keys = ["ids", "classes", "dets"]
    if not is_gt:
        data_keys += ["tracker_confidences"]
    raw_data = {key: [None] * num_timesteps for key in data_keys}
    # per-frame results live under "segmentations" or "bboxes" depending on IOU_TYPE
    result_key = "segmentations" if self.iou_type == "segm" else "bboxes"
    for t in range(num_timesteps):
        # a track only contributes at timestep t if it has a (truthy) result there
        raw_data["dets"][t] = [
            track[result_key][t] for track in tracks if track[result_key][t]
        ]
        raw_data["ids"][t] = np.atleast_1d(
            [track["id"] for track in tracks if track[result_key][t]]
        ).astype(int)
        raw_data["classes"][t] = np.atleast_1d(
            [track["category_id"] for track in tracks if track[result_key][t]]
        ).astype(int)
        if not is_gt:
            raw_data["tracker_confidences"][t] = np.atleast_1d(
                [track["score"] for track in tracks if track[result_key][t]]
            ).astype(float)

    # rename the generic keys to gt_*/tracker_* depending on the data source
    if is_gt:
        key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
    else:
        key_map = {
            "ids": "tracker_ids",
            "classes": "tracker_classes",
            "dets": "tracker_dets",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)

    all_cls_ids = {self.class_name_to_class_id[cls] for cls in self.class_list}
    classes_to_tracks = {
        cls: [track for track in tracks if track["category_id"] == cls]
        for cls in all_cls_ids
    }

    # mapping from classes to track representations and track information
    raw_data["classes_to_tracks"] = {
        cls: [
            {i: track[result_key][i] for i in range(len(track[result_key]))}
            for track in tracks
        ]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_ids"] = {
        cls: [track["id"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_areas"] = {
        cls: [track["area"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }

    if is_gt:
        raw_data["classes_to_gt_track_iscrowd"] = {
            cls: [track["iscrowd"] for track in tracks]
            for cls, tracks in classes_to_tracks.items()
        }
    else:
        raw_data["classes_to_dt_track_scores"] = {
            cls: np.array([track["score"] for track in tracks])
            for cls, tracks in classes_to_tracks.items()
        }

    # rename the class-level keys to their gt_/dt_ variants
    if is_gt:
        key_map = {
            "classes_to_tracks": "classes_to_gt_tracks",
            "classes_to_track_ids": "classes_to_gt_track_ids",
            "classes_to_track_areas": "classes_to_gt_track_areas",
        }
    else:
        key_map = {
            "classes_to_tracks": "classes_to_dt_tracks",
            "classes_to_track_ids": "classes_to_dt_track_ids",
            "classes_to_track_areas": "classes_to_dt_track_areas",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)

    raw_data["num_timesteps"] = num_timesteps
    raw_data["seq"] = seq
    return raw_data
|
||||
|
||||
@_timing.time
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess data for a single sequence for a single class ready for evaluation.
    Inputs:
        - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
        - cls is the class to be evaluated.
    Outputs:
        - data is a dict containing all of the information that metrics need to perform evaluation.
            It contains the following fields:
                [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
                [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                [similarity_scores]: list (for each timestep) of 2D NDArrays.
    Notes:
        General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
            1) Extract only detections relevant for the class to be evaluated (including distractor detections).
            2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
                distractor class, or otherwise marked as to be removed.
            3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
                other criteria (e.g. are too small).
            4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
        After the above preprocessing steps, this function also calculates the number of gt and tracker detections
            and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
            unique within each timestep.
    YouTubeVIS:
        In YouTubeVIS, the 4 preproc steps are as follow:
            1) There are 40 classes which are evaluated separately.
            2) No matched tracker dets are removed.
            3) No unmatched tracker dets are removed.
            4) No gt dets are removed.
        Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
        and the tracks from the tracker data are sorted according to the tracker confidence.
    """
    cls_id = self.class_name_to_class_id[cls]

    # Per-timestep containers; each entry is filled in the loop below.
    data_keys = [
        "gt_ids",
        "tracker_ids",
        "gt_dets",
        "tracker_dets",
        "similarity_scores",
    ]
    data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
    unique_gt_ids = []
    unique_tracker_ids = []
    num_gt_dets = 0
    num_tracker_dets = 0

    for t in range(raw_data["num_timesteps"]):
        # Only extract relevant dets for this class for eval (cls)
        gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
        gt_class_mask = gt_class_mask.astype(bool)
        gt_ids = raw_data["gt_ids"][t][gt_class_mask]
        # gt_dets is a plain list (masks or boxes), so filter by index rather
        # than boolean-array indexing.
        gt_dets = [
            raw_data["gt_dets"][t][ind]
            for ind in range(len(gt_class_mask))
            if gt_class_mask[ind]
        ]

        tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
        tracker_class_mask = tracker_class_mask.astype(bool)
        tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
        tracker_dets = [
            raw_data["tracker_dets"][t][ind]
            for ind in range(len(tracker_class_mask))
            if tracker_class_mask[ind]
        ]
        # Keep only the similarity sub-matrix for the selected gt rows and
        # tracker columns of this class.
        similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
            :, tracker_class_mask
        ]

        data["tracker_ids"][t] = tracker_ids
        data["tracker_dets"][t] = tracker_dets
        data["gt_ids"][t] = gt_ids
        data["gt_dets"][t] = gt_dets
        data["similarity_scores"][t] = similarity_scores

        # Accumulate id pools and det counts for the overview statistics below.
        unique_gt_ids += list(np.unique(data["gt_ids"][t]))
        unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
        num_tracker_dets += len(data["tracker_ids"][t])
        num_gt_dets += len(data["gt_ids"][t])

    # Re-label IDs such that there are no empty IDs
    # (maps original ids -> contiguous 0..N-1; unmapped slots stay NaN).
    if len(unique_gt_ids) > 0:
        unique_gt_ids = np.unique(unique_gt_ids)
        gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
        gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["gt_ids"][t]) > 0:
                data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
    if len(unique_tracker_ids) > 0:
        unique_tracker_ids = np.unique(unique_tracker_ids)
        tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
        tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["tracker_ids"][t]) > 0:
                data["tracker_ids"][t] = tracker_id_map[
                    data["tracker_ids"][t]
                ].astype(int)

    # Ensure that ids are unique per timestep.
    self._check_unique_ids(data)

    # Record overview statistics.
    data["num_tracker_dets"] = num_tracker_dets
    data["num_gt_dets"] = num_gt_dets
    data["num_tracker_ids"] = len(unique_tracker_ids)
    data["num_gt_ids"] = len(unique_gt_ids)
    data["num_timesteps"] = raw_data["num_timesteps"]
    data["seq"] = raw_data["seq"]

    # get track representations (precomputed per class in get_raw_seq_data)
    data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
    data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
    data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
    data["gt_track_iscrowd"] = raw_data["classes_to_gt_track_iscrowd"][cls_id]
    data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
    data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
    data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
    data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
    # NOTE(review): iou_type is hard-coded to "mask" here even though
    # _calculate_similarities also supports a box path — confirm this is
    # intentional for TrackMAP consumers.
    data["iou_type"] = "mask"

    # sort tracker data tracks by tracker confidence scores
    # (mergesort keeps the sort stable, so ties preserve original order)
    if data["dt_tracks"]:
        idx = np.argsort(
            [-score for score in data["dt_track_scores"]], kind="mergesort"
        )
        data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
        data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
        data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
        data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]

    return data
|
||||
|
||||
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
    """Compute the GT-vs-tracker similarity matrix for one timestep.

    Uses mask IoU when ``self.iou_type == "segm"`` (detections are
    RLE-encoded masks), otherwise box IoU on xywh boxes.
    """
    if self.iou_type == "segm":
        # Mask path: detections are already RLE-encoded.
        return self._calculate_mask_ious(
            gt_dets_t, tracker_dets_t, is_encoded=True, do_ioa=False
        )
    # Box path: coerce both detection lists into (N, 4) float32 arrays so
    # that empty lists still produce a well-shaped (0, 4) array.
    gt_boxes = np.array(gt_dets_t, dtype=np.float32).reshape(-1, 4)
    trk_boxes = np.array(tracker_dets_t, dtype=np.float32).reshape(-1, 4)
    return self._calculate_box_ious(
        gt_boxes, trk_boxes, box_format="xywh", do_ioa=False
    )
|
||||
|
||||
def _prepare_gt_annotations(self):
    """
    Prepares GT data by rle encoding segmentations and computing the average track area.

    For ``iou_type == "segm"``: converts any uncompressed-RLE segmentation
    (dict whose "counts" is a list) into compressed RLE, then sets
    ``track["area"]`` to the mean of the non-empty per-frame areas.
    For box evaluation: derives per-frame areas from the bboxes (w * h) when
    the annotation has no usable "areas" list.

    :return: None
    """
    if self.iou_type == "segm":
        # only loaded when needed to reduce minimum requirements
        from pycocotools import mask as mask_utils

        for track in self.gt_data["annotations"]:
            h = track["height"]
            w = track["width"]
            for i, seg in enumerate(track["segmentations"]):
                # NOTE(review): this assumes every non-None segmentation is an
                # RLE dict; a polygon-list segmentation would fail on
                # seg["counts"] — confirm the GT format never uses polygons.
                if seg is not None and isinstance(seg["counts"], list):
                    track["segmentations"][i] = mask_utils.frPyObjects(seg, h, w)
            # Average area over frames where the object is present
            # (falsy entries — None or 0 — are ignored).
            areas = [a for a in track["areas"] if a]
            if len(areas) == 0:
                track["area"] = 0
            else:
                track["area"] = np.array(areas).mean()
    else:
        for track in self.gt_data["annotations"]:
            # For bbox eval, compute areas from bboxes if not already available
            areas = [a for a in track.get("areas", []) if a]
            if not areas:
                areas = []
                for bbox in track.get("bboxes", []):
                    if bbox is not None:
                        areas.append(bbox[2] * bbox[3])  # xywh: w * h
            track["area"] = np.array(areas).mean() if areas else 0
|
||||
|
||||
def _get_tracker_seq_tracks(self, tracker, seq_id):
    """
    Prepares tracker data for a given sequence. Extracts all annotations for given sequence ID, computes
    average track area and assigns a track ID.

    If a track has no "areas" list, one is derived per frame: mask area for
    segmentation evaluation, bbox w*h otherwise (None for missing frames).

    :param tracker: the given tracker
    :param seq_id: the sequence ID
    :return: the extracted tracks
    """
    # only loaded when needed to reduce minimum requirements
    from pycocotools import mask as mask_utils

    tracks = [
        ann for ann in self.tracker_data[tracker] if ann["video_id"] == seq_id
    ]
    for track in tracks:
        if "areas" not in track:
            # BUGFIX: this branch is only entered when the "areas" key is
            # absent, yet the original code appended to track["areas"]
            # directly, which always raised KeyError. Initialise the list
            # before filling it.
            track["areas"] = []
            if self.iou_type == "segm":
                for seg in track["segmentations"]:
                    if seg:
                        track["areas"].append(mask_utils.area(seg))
                    else:
                        # Keep a placeholder so areas stays frame-aligned.
                        track["areas"].append(None)
            else:
                for bbox in track["bboxes"]:
                    if bbox:
                        track["areas"].append(bbox[2] * bbox[3])  # xywh: w * h
                    else:
                        track["areas"].append(None)
        # Average area over frames where the object is present
        # (falsy entries — None or 0 — are ignored).
        areas = [a for a in track["areas"] if a]
        if len(areas) == 0:
            track["area"] = 0
        else:
            track["area"] = np.array(areas).mean()
        # Assign a globally unique track id across all sequences/trackers.
        track["id"] = self.global_tid_counter
        self.global_tid_counter += 1
    return tracks
|
||||
|
||||
def get_name(self):
    """Return this dataset's configured display name."""
    dataset_name = self.dataset_name
    return dataset_name
|
||||
395
sam3/eval/hota_eval_toolkit/trackeval/eval.py
Normal file
395
sam3/eval/hota_eval_toolkit/trackeval/eval.py
Normal file
@@ -0,0 +1,395 @@
|
||||
# flake8: noqa
|
||||
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from functools import partial
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import numpy as np
|
||||
|
||||
from . import _timing, utils
|
||||
from .metrics import Count
|
||||
from .utils import TrackEvalException
|
||||
|
||||
# tqdm is an optional dependency: when available, evaluate() can show
# per-sequence progress bars; otherwise it silently falls back to plain loops.
try:
    import tqdm

    TQDM_IMPORTED = True
except ImportError as _:
    TQDM_IMPORTED = False
|
||||
|
||||
|
||||
class Evaluator:
    """Evaluator class for evaluating different metrics for different datasets"""

    @staticmethod
    def get_default_eval_config():
        """Returns the default config values for evaluation"""
        code_path = utils.get_code_path()
        default_config = {
            "USE_PARALLEL": False,
            "NUM_PARALLEL_CORES": 8,
            "BREAK_ON_ERROR": True,  # Raises exception and exits with error
            "RETURN_ON_ERROR": False,  # if not BREAK_ON_ERROR, then returns from function on error
            "LOG_ON_ERROR": os.path.join(
                code_path, "error_log.txt"
            ),  # if not None, save any errors into a log file.
            "PRINT_RESULTS": True,
            "PRINT_ONLY_COMBINED": False,
            "PRINT_CONFIG": True,
            "TIME_PROGRESS": True,
            "DISPLAY_LESS_PROGRESS": True,
            "OUTPUT_SUMMARY": True,
            "OUTPUT_EMPTY_CLASSES": True,  # If False, summary files are not output for classes with no detections
            "OUTPUT_DETAILED": True,
            "PLOT_CURVES": True,
        }
        return default_config

    def __init__(self, config=None):
        """Initialise the evaluator with a config file"""
        self.config = utils.init_config(config, self.get_default_eval_config(), "Eval")
        # Only run timing analysis if not run in parallel.
        if self.config["TIME_PROGRESS"] and not self.config["USE_PARALLEL"]:
            _timing.DO_TIMING = True
            if self.config["DISPLAY_LESS_PROGRESS"]:
                _timing.DISPLAY_LESS_PROGRESS = True

    def _combine_results(
        self,
        res,
        metrics_list,
        metric_names,
        dataset,
        res_field="COMBINED_SEQ",
        target_tag=None,
    ):
        """Combine per-sequence results into res[res_field], then combine classes.

        When target_tag is given, only sequences whose GT annotations carry
        that tag contribute to the combination (used e.g. for
        "tracking_challenging_pair" subsets).

        Returns (res, combined_cls_keys) where combined_cls_keys lists the
        synthetic class keys added under res[res_field].
        """
        assert res_field.startswith("COMBINED_SEQ")
        # collecting combined cls keys (cls averaged, det averaged, super classes)
        tracker_list, seq_list, class_list = dataset.get_eval_info()
        combined_cls_keys = []
        res[res_field] = {}

        # narrow the target for evaluation
        if target_tag is not None:
            target_video_ids = [
                annot["video_id"]
                for annot in dataset.gt_data["annotations"]
                if target_tag in annot["tags"]
            ]
            # Sequence names are the first path component of the video's frames.
            vid2name = {
                video["id"]: video["file_names"][0].split("/")[0]
                for video in dataset.gt_data["videos"]
            }
            target_video_ids = set(target_video_ids)
            target_video = [vid2name[video_id] for video_id in target_video_ids]

            if len(target_video) == 0:
                raise TrackEvalException(
                    "No sequences found with the tag %s" % target_tag
                )

            target_annotations = [
                annot
                for annot in dataset.gt_data["annotations"]
                if annot["video_id"] in target_video_ids
            ]
            assert all(target_tag in annot["tags"] for annot in target_annotations), (
                f"Not all annotations in the target sequences have the target tag {target_tag}. "
                "We currently only support a target tag at the sequence level, not at the annotation level."
            )
        else:
            target_video = seq_list

        # combine sequences for each class
        for c_cls in class_list:
            res[res_field][c_cls] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                curr_res = {
                    seq_key: seq_value[c_cls][metric_name]
                    for seq_key, seq_value in res.items()
                    if not seq_key.startswith("COMBINED_SEQ")
                    and seq_key in target_video
                }
                res[res_field][c_cls][metric_name] = metric.combine_sequences(curr_res)
        # combine classes
        if dataset.should_classes_combine:
            combined_cls_keys += [
                "cls_comb_cls_av",
                "cls_comb_det_av",
                "all",
            ]
            res[res_field]["cls_comb_cls_av"] = {}
            res[res_field]["cls_comb_det_av"] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                cls_res = {
                    cls_key: cls_value[metric_name]
                    for cls_key, cls_value in res[res_field].items()
                    if cls_key not in combined_cls_keys
                }
                res[res_field]["cls_comb_cls_av"][metric_name] = (
                    metric.combine_classes_class_averaged(cls_res)
                )
                res[res_field]["cls_comb_det_av"][metric_name] = (
                    metric.combine_classes_det_averaged(cls_res)
                )
        # combine classes to super classes
        if dataset.use_super_categories:
            for cat, sub_cats in dataset.super_categories.items():
                combined_cls_keys.append(cat)
                res[res_field][cat] = {}
                for metric, metric_name in zip(metrics_list, metric_names):
                    cat_res = {
                        cls_key: cls_value[metric_name]
                        for cls_key, cls_value in res[res_field].items()
                        if cls_key in sub_cats
                    }
                    res[res_field][cat][metric_name] = (
                        metric.combine_classes_det_averaged(cat_res)
                    )
        return res, combined_cls_keys

    def _summarize_results(
        self,
        res,
        tracker,
        metrics_list,
        metric_names,
        dataset,
        res_field,
        combined_cls_keys,
    ):
        """Print, summarize and write out the combined results for one tracker."""
        config = self.config
        output_fol = dataset.get_output_fol(tracker)
        tracker_display_name = dataset.get_display_name(tracker)
        for c_cls in res[
            res_field
        ].keys():  # class_list + combined classes if calculated
            summaries = []
            details = []
            num_dets = res[res_field][c_cls]["Count"]["Dets"]
            if config["OUTPUT_EMPTY_CLASSES"] or num_dets > 0:
                for metric, metric_name in zip(metrics_list, metric_names):
                    # for combined classes there is no per sequence evaluation
                    if c_cls in combined_cls_keys:
                        table_res = {res_field: res[res_field][c_cls][metric_name]}
                    else:
                        table_res = {
                            seq_key: seq_value[c_cls][metric_name]
                            for seq_key, seq_value in res.items()
                        }

                    if config["PRINT_RESULTS"] and config["PRINT_ONLY_COMBINED"]:
                        dont_print = (
                            dataset.should_classes_combine
                            and c_cls not in combined_cls_keys
                        )
                        if not dont_print:
                            metric.print_table(
                                {res_field: table_res[res_field]},
                                tracker_display_name,
                                c_cls,
                                res_field,
                                res_field,
                            )
                    elif config["PRINT_RESULTS"]:
                        metric.print_table(
                            table_res, tracker_display_name, c_cls, res_field, res_field
                        )
                    if config["OUTPUT_SUMMARY"]:
                        summaries.append(metric.summary_results(table_res))
                    if config["OUTPUT_DETAILED"]:
                        details.append(metric.detailed_results(table_res))
                    if config["PLOT_CURVES"]:
                        # NOTE(review): _BaseMetric.plot_single_tracker_results is
                        # declared as (all_res, tracker, output_folder, cls), so the
                        # positional args here pass the class into the output_folder
                        # slot. Harmless for non-plottable metrics (the base impl
                        # ignores its args) — verify against plottable overrides.
                        metric.plot_single_tracker_results(
                            table_res,
                            tracker_display_name,
                            c_cls,
                            output_fol,
                        )
                if config["OUTPUT_SUMMARY"]:
                    utils.write_summary_results(summaries, c_cls, output_fol)
                if config["OUTPUT_DETAILED"]:
                    utils.write_detailed_results(details, c_cls, output_fol)

    @_timing.time
    def evaluate(self, dataset_list, metrics_list, show_progressbar=False):
        """Evaluate a set of metrics on a set of datasets.

        Returns (output_res, output_msg): per-dataset/per-tracker nested
        results and a status message ("Success" or an error description).
        """
        config = self.config
        metrics_list = metrics_list + [Count()]  # Count metrics are always run
        metric_names = utils.validate_metrics_list(metrics_list)
        dataset_names = [dataset.get_name() for dataset in dataset_list]
        output_res = {}
        output_msg = {}

        for dataset, dataset_name in zip(dataset_list, dataset_names):
            # Get dataset info about what to evaluate
            output_res[dataset_name] = {}
            output_msg[dataset_name] = {}
            tracker_list, seq_list, class_list = dataset.get_eval_info()
            print(
                "\nEvaluating %i tracker(s) on %i sequence(s) for %i class(es) on %s dataset using the following "
                "metrics: %s\n"
                % (
                    len(tracker_list),
                    len(seq_list),
                    len(class_list),
                    dataset_name,
                    ", ".join(metric_names),
                )
            )

            # Evaluate each tracker
            for tracker in tracker_list:
                # if not config['BREAK_ON_ERROR'] then go to next tracker without breaking
                try:
                    # Evaluate each sequence in parallel or in series.
                    # returns a nested dict (res), indexed like: res[seq][class][metric_name][sub_metric field]
                    # e.g. res[seq_0001][pedestrian][hota][DetA]
                    print("\nEvaluating %s\n" % tracker)
                    time_start = time.time()
                    if config["USE_PARALLEL"]:
                        if show_progressbar and TQDM_IMPORTED:
                            seq_list_sorted = sorted(seq_list)

                            with Pool(config["NUM_PARALLEL_CORES"]) as pool, tqdm.tqdm(
                                total=len(seq_list)
                            ) as pbar:
                                _eval_sequence = partial(
                                    eval_sequence,
                                    dataset=dataset,
                                    tracker=tracker,
                                    class_list=class_list,
                                    metrics_list=metrics_list,
                                    metric_names=metric_names,
                                )
                                results = []
                                # imap preserves input order, so zipping with
                                # seq_list_sorted below is safe.
                                for r in pool.imap(
                                    _eval_sequence, seq_list_sorted, chunksize=20
                                ):
                                    results.append(r)
                                    pbar.update()
                                res = dict(zip(seq_list_sorted, results))

                        else:
                            with Pool(config["NUM_PARALLEL_CORES"]) as pool:
                                _eval_sequence = partial(
                                    eval_sequence,
                                    dataset=dataset,
                                    tracker=tracker,
                                    class_list=class_list,
                                    metrics_list=metrics_list,
                                    metric_names=metric_names,
                                )
                                results = pool.map(_eval_sequence, seq_list)
                                res = dict(zip(seq_list, results))
                    else:
                        res = {}
                        if show_progressbar and TQDM_IMPORTED:
                            seq_list_sorted = sorted(seq_list)
                            for curr_seq in tqdm.tqdm(seq_list_sorted):
                                res[curr_seq] = eval_sequence(
                                    curr_seq,
                                    dataset,
                                    tracker,
                                    class_list,
                                    metrics_list,
                                    metric_names,
                                )
                        else:
                            for curr_seq in sorted(seq_list):
                                res[curr_seq] = eval_sequence(
                                    curr_seq,
                                    dataset,
                                    tracker,
                                    class_list,
                                    metrics_list,
                                    metric_names,
                                )

                    # Combine results over all sequences and then over all classes
                    res, combined_cls_keys = self._combine_results(
                        res, metrics_list, metric_names, dataset, "COMBINED_SEQ"
                    )

                    if np.all(
                        ["tags" in annot for annot in dataset.gt_data["annotations"]]
                    ):
                        # Combine results over the challenging sequences and then over all classes
                        # currently only support "tracking_challenging_pair"
                        res, _ = self._combine_results(
                            res,
                            metrics_list,
                            metric_names,
                            dataset,
                            "COMBINED_SEQ_CHALLENGING",
                            "tracking_challenging_pair",
                        )

                    # Print and output results in various formats
                    if config["TIME_PROGRESS"]:
                        print(
                            "\nAll sequences for %s finished in %.2f seconds"
                            % (tracker, time.time() - time_start)
                        )

                    self._summarize_results(
                        res,
                        tracker,
                        metrics_list,
                        metric_names,
                        dataset,
                        "COMBINED_SEQ",
                        combined_cls_keys,
                    )
                    if "COMBINED_SEQ_CHALLENGING" in res:
                        self._summarize_results(
                            res,
                            tracker,
                            metrics_list,
                            metric_names,
                            dataset,
                            "COMBINED_SEQ_CHALLENGING",
                            combined_cls_keys,
                        )

                    # Output for returning from function
                    output_res[dataset_name][tracker] = res
                    output_msg[dataset_name][tracker] = "Success"

                except Exception as err:
                    output_res[dataset_name][tracker] = None
                    # FIX: use isinstance instead of an exact type comparison so
                    # that subclasses of TrackEvalException also surface their
                    # message instead of "Unknown error occurred."
                    if isinstance(err, TrackEvalException):
                        output_msg[dataset_name][tracker] = str(err)
                    else:
                        output_msg[dataset_name][tracker] = "Unknown error occurred."
                    print("Tracker %s was unable to be evaluated." % tracker)
                    print(err)
                    traceback.print_exc()
                    if config["LOG_ON_ERROR"] is not None:
                        with open(config["LOG_ON_ERROR"], "a") as f:
                            print(dataset_name, file=f)
                            print(tracker, file=f)
                            print(traceback.format_exc(), file=f)
                            print("\n\n\n", file=f)
                    if config["BREAK_ON_ERROR"]:
                        raise err
                    elif config["RETURN_ON_ERROR"]:
                        return output_res, output_msg

        return output_res, output_msg
|
||||
|
||||
|
||||
@_timing.time
def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
    """Evaluate every metric for every class on a single sequence.

    Returns a nested dict indexed as result[class][metric_name].
    """
    raw_data = dataset.get_raw_seq_data(tracker, seq)
    results = {}
    for cls in class_list:
        # Preprocess once per class, then run each metric on the same data.
        cls_data = dataset.get_preprocessed_seq_data(raw_data, cls)
        results[cls] = {
            met_name: metric.eval_sequence(cls_data)
            for metric, met_name in zip(metrics_list, metric_names)
        }
    return results
|
||||
@@ -0,0 +1,4 @@
|
||||
# flake8: noqa
|
||||
|
||||
from .count import Count
|
||||
from .hota import HOTA
|
||||
145
sam3/eval/hota_eval_toolkit/trackeval/metrics/_base_metric.py
Normal file
145
sam3/eval/hota_eval_toolkit/trackeval/metrics/_base_metric.py
Normal file
@@ -0,0 +1,145 @@
|
||||
# flake8: noqa
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing
|
||||
from ..utils import TrackEvalException
|
||||
|
||||
|
||||
class _BaseMetric(ABC):
    """Abstract base class for all metrics.

    Subclasses declare their result fields in __init__ (integer/float scalar
    and array fields plus which appear in summaries) and implement the four
    abstract combine/eval hooks; this base class provides printing,
    summarising and detailed-output helpers driven by those field lists.
    """

    @abstractmethod
    def __init__(self):
        # Whether plot_single_tracker_results is meaningful for this metric.
        self.plottable = False
        # Scalar result fields, split by type (drives formatting in _summary_row).
        self.integer_fields = []
        self.float_fields = []
        # Labels (e.g. alpha thresholds) for the per-element array fields below.
        self.array_labels = []
        self.integer_array_fields = []
        self.float_array_fields = []
        # All fields, and the subset shown in printed summaries.
        self.fields = []
        self.summary_fields = []
        self.registered = False

    #####################################################################
    # Abstract functions for subclasses to implement

    @_timing.time
    @abstractmethod
    def eval_sequence(self, data): ...

    @abstractmethod
    def combine_sequences(self, all_res): ...

    @abstractmethod
    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): ...

    @abstractmethod
    def combine_classes_det_averaged(self, all_res): ...

    def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
        """Plot results of metrics, only valid for metrics with self.plottable"""
        if self.plottable:
            # Plottable metrics must override this; reaching here is a bug.
            raise NotImplementedError(
                "plot_results is not implemented for metric %s" % self.get_name()
            )
        else:
            # Non-plottable metrics: silently do nothing.
            pass

    #####################################################################
    # Helper functions which are useful for all metrics:

    @classmethod
    def get_name(cls):
        # Metric name defaults to the class name (e.g. "HOTA", "Count").
        return cls.__name__

    @staticmethod
    def _combine_sum(all_res, field):
        """Combine sequence results via sum"""
        return sum([all_res[k][field] for k in all_res.keys()])

    @staticmethod
    def _combine_weighted_av(all_res, field, comb_res, weight_field):
        """Combine sequence results via weighted average"""
        # np.maximum guards against a zero total weight (division by zero).
        return sum(
            [all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]
        ) / np.maximum(1.0, comb_res[weight_field])

    def print_table(
        self, table_res, tracker, cls, res_field="COMBINED_SEQ", output_lable="COMBINED"
    ):
        """Prints table of results for all sequences

        NOTE(review): "output_lable" is a typo for "output_label", but it is a
        keyword parameter name (part of the public interface), so it is kept.
        """
        print("")
        metric_name = self.get_name()
        # Header row: metric/tracker/class identifier followed by field names.
        self._row_print(
            [metric_name + ": " + tracker + "-" + cls] + self.summary_fields
        )
        for seq, results in sorted(table_res.items()):
            # Combined rows are printed last, after the per-sequence rows.
            if seq.startswith("COMBINED_SEQ"):
                continue
            summary_res = self._summary_row(results)
            self._row_print([seq] + summary_res)
        summary_res = self._summary_row(table_res[res_field])
        self._row_print([output_lable] + summary_res)

    def _summary_row(self, results_):
        """Format one row of summary values (percentages for float fields)."""
        vals = []
        for h in self.summary_fields:
            if h in self.float_array_fields:
                # Array fields are summarised by their mean over array_labels.
                vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
            elif h in self.float_fields:
                vals.append("{0:1.5g}".format(100 * float(results_[h])))
            elif h in self.integer_fields:
                vals.append("{0:d}".format(int(results_[h])))
            else:
                raise NotImplementedError(
                    "Summary function not implemented for this field type."
                )
        return vals

    @staticmethod
    def _row_print(*argv):
        """Prints results in an evenly spaced rows, with more space in first row"""
        # Accept either a single list argument or varargs.
        if len(argv) == 1:
            argv = argv[0]
        to_print = "%-35s" % argv[0]
        for v in argv[1:]:
            to_print += "%-10s" % str(v)
        print(to_print)

    def summary_results(self, table_res):
        """Returns a simple summary of final results for a tracker"""
        return dict(
            zip(self.summary_fields, self._summary_row(table_res["COMBINED_SEQ"]))
        )

    def detailed_results(self, table_res):
        """Returns detailed final results for a tracker"""
        # Get detailed field information: scalar fields, then one entry per
        # array label (named "<field>___<alpha>") plus an AUC (mean) entry.
        detailed_fields = self.float_fields + self.integer_fields
        for h in self.float_array_fields + self.integer_array_fields:
            for alpha in [int(100 * x) for x in self.array_labels]:
                detailed_fields.append(h + "___" + str(alpha))
            detailed_fields.append(h + "___AUC")

        # Get detailed results
        detailed_results = {}
        for seq, res in table_res.items():
            detailed_row = self._detailed_row(res)
            if len(detailed_row) != len(detailed_fields):
                raise TrackEvalException(
                    "Field names and data have different sizes (%i and %i)"
                    % (len(detailed_row), len(detailed_fields))
                )
            detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
        return detailed_results

    def _detailed_row(self, res):
        """Flatten one result dict into a row matching detailed_results' fields."""
        detailed_row = []
        for h in self.float_fields + self.integer_fields:
            detailed_row.append(res[h])
        for h in self.float_array_fields + self.integer_array_fields:
            for i, alpha in enumerate([int(100 * x) for x in self.array_labels]):
                detailed_row.append(res[h][i])
            # AUC entry: mean over all array labels.
            detailed_row.append(np.mean(res[h]))
        return detailed_row
|
||||
48
sam3/eval/hota_eval_toolkit/trackeval/metrics/count.py
Normal file
48
sam3/eval/hota_eval_toolkit/trackeval/metrics/count.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# flake8: noqa
|
||||
|
||||
from .. import _timing
|
||||
from ._base_metric import _BaseMetric
|
||||
|
||||
|
||||
class Count(_BaseMetric):
    """Class which simply counts the number of tracker and gt detections and ids."""

    def __init__(self, config=None):
        super().__init__()
        self.integer_fields = ["Dets", "GT_Dets", "IDs", "GT_IDs"]
        self.fields = self.integer_fields
        self.summary_fields = self.fields

    @_timing.time
    def eval_sequence(self, data):
        """Returns counts for one sequence"""
        # Copy the precomputed overview statistics straight out of the
        # preprocessed data. "Frames" is recorded per sequence but is not in
        # integer_fields, so it is dropped when sequences are combined.
        return {
            "Dets": data["num_tracker_dets"],
            "GT_Dets": data["num_gt_dets"],
            "IDs": data["num_tracker_ids"],
            "GT_IDs": data["num_gt_ids"],
            "Frames": data["num_timesteps"],
        }

    def combine_sequences(self, all_res):
        """Combines metrics across all sequences (counts are summed)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=None):
        """Combines metrics across all classes.

        For counts the "class averaged" combination is a plain sum — averaging
        detection counts would not be meaningful.
        """
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}

    def combine_classes_det_averaged(self, all_res):
        """Combines metrics across all classes (counts are summed)."""
        return {field: self._combine_sum(all_res, field) for field in self.integer_fields}
|
||||
291
sam3/eval/hota_eval_toolkit/trackeval/metrics/hota.py
Normal file
291
sam3/eval/hota_eval_toolkit/trackeval/metrics/hota.py
Normal file
@@ -0,0 +1,291 @@
|
||||
# flake8: noqa
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from .. import _timing
|
||||
from ._base_metric import _BaseMetric
|
||||
|
||||
|
||||
class HOTA(_BaseMetric):
|
||||
"""Class which implements the HOTA metrics.
|
||||
See: https://link.springer.com/article/10.1007/s11263-020-01375-2
|
||||
"""
|
||||
|
||||
def __init__(self, config=None):
    """Declare the HOTA result fields consumed by the _BaseMetric helpers."""
    super().__init__()
    self.plottable = True
    # 19 alpha (localisation IoU) thresholds: 0.05, 0.10, ..., 0.95.
    self.array_labels = np.arange(0.05, 0.99, 0.05)
    # Per-alpha TP/FN/FP counts.
    self.integer_array_fields = ["HOTA_TP", "HOTA_FN", "HOTA_FP"]
    # Per-alpha HOTA sub-metric scores.
    self.float_array_fields = [
        "HOTA",
        "DetA",
        "AssA",
        "DetRe",
        "DetPr",
        "AssRe",
        "AssPr",
        "LocA",
        "OWTA",
    ]
    # Scalar scores evaluated at the lowest alpha threshold.
    self.float_fields = ["HOTA(0)", "LocA(0)", "HOTALocA(0)"]
    self.fields = (
        self.float_array_fields + self.integer_array_fields + self.float_fields
    )
    self.summary_fields = self.float_array_fields + self.float_fields
|
||||
|
||||
@_timing.time
|
||||
def eval_sequence(self, data):
|
||||
"""Calculates the HOTA metrics for one sequence"""
|
||||
|
||||
# Initialise results
|
||||
res = {}
|
||||
for field in self.float_array_fields + self.integer_array_fields:
|
||||
res[field] = np.zeros((len(self.array_labels)), dtype=float)
|
||||
for field in self.float_fields:
|
||||
res[field] = 0
|
||||
|
||||
# Return result quickly if tracker or gt sequence is empty
|
||||
if data["num_tracker_dets"] == 0:
|
||||
res["HOTA_FN"] = data["num_gt_dets"] * np.ones(
|
||||
(len(self.array_labels)), dtype=float
|
||||
)
|
||||
res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
|
||||
res["LocA(0)"] = 1.0
|
||||
return res
|
||||
if data["num_gt_dets"] == 0:
|
||||
res["HOTA_FP"] = data["num_tracker_dets"] * np.ones(
|
||||
(len(self.array_labels)), dtype=float
|
||||
)
|
||||
res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
|
||||
res["LocA(0)"] = 1.0
|
||||
return res
|
||||
|
||||
# Variables counting global association
|
||||
potential_matches_count = np.zeros(
|
||||
(data["num_gt_ids"], data["num_tracker_ids"])
|
||||
)
|
||||
gt_id_count = np.zeros((data["num_gt_ids"], 1))
|
||||
tracker_id_count = np.zeros((1, data["num_tracker_ids"]))
|
||||
|
||||
# First loop through each timestep and accumulate global track information.
|
||||
for t, (gt_ids_t, tracker_ids_t) in enumerate(
|
||||
zip(data["gt_ids"], data["tracker_ids"])
|
||||
):
|
||||
# Count the potential matches between ids in each timestep
|
||||
# These are normalised, weighted by the match similarity.
|
||||
similarity = data["similarity_scores"][t]
|
||||
sim_iou_denom = (
|
||||
similarity.sum(0)[np.newaxis, :]
|
||||
+ similarity.sum(1)[:, np.newaxis]
|
||||
- similarity
|
||||
)
|
||||
sim_iou = np.zeros_like(similarity)
|
||||
sim_iou_mask = sim_iou_denom > 0 + np.finfo("float").eps
|
||||
sim_iou[sim_iou_mask] = (
|
||||
similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]
|
||||
)
|
||||
potential_matches_count[
|
||||
gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
|
||||
] += sim_iou
|
||||
|
||||
# Calculate the total number of dets for each gt_id and tracker_id.
|
||||
gt_id_count[gt_ids_t] += 1
|
||||
tracker_id_count[0, tracker_ids_t] += 1
|
||||
|
||||
# Calculate overall jaccard alignment score (before unique matching) between IDs
|
||||
global_alignment_score = potential_matches_count / (
|
||||
gt_id_count + tracker_id_count - potential_matches_count
|
||||
)
|
||||
matches_counts = [
|
||||
np.zeros_like(potential_matches_count) for _ in self.array_labels
|
||||
]
|
||||
|
||||
# Calculate scores for each timestep
|
||||
for t, (gt_ids_t, tracker_ids_t) in enumerate(
|
||||
zip(data["gt_ids"], data["tracker_ids"])
|
||||
):
|
||||
# Deal with the case that there are no gt_det/tracker_det in a timestep.
|
||||
if len(gt_ids_t) == 0:
|
||||
for a, alpha in enumerate(self.array_labels):
|
||||
res["HOTA_FP"][a] += len(tracker_ids_t)
|
||||
continue
|
||||
if len(tracker_ids_t) == 0:
|
||||
for a, alpha in enumerate(self.array_labels):
|
||||
res["HOTA_FN"][a] += len(gt_ids_t)
|
||||
continue
|
||||
|
||||
# Get matching scores between pairs of dets for optimizing HOTA
|
||||
similarity = data["similarity_scores"][t]
|
||||
score_mat = (
|
||||
global_alignment_score[
|
||||
gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
|
||||
]
|
||||
* similarity
|
||||
)
|
||||
|
||||
# Hungarian algorithm to find best matches
|
||||
match_rows, match_cols = linear_sum_assignment(-score_mat)
|
||||
|
||||
# Calculate and accumulate basic statistics
|
||||
for a, alpha in enumerate(self.array_labels):
|
||||
actually_matched_mask = (
|
||||
similarity[match_rows, match_cols] >= alpha - np.finfo("float").eps
|
||||
)
|
||||
alpha_match_rows = match_rows[actually_matched_mask]
|
||||
alpha_match_cols = match_cols[actually_matched_mask]
|
||||
num_matches = len(alpha_match_rows)
|
||||
res["HOTA_TP"][a] += num_matches
|
||||
res["HOTA_FN"][a] += len(gt_ids_t) - num_matches
|
||||
res["HOTA_FP"][a] += len(tracker_ids_t) - num_matches
|
||||
if num_matches > 0:
|
||||
res["LocA"][a] += sum(
|
||||
similarity[alpha_match_rows, alpha_match_cols]
|
||||
)
|
||||
matches_counts[a][
|
||||
gt_ids_t[alpha_match_rows], tracker_ids_t[alpha_match_cols]
|
||||
] += 1
|
||||
|
||||
# Calculate association scores (AssA, AssRe, AssPr) for the alpha value.
|
||||
# First calculate scores per gt_id/tracker_id combo and then average over the number of detections.
|
||||
for a, alpha in enumerate(self.array_labels):
|
||||
matches_count = matches_counts[a]
|
||||
ass_a = matches_count / np.maximum(
|
||||
1, gt_id_count + tracker_id_count - matches_count
|
||||
)
|
||||
res["AssA"][a] = np.sum(matches_count * ass_a) / np.maximum(
|
||||
1, res["HOTA_TP"][a]
|
||||
)
|
||||
ass_re = matches_count / np.maximum(1, gt_id_count)
|
||||
res["AssRe"][a] = np.sum(matches_count * ass_re) / np.maximum(
|
||||
1, res["HOTA_TP"][a]
|
||||
)
|
||||
ass_pr = matches_count / np.maximum(1, tracker_id_count)
|
||||
res["AssPr"][a] = np.sum(matches_count * ass_pr) / np.maximum(
|
||||
1, res["HOTA_TP"][a]
|
||||
)
|
||||
|
||||
# Calculate final scores
|
||||
res["LocA"] = np.maximum(1e-10, res["LocA"]) / np.maximum(1e-10, res["HOTA_TP"])
|
||||
res = self._compute_final_fields(res)
|
||||
return res
|
||||
|
||||
def combine_sequences(self, all_res):
|
||||
"""Combines metrics across all sequences"""
|
||||
res = {}
|
||||
for field in self.integer_array_fields:
|
||||
res[field] = self._combine_sum(all_res, field)
|
||||
for field in ["AssRe", "AssPr", "AssA"]:
|
||||
res[field] = self._combine_weighted_av(
|
||||
all_res, field, res, weight_field="HOTA_TP"
|
||||
)
|
||||
loca_weighted_sum = sum(
|
||||
[all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
|
||||
)
|
||||
res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
|
||||
1e-10, res["HOTA_TP"]
|
||||
)
|
||||
res = self._compute_final_fields(res)
|
||||
return res
|
||||
|
||||
def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
|
||||
"""Combines metrics across all classes by averaging over the class values.
|
||||
If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
|
||||
"""
|
||||
res = {}
|
||||
for field in self.integer_array_fields:
|
||||
if ignore_empty_classes:
|
||||
res[field] = self._combine_sum(
|
||||
{
|
||||
k: v
|
||||
for k, v in all_res.items()
|
||||
if (
|
||||
v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
|
||||
> 0 + np.finfo("float").eps
|
||||
).any()
|
||||
},
|
||||
field,
|
||||
)
|
||||
else:
|
||||
res[field] = self._combine_sum(
|
||||
{k: v for k, v in all_res.items()}, field
|
||||
)
|
||||
|
||||
for field in self.float_fields + self.float_array_fields:
|
||||
if ignore_empty_classes:
|
||||
res[field] = np.mean(
|
||||
[
|
||||
v[field]
|
||||
for v in all_res.values()
|
||||
if (
|
||||
v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
|
||||
> 0 + np.finfo("float").eps
|
||||
).any()
|
||||
],
|
||||
axis=0,
|
||||
)
|
||||
else:
|
||||
res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
|
||||
return res
|
||||
|
||||
def combine_classes_det_averaged(self, all_res):
|
||||
"""Combines metrics across all classes by averaging over the detection values"""
|
||||
res = {}
|
||||
for field in self.integer_array_fields:
|
||||
res[field] = self._combine_sum(all_res, field)
|
||||
for field in ["AssRe", "AssPr", "AssA"]:
|
||||
res[field] = self._combine_weighted_av(
|
||||
all_res, field, res, weight_field="HOTA_TP"
|
||||
)
|
||||
loca_weighted_sum = sum(
|
||||
[all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
|
||||
)
|
||||
res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
|
||||
1e-10, res["HOTA_TP"]
|
||||
)
|
||||
res = self._compute_final_fields(res)
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def _compute_final_fields(res):
|
||||
"""Calculate sub-metric ('field') values which only depend on other sub-metric values.
|
||||
This function is used both for both per-sequence calculation, and in combining values across sequences.
|
||||
"""
|
||||
res["DetRe"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FN"])
|
||||
res["DetPr"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FP"])
|
||||
res["DetA"] = res["HOTA_TP"] / np.maximum(
|
||||
1, res["HOTA_TP"] + res["HOTA_FN"] + res["HOTA_FP"]
|
||||
)
|
||||
res["HOTA"] = np.sqrt(res["DetA"] * res["AssA"])
|
||||
res["OWTA"] = np.sqrt(res["DetRe"] * res["AssA"])
|
||||
|
||||
res["HOTA(0)"] = res["HOTA"][0]
|
||||
res["LocA(0)"] = res["LocA"][0]
|
||||
res["HOTALocA(0)"] = res["HOTA(0)"] * res["LocA(0)"]
|
||||
return res
|
||||
|
||||
def plot_single_tracker_results(self, table_res, tracker, cls, output_folder):
|
||||
"""Create plot of results"""
|
||||
|
||||
# Only loaded when run to reduce minimum requirements
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
res = table_res["COMBINED_SEQ"]
|
||||
styles_to_plot = ["r", "b", "g", "b--", "b:", "g--", "g:", "m"]
|
||||
for name, style in zip(self.float_array_fields, styles_to_plot):
|
||||
plt.plot(self.array_labels, res[name], style)
|
||||
plt.xlabel("alpha")
|
||||
plt.ylabel("score")
|
||||
plt.title(tracker + " - " + cls)
|
||||
plt.axis([0, 1, 0, 1])
|
||||
legend = []
|
||||
for name in self.float_array_fields:
|
||||
legend += [name + " (" + str(np.round(np.mean(res[name]), 2)) + ")"]
|
||||
plt.legend(legend, loc="lower left")
|
||||
out_file = os.path.join(output_folder, cls + "_plot.pdf")
|
||||
os.makedirs(os.path.dirname(out_file), exist_ok=True)
|
||||
plt.savefig(out_file)
|
||||
plt.savefig(out_file.replace(".pdf", ".png"))
|
||||
plt.clf()
|
||||
195
sam3/eval/hota_eval_toolkit/trackeval/utils.py
Normal file
195
sam3/eval/hota_eval_toolkit/trackeval/utils.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# flake8: noqa
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
def init_config(config, default_config, name=None):
    """Initialise non-given config values with defaults.

    If ``config`` is None the defaults are returned directly; otherwise every key
    missing from ``config`` is filled in (in place) from ``default_config``. When
    ``name`` is given and the resulting config has PRINT_CONFIG set, the config
    is echoed to stdout.
    """
    if config is None:
        config = default_config
    else:
        for key, default_value in default_config.items():
            config.setdefault(key, default_value)
    if name and config["PRINT_CONFIG"]:
        print("\n%s Config:" % name)
        for key in config:
            print("%-20s : %-30s" % (key, config[key]))
    return config
|
||||
|
||||
|
||||
def update_config(config):
    """
    Parse the arguments of a script and updates the config values for a given value if specified in the arguments.

    :param config: the config to update
    :return: the updated config
    """
    parser = argparse.ArgumentParser()
    for setting in config.keys():
        # List-valued (or unset) options accept one or more values on the command line.
        if type(config[setting]) == list or type(config[setting]) == type(None):
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args().__dict__
    for setting in args.keys():
        if args[setting] is not None:
            # Coerce the raw string back to the type of the current config value.
            # NOTE: deliberate exact type() checks (not isinstance) so that e.g.
            # bools are handled before the int branch.
            if type(config[setting]) == type(True):
                if args[setting] == "True":
                    x = True
                elif args[setting] == "False":
                    x = False
                else:
                    # BUG FIX: the original message lacked the space before "must",
                    # producing e.g. "parameter FOOmust be True or False".
                    raise Exception(
                        "Command line parameter " + setting + " must be True or False"
                    )
            elif type(config[setting]) == type(1):
                x = int(args[setting])
            elif type(args[setting]) == type(None):
                x = None
            else:
                x = args[setting]
            config[setting] = x
    return config
|
||||
|
||||
|
||||
def get_code_path():
    """Get base path where code is"""
    # The toolkit root is one directory above this module.
    here = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(here, ".."))
|
||||
|
||||
|
||||
def validate_metrics_list(metrics_list):
    """Get names of metric class and ensures they are unique, further checks that the fields within each metric class
    do not have overlapping names.
    """
    metric_names = [metric.get_name() for metric in metrics_list]
    # check metric names are unique
    if len(set(metric_names)) != len(metric_names):
        raise TrackEvalException(
            "Code being run with multiple metrics of the same name"
        )
    all_fields = []
    for metric in metrics_list:
        all_fields.extend(metric.fields)
    # check metric fields are unique
    if len(set(all_fields)) != len(all_fields):
        raise TrackEvalException(
            "Code being run with multiple metrics with fields of the same name"
        )
    return metric_names
|
||||
|
||||
|
||||
def write_summary_results(summaries, cls, output_folder):
    """Write summary results to file.

    Writes a two-row, space-delimited file: one row of field names followed by
    one row of values, merged across all summary dicts.
    """

    # In order to remain consistent upon new fields being adding, for each of the following fields if they are present
    # they will be output in the summary first in the order below. Any further fields will be output in the order each
    # metric family is called, and within each family either in the order they were added to the dict (python >= 3.6) or
    # randomly (python < 3.6).
    default_order = [
        "HOTA",
        "DetA",
        "AssA",
        "DetRe",
        "DetPr",
        "AssRe",
        "AssPr",
        "LocA",
        "OWTA",
        "HOTA(0)",
        "LocA(0)",
        "HOTALocA(0)",
        "MOTA",
        "MOTP",
        "MODA",
        "CLR_Re",
        "CLR_Pr",
        "MTR",
        "PTR",
        "MLR",
        "CLR_TP",
        "CLR_FN",
        "CLR_FP",
        "IDSW",
        "MT",
        "PT",
        "ML",
        "Frag",
        "sMOTA",
        "IDF1",
        "IDR",
        "IDP",
        "IDTP",
        "IDFN",
        "IDFP",
        "Dets",
        "GT_Dets",
        "IDs",
        "GT_IDs",
    ]
    # Pre-seed with the canonical ordering, then overlay the actual values;
    # unknown fields land after the defaults in encounter order.
    ordered = OrderedDict((key, None) for key in default_order)
    for summary in summaries:
        for field, value in summary.items():
            ordered[field] = value
    # Drop default slots that were never filled.
    for key in default_order:
        if ordered[key] is None:
            del ordered[key]

    out_file = os.path.join(output_folder, cls + "_summary.txt")
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    with open(out_file, "w", newline="") as f:
        writer = csv.writer(f, delimiter=" ")
        writer.writerow(list(ordered.keys()))
        writer.writerow(list(ordered.values()))
|
||||
|
||||
|
||||
def write_detailed_results(details, cls, output_folder):
    """Write detailed results to file.

    Emits one CSV row per sequence (sorted), with per-metric columns merged
    across all detail dicts, and a final "COMBINED" row.
    """
    sequences = details[0].keys()
    fields = ["seq"] + [k for detail in details for k in detail["COMBINED_SEQ"].keys()]
    out_file = os.path.join(output_folder, cls + "_detailed.csv")
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    with open(out_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        for seq in sorted(sequences):
            # The combined entry is written last, under its own label.
            if seq == "COMBINED_SEQ":
                continue
            row = [seq]
            for detail in details:
                row.extend(detail[seq].values())
            writer.writerow(row)
        combined_row = ["COMBINED"]
        for detail in details:
            combined_row.extend(detail["COMBINED_SEQ"].values())
        writer.writerow(combined_row)
|
||||
|
||||
|
||||
def load_detail(file):
    """Loads detailed data for a tracker.

    Reads a comma-separated detail file: the first row supplies the metric
    names, each following row maps a sequence name to float values. A row
    labelled "COMBINED" is stored under the key "COMBINED_SEQ".
    """
    data = {}
    keys = []
    with open(file) as f:
        for line_num, raw_line in enumerate(f):
            cells = raw_line.replace("\r", "").replace("\n", "").split(",")
            if line_num == 0:
                keys = cells[1:]
                continue
            seq = cells[0]
            values = cells[1:]
            if seq == "COMBINED":
                seq = "COMBINED_SEQ"
            # Skip blank or malformed rows.
            if seq != "" and len(values) == len(keys):
                data[seq] = dict(zip(keys, (float(v) for v in values)))
    return data
|
||||
|
||||
|
||||
class TrackEvalException(Exception):
    """Custom exception for catching expected errors."""

    pass
|
||||
648
sam3/eval/postprocessors.py
Normal file
648
sam3/eval/postprocessors.py
Normal file
@@ -0,0 +1,648 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
"""Postprocessors class to transform MDETR output according to the downstream task"""
|
||||
|
||||
import dataclasses
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from sam3.model import box_ops
|
||||
from sam3.model.data_misc import BatchedInferenceMetadata, interpolate
|
||||
from sam3.train.masks_ops import rle_encode, robust_rle_encode
|
||||
from torch import nn
|
||||
|
||||
|
||||
class PostProcessNullOp(nn.Module):
    """No-op postprocessor: passes the find stages through unchanged."""

    def __init__(self, **kwargs):
        # BUG FIX: the original called `super(PostProcessNullOp).__init__()`,
        # which initialises the *unbound super object* rather than nn.Module,
        # so the module's internal state (_parameters, _buffers, _modules, ...)
        # was never set up. `super().__init__()` performs the intended
        # nn.Module initialisation. Extra kwargs are accepted and ignored for
        # config-driven construction compatibility.
        super().__init__()

    def forward(self, input):
        """No computation is performed; returns None."""
        pass

    def process_results(self, **kwargs):
        """Return the raw `find_stages` untouched."""
        return kwargs["find_stages"]
|
||||
|
||||
|
||||
class PostProcessImage(nn.Module):
    """This module converts the model's output into the format expected by the coco api.

    Boxes are rescaled to target image sizes, per-query class probabilities are
    reduced to a (score, label) pair, and (optionally) mask logits are
    interpolated to the target size, thresholded at 0.5 and optionally
    RLE-encoded. `process_results` additionally merges per-stage outputs by
    image id and caps the number of detections per image.
    """

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        to_cpu: bool = True,
        use_original_ids: bool = False,
        use_original_sizes_box: bool = False,
        use_original_sizes_mask: bool = False,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        use_presence: bool = True,
        detection_threshold: float = -1.0,
    ) -> None:
        super().__init__()
        # Cap on detections kept per image in `process_results` (<= 0 disables).
        self.max_dets_per_img = max_dets_per_img
        # "segm" enables mask postprocessing; anything else skips masks.
        self.iou_type = iou_type
        self.to_cpu = to_cpu
        self.convert_mask_to_rle = convert_mask_to_rle
        self.always_interpolate_masks_on_gpu = always_interpolate_masks_on_gpu

        # Multiply per-query probabilities by the decoder presence score.
        self.use_presence = use_presence
        # Score threshold for keeping detections (<= 0 keeps everything).
        self.detection_threshold = detection_threshold
        self.use_original_ids = use_original_ids
        self.use_original_sizes_box = use_original_sizes_box
        self.use_original_sizes_mask = use_original_sizes_mask

    @torch.no_grad()
    def forward(
        self,
        outputs,
        target_sizes_boxes,
        target_sizes_masks,
        forced_labels=None,
        consistent=False,
        ret_tensordict: bool = False,  # This is experimental
    ):
        """Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes_boxes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                For evaluation, this must be the original image size (before any data augmentation)
                For visualization, this should be the image size after data augment, but before padding
            target_sizes_masks: same but used to resize masks
            forced_labels: tensor of dimension [batch_size] containing the label to force for each image of the batch
                This is useful when evaluating the model using standard metrics (eg on COCO, LVIS). In that case,
                we query the model with every possible class label, so we when we pass the predictions to the evaluator,
                we want to make sure that the predicted "class" matches the one that was queried.
            consistent: whether all target sizes are equal
            ret_tensordict: Experimental argument. If true, return a tensordict.TensorDict instead of a list of dictionaries for easier manipulation.
        """
        if ret_tensordict:
            assert (
                consistent is True
            ), "We don't support returning TensorDict if the outputs have different shapes"  # NOTE: It's possible but we don't support it.
            assert self.detection_threshold <= 0.0, "TODO: implement?"
            try:
                from tensordict import TensorDict
            except ImportError:
                logging.info(
                    "tensordict is not installed. Install by running `pip install tensordict --no-deps`. Falling back by setting `ret_tensordict=False`"
                )
                ret_tensordict = False

        out_bbox = outputs["pred_boxes"] if "pred_boxes" in outputs else None
        out_logits = outputs["pred_logits"]
        # Masks are only postprocessed when evaluating segmentation IoU.
        pred_masks = outputs["pred_masks"] if self.iou_type == "segm" else None
        out_probs = out_logits.sigmoid()
        if self.use_presence:
            # Scale per-query probabilities by the image-level presence score.
            presence_score = outputs["presence_logit_dec"].sigmoid().unsqueeze(1)
            out_probs = out_probs * presence_score

        assert target_sizes_boxes.shape[1] == 2
        assert target_sizes_masks.shape[1] == 2
        batch_size = target_sizes_boxes.shape[0]

        boxes, scores, labels, keep = self._process_boxes_and_labels(
            target_sizes_boxes, forced_labels, out_bbox, out_probs
        )
        assert boxes is None or len(boxes) == batch_size
        out_masks = self._process_masks(
            target_sizes_masks, pred_masks, consistent=consistent, keep=keep
        )
        # Free the raw mask logits early; interpolation may have copied them to GPU.
        del pred_masks

        if boxes is None:
            # Mask-only output: pad the box/score/label slots with None.
            assert out_masks is not None
            assert not ret_tensordict, "We don't support returning TensorDict if the output does not contain boxes"
            B = len(out_masks)
            boxes = [None] * B
            scores = [None] * B
            labels = [None] * B

        results = {
            "scores": scores,
            "labels": labels,
            "boxes": boxes,
        }
        if out_masks is not None:
            if self.convert_mask_to_rle:
                results.update(masks_rle=out_masks)
            else:
                results.update(masks=out_masks)

        if ret_tensordict:
            results = TensorDict(results).auto_batch_size_()
            if self.to_cpu:
                results = results.cpu()
        else:
            # Convert a dictionary of lists/tensors to a list of dictionaries (one per image).
            results = [
                dict(zip(results.keys(), res_tuple))
                for res_tuple in zip(*results.values())
            ]

        return results

    def _process_masks(self, target_sizes, pred_masks, consistent=True, keep=None):
        """Interpolate mask logits to `target_sizes` and binarise at 0.5.

        Returns None when there are no masks; otherwise a single tensor when
        `consistent` (all targets share one size) or a per-image list.
        """
        if pred_masks is None:
            return None
        if self.always_interpolate_masks_on_gpu:
            gpu_device = target_sizes.device
            assert gpu_device.type == "cuda"
            pred_masks = pred_masks.to(device=gpu_device)
        if consistent:
            assert keep is None, "TODO: implement?"
            # All masks should have the same shape, expected when processing a batch of size 1
            target_size = target_sizes.unique(dim=0)
            assert target_size.size(0) == 1, "Expecting all target sizes to be equal"
            out_masks = (
                interpolate(
                    pred_masks,
                    target_size.squeeze().tolist(),
                    mode="bilinear",
                    align_corners=False,
                ).sigmoid()
                > 0.5
            )
            if self.convert_mask_to_rle:
                raise RuntimeError("TODO: implement?")
            if self.to_cpu:
                out_masks = out_masks.cpu()
        else:
            # NOTE(review): `[[]] * n` shares one list object across all slots; it is
            # harmless here because every slot is reassigned by index below.
            out_masks = [[]] * len(pred_masks)

            assert keep is None or len(keep) == len(pred_masks)
            for i, mask in enumerate(pred_masks):
                h, w = target_sizes[i]
                if keep is not None:
                    mask = mask[keep[i]]
                # Uses the gpu version first, moves masks to cpu if it fails
                try:
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                except Exception as e:
                    # Fallback path for GPU OOM / interpolation failures.
                    logging.info("Issue found, reverting to CPU mode!")
                    mask_device = mask.device
                    mask = mask.cpu()
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                    interpolated = interpolated.to(mask_device)

                if self.convert_mask_to_rle:
                    out_masks[i] = robust_rle_encode(interpolated.squeeze(1))
                else:
                    out_masks[i] = interpolated
                if self.to_cpu:
                    out_masks[i] = out_masks[i].cpu()

        return out_masks

    def _process_boxes_and_labels(
        self, target_sizes, forced_labels, out_bbox, out_probs
    ):
        """Convert cxcywh boxes to absolute xyxy, pick (score, label) per query,
        and optionally filter by `detection_threshold`.

        Returns (boxes, scores, labels, keep); all None when there are no boxes.
        """
        if out_bbox is None:
            return None, None, None, None
        assert len(out_probs) == len(target_sizes)
        if self.to_cpu:
            out_probs = out_probs.cpu()
        scores, labels = out_probs.max(-1)
        if forced_labels is None:
            # Class-agnostic output: all detections get label 1.
            labels = torch.ones_like(labels)
        else:
            labels = forced_labels[:, None].expand_as(labels)

        # convert to [x0, y0, x1, y1] format
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)

        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        if self.to_cpu:
            boxes = boxes.cpu()

        keep = None
        if self.detection_threshold > 0:
            # Filter out the boxes with scores below the detection threshold
            keep = scores > self.detection_threshold
            assert len(keep) == len(boxes) == len(scores) == len(labels)

            boxes = [b[k.to(b.device)] for b, k in zip(boxes, keep)]
            scores = [s[k.to(s.device)] for s, k in zip(scores, keep)]
            labels = [l[k.to(l.device)] for l, k in zip(labels, keep)]

        return boxes, scores, labels, keep

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """Run `forward` on every find stage, merge detections per image id,
        and prune each image to `max_dets_per_img` by score.
        """
        if find_stages.loss_stages is not None:
            find_metadatas = [find_metadatas[i] for i in find_stages.loss_stages]
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            # When original sizes are not requested, a size of 1 keeps outputs in
            # relative [0, 1] coordinates.
            img_size_for_boxes = (
                meta.original_size
                if self.use_original_sizes_box
                else torch.ones_like(meta.original_size)
            )
            img_size_for_masks = (
                meta.original_size
                if self.use_original_sizes_mask
                else torch.ones_like(meta.original_size)
            )
            detection_results = self(
                outputs,
                img_size_for_boxes,
                img_size_for_masks,
                forced_labels=(
                    meta.original_category_id if self.use_original_ids else None
                ),
            )
            ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(detection_results) == len(ids)
            for img_id, result in zip(ids, detection_results):
                if img_id.item() not in results:
                    results[img_id.item()] = result
                else:
                    # Same image seen in multiple stages: concatenate fields.
                    assert set(results[img_id.item()].keys()) == set(result.keys())
                    for k in result.keys():
                        if isinstance(result[k], torch.Tensor):
                            results[img_id.item()][k] = torch.cat(
                                [results[img_id.item()][k], result[k]], dim=0
                            )
                        elif isinstance(result[k], list):
                            results[img_id.item()][k] += result[k]
                        else:
                            raise NotImplementedError(
                                f"Unexpected type {type(result[k])} in result."
                            )
        # Prune the results to the max number of detections per image.
        for img_id, result in results.items():
            if (
                self.max_dets_per_img > 0
                and len(result["scores"]) > self.max_dets_per_img
            ):
                _, topk_indexes = torch.topk(
                    result["scores"], self.max_dets_per_img, dim=0
                )
                if self.to_cpu:
                    topk_indexes = topk_indexes.cpu()
                for k in result.keys():
                    if isinstance(results[img_id][k], list):
                        results[img_id][k] = [
                            results[img_id][k][i] for i in topk_indexes.tolist()
                        ]
                    else:
                        results[img_id][k] = results[img_id][k].to(topk_indexes.device)[
                            topk_indexes
                        ]

        return results
|
||||
|
||||
|
||||
class PostProcessAPIVideo(PostProcessImage):
|
||||
"""This module converts the video model's output into the format expected by the YT-VIS api"""
|
||||
|
||||
    def __init__(
        self,
        *args,
        to_cpu: bool = True,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        prob_thresh: float = 0.5,
        use_presence: bool = False,
        **kwargs,
    ):
        """Configure the video postprocessor.

        `convert_mask_to_rle` and `to_cpu` are intercepted here (stored as
        `convert_mask_to_rle_for_video` / `to_cpu_for_video`) so that the video
        postprocessing path handles RLE conversion and CPU transfer itself.

        NOTE(review): the comment below states that `to_cpu=False` is forced on
        the base class, but `to_cpu` is not actually forwarded to
        `super().__init__()`, so the base keeps its own default (True) unless a
        `to_cpu` entry arrives via **kwargs — confirm this is intended.
        """
        super().__init__(
            *args,
            # Here we always set `convert_mask_to_rle=False` in the base `PostProcessAPI` class
            # (so that its `_process_masks` won't return a list of RLEs). If we want to return
            # RLEs for video masklets, we handle it in this `PostProcessAPIVideo` class instead.
            convert_mask_to_rle=False,
            # Here we always set `to_cpu=False` in the base `PostProcessAPI` class (so that
            # the interpolated masks won't be automatically moved back to CPU). We will handle
            # it in this `PostProcessAPIVideo` class instead.
            always_interpolate_masks_on_gpu=always_interpolate_masks_on_gpu,
            use_presence=use_presence,
            **kwargs,
        )
        # Expected keys in the output dict to postprocess
        self.EXPECTED_KEYS = [
            "pred_logits",
            "pred_boxes",
            "pred_masks",
        ]
        # Whether to post-process video masklets (under packed representation) into RLE format
        self.convert_mask_to_rle_for_video = convert_mask_to_rle
        self.to_cpu_for_video = to_cpu
        # Probability threshold used when binarising/selecting video predictions.
        self.prob_thresh = prob_thresh
|
||||
|
||||
    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """
        Tracking postprocessor for the SAM 3 video model.

        This function takes in the per-frame outputs of the SAM 3 video model and
        processes them to extract all the tracklet predictions for one video.

        Args:
            find_stages: A list of per-frame output dicts from the SAM 3 video model.
                Each must contain the keys in ``self.EXPECTED_KEYS`` plus
                "pred_object_ids" (the tracked object id per query, -1 if untracked).
            find_metadatas: A list of BatchedInferenceMetadata objects containing
                metadata about each frame.
            **kwargs: Additional keyword arguments (unused here).

        Returns:
            A dictionary of predictions with video_id as key. The value is an empty
            list when nothing was tracked, otherwise a dict of padded per-tracklet
            tensors (plus per-tracklet "scores"/"labels", "per_frame_scores", and
            optionally "masks_rle").
        """

        # Import tensordict here to avoid global dependency.
        try:
            from tensordict import TensorDict
        except ImportError as e:
            logging.error(
                "tensordict is not installed, please install by running `pip install tensordict --no-deps`"
            )
            raise e
        # Notes and assumptions:
        # 1- This postprocessor assumes results only for a single video.
        # 2- There are N stage outputs corresponding to N video frames
        # 3- Each stage outputs contains PxQ preds, where P is number of prompts and Q is number of object queries. The output should also contain the tracking object ids corresponding to each object query.
        # 4- The tracking object id has a default value of -1, indicating that the object query is not tracking any object in the frame, and hence its predictions can be ignored for a given frame.
        # 5- Some objects may be tracked in a subset of frames only. So, we first extract the predictions in a packed representation (for efficient postprocessing -- specially memory)
        # and then we convert the packed representation into a padded one, where we zero pad boxes/masks for objects that are not tracked in some frames.
        # 6- We refer to objects by an object id, which is a tuple (prompt_idx, obj_id)

        assert len(find_stages) > 0, "There is nothing to postprocess?"
        PROMPT_AXIS, OBJ_QUERY_AXIS = (0, 1)
        NO_OBJ_ID = -1
        # Maps object ID -> [indices in packed tensor]
        tracked_objects_packed_idx = defaultdict(list)
        # Maps object ID -> [indices in padded tensor (abs frame index)]
        tracked_objects_frame_idx = defaultdict(list)
        total_num_preds = 0
        # This will hold the packed representation of predictions.
        vid_preds_packed: List[TensorDict] = []
        vid_masklets_rle_packed: List[Optional[Dict]] = []
        video_id = -1  # We assume single video postprocessing, this ID should be unique in the datapoint.

        for frame_idx, (frame_outs, meta) in enumerate(
            zip(find_stages, find_metadatas)
        ):
            # only store keys we need to extract the results
            frame_outs_td = TensorDict(
                {k: frame_outs[k] for k in self.EXPECTED_KEYS}
            ).auto_batch_size_()  # Shape is [P,Q,...]
            meta_td = TensorDict(
                dataclasses.asdict(meta)
            ).auto_batch_size_()  # Shape is [P,...]
            unique_vid_id = meta.original_image_id.unique()
            assert unique_vid_id.size(0) == 1
            if video_id == -1:
                video_id = unique_vid_id.item()
            else:
                assert (
                    video_id == unique_vid_id.item()
                ), "We can only postprocess one video per datapoint"
            # keeping track of which objects appear in the current frame
            obj_ids_per_frame = frame_outs["pred_object_ids"]
            assert obj_ids_per_frame.size(-1) == frame_outs["pred_logits"].size(-2)
            if self.prob_thresh is not None:
                # only keep the predictions on this frame with probability above the threshold
                # (remove those predictions during the keep-alive period of a tracking query,
                # where its "pred_object_ids" is still the tracked object ID rather than -1)
                pred_probs = frame_outs["pred_logits"].sigmoid().squeeze(-1)
                obj_ids_per_frame = torch.where(
                    pred_probs >= self.prob_thresh, obj_ids_per_frame, NO_OBJ_ID
                )
            tracked_obj_ids_idx = torch.where(obj_ids_per_frame != NO_OBJ_ID)
            # Object id is a tuple of (prompt_idx, obj_id). This is because the model can assign same obj_id for two different prompts.
            tracked_obj_ids = [
                (p_id.item(), obj_ids_per_frame[p_id, q_id].item())
                for p_id, q_id in zip(
                    tracked_obj_ids_idx[PROMPT_AXIS],
                    tracked_obj_ids_idx[OBJ_QUERY_AXIS],
                )
            ]
            if len(tracked_obj_ids) == 0:
                continue
            # For each object, we keep track of the packed and padded (frame index) indices
            for oid in tracked_obj_ids:
                tracked_objects_packed_idx[oid].append(total_num_preds)
                tracked_objects_frame_idx[oid].append(frame_idx)
                total_num_preds += 1

            # Since we have P*Q masks per frame, mask interpolation is the GPU memory bottleneck or time bottleneck in case of cpu processing.
            # Instead, we first extract results only for tracked objects, reducing the number of masks to K = sum_i(tracked_objs_per_ith_prompt), hopefully <<< P*Q
            tracked_objs_outs_td = frame_outs_td[
                tracked_obj_ids_idx
            ]  # [P,Q,...] --> [K,...]
            # Align the (prompt-level) metadata with the K selected predictions.
            meta_td = meta_td[tracked_obj_ids_idx[PROMPT_AXIS].cpu()]
            if self.always_interpolate_masks_on_gpu:
                gpu_device = meta_td["original_size"].device
                assert gpu_device.type == "cuda"
                tracked_objs_outs_td = tracked_objs_outs_td.to(device=gpu_device)
            frame_results_td = self(
                tracked_objs_outs_td.unsqueeze(1),
                (
                    meta_td["original_size"]
                    if self.use_original_sizes
                    else torch.ones_like(meta_td["original_size"])
                ),
                forced_labels=(
                    meta_td["original_category_id"] if self.use_original_ids else None
                ),
                consistent=True,
                ret_tensordict=True,
            ).squeeze(1)
            # Free the per-frame selection early to keep peak (GPU) memory down.
            del tracked_objs_outs_td

            # Optionally, remove "masks" from output tensor dict and directly encode them
            # to RLE format under packed representations
            if self.convert_mask_to_rle_for_video:
                interpolated_binary_masks = frame_results_td.pop("masks")
                rle_list = rle_encode(interpolated_binary_masks, return_areas=True)
                vid_masklets_rle_packed.extend(rle_list)
            # Optionally, move output TensorDict to CPU (do this after RLE encoding step above)
            if self.to_cpu_for_video:
                frame_results_td = frame_results_td.cpu()
            vid_preds_packed.append(frame_results_td)

        if len(vid_preds_packed) == 0:
            logging.debug(f"Video {video_id} has no predictions")
            return {video_id: []}

        vid_preds_packed = torch.cat(vid_preds_packed, dim=0)
        ############### Construct a padded representation of the predictions ###############
        num_preds = len(tracked_objects_packed_idx)
        num_frames = len(find_stages)
        # We zero pad any missing prediction
        # NOTE: here, we also have padded tensors for "scores" and "labels", but we overwrite them later.
        padded_frames_results = TensorDict(
            {
                k: torch.zeros(
                    num_preds, num_frames, *v.shape[1:], device=v.device, dtype=v.dtype
                )
                for k, v in vid_preds_packed.items()
            },
            batch_size=[
                num_preds,
                num_frames,
            ],
        )
        padded_frames_results["scores"][...] = -1e8  # a very low score for empty object
        # Track scores and labels of each pred tracklet, only for frames where the model was able to track that object
        tracklet_scores = []
        tracklet_labels = []
        # Optionally, fill the list of RLEs for masklets
        # note: only frames with actual predicted masks (in packed format) will be
        # filled with RLEs; the rest will remains None in results["masks_rle"]
        if self.convert_mask_to_rle_for_video:
            vid_masklets_rle_padded = [[None] * num_frames for _ in range(num_preds)]
        for o_idx, oid in enumerate(tracked_objects_packed_idx):
            oid2packed_idx = tracked_objects_packed_idx[oid]
            oid2padded_idx = tracked_objects_frame_idx[oid]
            obj_packed_results = vid_preds_packed[oid2packed_idx]
            # Scatter this object's packed rows into its padded (per-frame) slots.
            padded_frames_results[o_idx][oid2padded_idx] = obj_packed_results
            if self.convert_mask_to_rle_for_video:
                for packed_idx, padded_idx in zip(oid2packed_idx, oid2padded_idx):
                    vid_masklets_rle_padded[o_idx][padded_idx] = (
                        vid_masklets_rle_packed[packed_idx]
                    )
            # NOTE: We need a single confidence score per tracklet for the mAP metric.
            # We use the average confidence score across time. (How does this impact AP?)
            tracklet_scores.append(obj_packed_results["scores"].mean())
            # We also need to have a unique category Id per tracklet.
            # This is not a problem for phrase AP, however, for mAP we do majority voting across time.
            tracklet_labels.append(obj_packed_results["labels"].mode()[0])

        results = padded_frames_results.to_dict()
        results["scores"] = torch.stack(tracklet_scores, dim=0)
        results["labels"] = torch.stack(tracklet_labels, dim=0)
        if self.convert_mask_to_rle_for_video:
            results["masks_rle"] = vid_masklets_rle_padded
        # we keep the frame-level scores since it's needed by some evaluation scripts
        results["per_frame_scores"] = padded_frames_results["scores"]

        return {video_id: results}
|
||||
|
||||
|
||||
class PostProcessTracking(PostProcessImage):
    """Converts the model's output into the format expected by the coco api."""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        force_single_mask: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(max_dets_per_img=max_dets_per_img, iou_type=iou_type, **kwargs)
        # When enabled, keep only the single highest-scoring mask per prediction.
        self.force_single_mask = force_single_mask

    def process_results(
        self, find_stages, find_metadatas: BatchedInferenceMetadata, **kwargs
    ):
        """Postprocess each stage and key results by (media_id, object_id, frame_index)."""
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for stage_outputs, stage_meta in zip(find_stages, find_metadatas):
            if self.force_single_mask:
                # Collapse the mask set to the best-scoring mask of every prediction.
                per_query_scores, _ = stage_outputs["pred_logits"].max(-1)
                best_masks = []
                for pred_idx in range(len(stage_outputs["pred_masks"])):
                    _, best_idx = per_query_scores[pred_idx].max(0)
                    best_masks.append(stage_outputs["pred_masks"][pred_idx][best_idx])
                stage_outputs["pred_masks"] = torch.stack(best_masks, 0).unsqueeze(1)
            detection_results = self(
                stage_outputs, stage_meta.original_size, consistent=False
            )
            assert len(detection_results) == len(stage_meta.coco_image_id)
            for media_id, object_id, frame_index, result in zip(
                stage_meta.original_image_id,
                stage_meta.object_id,
                stage_meta.frame_index,
                detection_results,
            ):
                key = (media_id.item(), object_id.item(), frame_index.item())
                results[key] = result
        return results
|
||||
|
||||
|
||||
class PostProcessCounting(nn.Module):
    """This module converts the model's output to be evaluated for counting tasks."""

    def __init__(
        self,
        use_original_ids: bool = False,
        threshold: float = 0.5,
        use_presence: bool = False,
    ) -> None:
        """
        Args:
            use_original_ids: whether to use the original image ids or the coco ids
            threshold: threshold for counting (values above this are counted)
            use_presence: whether to scale query scores by the decoder presence score
        """
        super().__init__()
        self.use_original_ids = use_original_ids
        self.threshold = threshold
        self.use_presence = use_presence

    def forward(self, outputs, target_sizes):
        """Perform the computation

        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
        """
        # Per-query probabilities, shape [B, N].
        scores = outputs["pred_logits"].sigmoid().squeeze(-1)
        if self.use_presence:
            # Scale every query score by the image-level presence probability.
            presence = outputs["presence_logit_dec"].sigmoid()
            if presence.ndim == 1:
                presence = presence.unsqueeze(1)  # [B, 1]
            scores = scores * presence  # [B, N]

        # The count for each image is the number of queries above the threshold.
        counts = (scores > self.threshold).float().sum(dim=1)

        assert len(counts) == len(target_sizes)
        return [{"count": c.item()} for c in counts]

    @torch.no_grad()
    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """Run counting postprocessing per stage and key results by image id."""
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for stage_outputs, stage_meta in zip(find_stages, find_metadatas):
            counting_results = self(stage_outputs, stage_meta.original_size)
            image_ids = (
                stage_meta.original_image_id
                if self.use_original_ids
                else stage_meta.coco_image_id
            )
            assert len(counting_results) == len(image_ids)
            for img_id, result in zip(image_ids, counting_results):
                results[img_id.item()] = result

        return results
|
||||
155
sam3/eval/saco_veval_eval.py
Normal file
155
sam3/eval/saco_veval_eval.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
from iopath.common.file_io import g_pathmgr
|
||||
from sam3.eval.saco_veval_evaluators import (
|
||||
VideoCGF1Evaluator,
|
||||
VideoPhraseApEvaluator,
|
||||
VideoPhraseHotaEvaluator,
|
||||
VideoTetaEvaluator,
|
||||
YTVISPredFileEvaluator,
|
||||
)
|
||||
|
||||
|
||||
class VEvalEvaluator:
    """Runs the full suite of video grounding evaluators on one prediction file."""

    def __init__(self, gt_annot_file: str, eval_res_file: str):
        self.gt_annot_file = gt_annot_file
        self.eval_res_file = eval_res_file
        # One evaluator per metric family, all scoring against the same GT file.
        self.evaluators = [
            # mAP
            YTVISPredFileEvaluator(gt_annot_file),
            # Phrase AP
            VideoPhraseApEvaluator(gt_annot_file),
            # TETA
            VideoTetaEvaluator(gt_annot_file, use_mask=True, is_exhaustive=True),
            # HOTA
            VideoPhraseHotaEvaluator(gt_annot_file),
            # cgF1
            VideoCGF1Evaluator(gt_annot_file),
        ]

    def run_eval(self, pred_file: str):
        """Evaluate `pred_file` with every evaluator; save and return merged metrics."""
        merged_dataset_metrics = {}
        per_pair_metrics = defaultdict(dict)
        for evaluator in self.evaluators:
            dataset_metrics, pair_metrics = evaluator.evaluate(pred_file)
            merged_dataset_metrics.update(dataset_metrics)
            for pair_key, metrics_for_pair in pair_metrics.items():
                per_pair_metrics[pair_key].update(metrics_for_pair)

        if not merged_dataset_metrics:
            merged_dataset_metrics = {"": 0.0}

        formatted_pair_metrics = []
        for (video_id, category_id), metrics_for_pair in per_pair_metrics.items():
            row = {"video_id": video_id, "category_id": category_id}
            row.update(metrics_for_pair)
            formatted_pair_metrics.append(row)

        eval_metrics = {
            "dataset_results": merged_dataset_metrics,
            "video_np_results": formatted_pair_metrics,
        }

        with g_pathmgr.open(self.eval_res_file, "w") as f:
            json.dump(eval_metrics, f)

        return eval_metrics
|
||||
|
||||
|
||||
def run_main_all(dataset_name, args):
    """Evaluate a single dataset using the directory layout given by `args`."""
    gt_annot_file = os.path.join(args.gt_annot_dir, f"{dataset_name}.json")
    pred_file = os.path.join(args.pred_dir, f"{dataset_name}_preds.json")
    eval_res_file = os.path.join(args.eval_res_dir, f"{dataset_name}_eval_res.json")
    print(f"=== Running evaluation for Pred {pred_file} vs GT {gt_annot_file} ===")
    evaluator = VEvalEvaluator(
        gt_annot_file=gt_annot_file, eval_res_file=eval_res_file
    )
    evaluator.run_eval(pred_file=pred_file)

    print(f"=== Results saved to {eval_res_file} ===")
|
||||
|
||||
|
||||
def main_all(args):
    """Run evaluation over every SA-Co VEval dataset, one after another."""
    saco_veval_dataset_names = [
        "saco_veval_sav_test",
        "saco_veval_sav_val",
        "saco_veval_yt1b_test",
        "saco_veval_yt1b_val",
        "saco_veval_smartglasses_test",
        "saco_veval_smartglasses_val",
    ]

    # The inner evaluators already use multiprocessing, so running datasets in
    # parallel here would not help; evaluate them sequentially instead.
    for dataset_name in saco_veval_dataset_names:
        print(f"=== Running evaluation for dataset {dataset_name} ===")
        run_main_all(dataset_name=dataset_name, args=args)
|
||||
|
||||
|
||||
def main_one(args):
    """Evaluate one prediction file against one GT annotation file."""
    gt_annot_file = args.gt_annot_file
    pred_file = args.pred_file
    eval_res_file = args.eval_res_file

    print(f"=== Running evaluation for Pred {pred_file} vs GT {gt_annot_file} ===")
    evaluator = VEvalEvaluator(
        gt_annot_file=gt_annot_file, eval_res_file=eval_res_file
    )
    evaluator.run_eval(pred_file=pred_file)

    print(f"=== Results saved to {eval_res_file} ===")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the video grounding evaluators."""
    parser = argparse.ArgumentParser(description="Run video grounding evaluators")

    # One subcommand per evaluation mode.
    subparsers = parser.add_subparsers(dest="command", required=True)

    # "all": evaluate every dataset under a shared directory layout.
    all_parser = subparsers.add_parser("all", help="Run evaluation for all datasets")
    for flag, help_text in [
        ("--gt_annot_dir", "Directory that contains the ground truth annotation files"),
        ("--pred_dir", "Directory that contains the prediction files"),
        ("--eval_res_dir", "Directory that contains the eval results files"),
    ]:
        all_parser.add_argument(flag, type=str, help=help_text)
    all_parser.set_defaults(func=main_all)

    # "one": evaluate a single prediction file against a single GT file.
    one_parser = subparsers.add_parser("one", help="Run evaluation for one dataset")
    for flag, help_text in [
        ("--gt_annot_file", "Path to the ground truth annotation file"),
        ("--pred_file", "Path to the prediction file"),
        ("--eval_res_file", "Path to the eval results file"),
    ]:
        one_parser.add_argument(flag, type=str, help=help_text)
    one_parser.set_defaults(func=main_one)

    # Parse and dispatch to the handler chosen by the subcommand.
    args = parser.parse_args()
    args.func(args)
|
||||
|
||||
|
||||
# Script entry point: `python saco_veval_eval.py {all,one} ...`
if __name__ == "__main__":
    main()
|
||||
838
sam3/eval/saco_veval_evaluators.py
Normal file
838
sam3/eval/saco_veval_evaluators.py
Normal file
@@ -0,0 +1,838 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Optional, Sequence, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pycocotools.mask
|
||||
from sam3.eval.cgf1_eval import CGF1_METRICS
|
||||
from sam3.eval.conversion_util import (
|
||||
convert_ytbvis_to_cocovid_gt,
|
||||
convert_ytbvis_to_cocovid_pred,
|
||||
)
|
||||
from sam3.eval.hota_eval_toolkit.run_ytvis_eval import run_ytvis_eval
|
||||
from sam3.eval.teta_eval_toolkit import config, Evaluator, metrics
|
||||
from sam3.eval.teta_eval_toolkit.datasets import COCO, TAO
|
||||
from sam3.eval.ytvis_coco_wrapper import YTVIS
|
||||
from sam3.eval.ytvis_eval import VideoDemoF1Eval, YTVISeval
|
||||
from sam3.train.nms_helper import process_frame_level_nms, process_track_level_nms
|
||||
|
||||
|
||||
def _get_metric_index(metric_name: str, iou_threshold: Optional[float] = None) -> int:
    """
    Find the index of a metric in CGF1_METRICS by name and IoU threshold.

    Args:
        metric_name: Name of the metric (e.g., "cgF1", "precision", "recall")
        iou_threshold: IoU threshold (None for average over 0.5:0.95, or specific value like 0.5, 0.75)

    Returns:
        Index of the metric in CGF1_METRICS

    Raises:
        ValueError: If metric not found
    """
    matches = (
        idx
        for idx, metric in enumerate(CGF1_METRICS)
        if metric.name == metric_name and metric.iou_threshold == iou_threshold
    )
    found_idx = next(matches, None)
    if found_idx is None:
        raise ValueError(
            f"Metric '{metric_name}' with IoU threshold {iou_threshold} not found in CGF1_METRICS"
        )
    return found_idx
|
||||
|
||||
|
||||
class BasePredFileEvaluator:
    """Base class for evaluators that score a prediction file against GT annotations."""
|
||||
|
||||
|
||||
class YTVISPredFileEvaluator(BasePredFileEvaluator):
    """Evaluate class mAP for YT-VIS prediction files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS format ground-truth annotation file
            dataset_name: prefix used in the result metric keys
            iou_types: subset of ["bbox", "segm"] to evaluate (defaults to both)
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    # Fix: the method returns a (dataset results, video-NP results) tuple, so the
    # return annotation is Tuple[...] rather than the previous Dict[str, float].
    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run YT-VIS class mAP evaluation on `pred_file`.

        Returns:
            A tuple of (dataset-level results, video-NP level results). The second
            element is always empty: per-(video, NP) results are not supported by
            this evaluator yet.
        """
        # use our internal video evaluation toolkit for YT-VIS pred file
        # (i.e. the same one we're using for video phrase AP)
        results = {}
        use_cats = True  # YT-VIS mAP evaluation uses categories
        ytvisGT = YTVIS(self.gt_ann_file, ignore_gt_cats=not use_cats)
        # the original YT-VIS GT annotations have uncompressed RLEs ("counts" is an integer list)
        # rather than compressed RLEs ("counts" is a string), so we first convert them here.
        if "segm" in self.iou_types:
            for ann in ytvisGT.dataset["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]

        with open(pred_file) as f:
            dt = json.load(f)
        # Our prediction file saves "video_id" and absolute (unnormalized) boxes.
        # Note that we should use the official (original) YT-VIS annotations (i.e. the one
        # saved via "scripts/datasets/training/ytvis_split.py", instead of the one saved
        # via "scripts/api_db_to_ytvis_json.py") in this evaluator, which contain absolute
        # boxes coordinates in its GT annotations.
        for d in dt:
            d["image_id"] = d["video_id"]
        ytvisDT = ytvisGT.loadRes(dt)

        for iou_type in self.iou_types:
            ytvisEval = YTVISeval(ytvisGT, ytvisDT, iou_type)

            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            ytvisEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            ytvisEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            ytvisEval.params.useCats = use_cats

            ytvisEval.evaluate()
            ytvisEval.accumulate()
            ytvisEval.summarize()
            result_key = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_mAP_50_95"
            results[result_key] = ytvisEval.stats[0]

        # video-NP level results not supported for `YTVISPredFileEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
|
||||
|
||||
|
||||
class VideoPhraseApEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase AP with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS format ground-truth annotation file
            dataset_name: prefix used in the result metric keys
            iou_types: subset of ["bbox", "segm"] to evaluate (defaults to both)
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    # Fix: the method returns a (dataset results, video-NP results) tuple, so the
    # return annotation is Tuple[...] rather than the previous Dict[str, float].
    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run phrase AP (category-agnostic) evaluation on `pred_file`.

        Returns:
            A tuple of (dataset-level results, video-NP level results). The second
            element is always empty: per-(video, NP) results are not supported by
            this evaluator yet.
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        if "segm" in self.iou_types:
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        for d in dt:
            d["image_id"] = d["video_id"]

        results = {}
        use_cats = False  # Phrase AP evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)

        for iou_type in self.iou_types:
            phraseApEval = YTVISeval(ytvisGT, ytvisDT, iou_type)

            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            phraseApEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            phraseApEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            phraseApEval.params.useCats = use_cats

            phraseApEval.evaluate()
            phraseApEval.accumulate()
            phraseApEval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_phrase_ap"
            # fetch Phrase AP results from the corresponding indices in `phraseApEval.stats`
            # (see `_summarizeDets` in https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py)
            results[result_prefix + "_50_95"] = phraseApEval.stats[0]  # IoU=0.5:0.95
            results[result_prefix + "_50"] = phraseApEval.stats[1]  # IoU=0.5
            results[result_prefix + "_75"] = phraseApEval.stats[2]  # IoU=0.75

        # video-NP level results not supported for `VideoPhraseApEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
|
||||
|
||||
|
||||
class VideoCGF1Evaluator(BasePredFileEvaluator):
    """Evaluate Video Demo F1 with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS format ground-truth annotation file
            dataset_name: prefix used in the result metric keys
            prob_thresh: score threshold applied to predictions before F1 matching
            iou_types: subset of ["bbox", "segm"] to evaluate (defaults to both)
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    # Fix: the method returns a (dataset results, video-NP results) tuple, so the
    # return annotation is Tuple[...] rather than the previous Dict[str, float].
    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run cgF1 / demo F1 evaluation on `pred_file`.

        Returns:
            A tuple of (dataset-level results, per-(video, NP) results).
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # IL_MCC and CG-F1 can only be computed if we have "video_np_pairs" keys in the GT JSON
        compute_ilmcc_and_cgf1 = "video_np_pairs" in gt
        if not compute_ilmcc_and_cgf1:
            print(
                f"Warning: IL_MCC and CG-F1 are not computed for {pred_file=} as it does not have 'video_np_pairs' keys in the GT JSON"
            )
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(
            gt, dt, add_negative_np_pairs=compute_ilmcc_and_cgf1
        )
        if "segm" in self.iou_types:
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        for d in dt:
            d["image_id"] = d["video_id"]

        results = {}
        use_cats = False  # Demo F1 evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)

        video_np_level_results = {}
        for iou_type in self.iou_types:
            demoF1Eval = VideoDemoF1Eval(ytvisGT, ytvisDT, iou_type, self.prob_thresh)

            demoF1Eval.params.useCats = use_cats
            demoF1Eval.params.areaRng = [[0**2, 1e5**2]]
            demoF1Eval.params.areaRngLbl = ["all"]
            demoF1Eval.params.maxDets = [100000]

            demoF1Eval.evaluate()
            demoF1Eval.accumulate()
            demoF1Eval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_demo"

            stats = demoF1Eval.stats

            if compute_ilmcc_and_cgf1:
                # Average IoU threshold (0.5:0.95)
                cgf1_micro_avg_idx = _get_metric_index("cgF1", None)
                positive_micro_f1_avg_idx = _get_metric_index("positive_micro_F1", None)
                ilmcc_avg_idx = _get_metric_index("IL_MCC", None)
                results[result_prefix + "_cgf1_micro_50_95"] = stats[cgf1_micro_avg_idx]
                results[result_prefix + "_ilmcc_50_95"] = stats[ilmcc_avg_idx]
                results[result_prefix + "_positive_micro_f1_50_95"] = stats[
                    positive_micro_f1_avg_idx
                ]

                # IoU = 0.5
                # NOTE: at fixed IoU, IL_MCC is recovered as cgF1 / positive_micro_F1
                # (cgF1 = IL_MCC * positive_micro_F1 by construction).
                cgf1_micro_50_idx = _get_metric_index("cgF1", 0.5)
                positive_micro_f1_50_idx = _get_metric_index("positive_micro_F1", 0.5)
                results[result_prefix + "_cgf1_micro_50"] = stats[cgf1_micro_50_idx]
                results[result_prefix + "_ilmcc_50"] = float(
                    np.array(stats[cgf1_micro_50_idx])
                    / np.array(stats[positive_micro_f1_50_idx])
                )
                results[result_prefix + "_positive_micro_f1_50"] = stats[
                    positive_micro_f1_50_idx
                ]

                # IoU = 0.75
                cgf1_micro_75_idx = _get_metric_index("cgF1", 0.75)
                positive_micro_f1_75_idx = _get_metric_index("positive_micro_F1", 0.75)
                results[result_prefix + "_cgf1_micro_75"] = stats[cgf1_micro_75_idx]
                results[result_prefix + "_ilmcc_75"] = float(
                    np.array(stats[cgf1_micro_75_idx])
                    / np.array(stats[positive_micro_f1_75_idx])
                )
                results[result_prefix + "_positive_micro_f1_75"] = stats[
                    positive_micro_f1_75_idx
                ]

            self.extract_video_np_level_results(demoF1Eval, video_np_level_results)

        return results, video_np_level_results

    def extract_video_np_level_results(self, demoF1Eval, video_np_level_results):
        """Aggregate statistics for video-level metrics.

        Fills `video_np_level_results` in place, keyed by (video_id, category_id)
        of the original (pre-remapping) pairs.
        """
        num_iou_thrs = len(demoF1Eval.params.iouThrs)
        # Fix: index the size-1 result of np.where before converting to int;
        # int() on an ndim>0 array is deprecated since NumPy 1.25.
        iou_50_index = int(np.where(demoF1Eval.params.iouThrs == 0.5)[0][0])
        iou_75_index = int(np.where(demoF1Eval.params.iouThrs == 0.75)[0][0])

        result_prefix = "mask" if demoF1Eval.params.iouType == "segm" else "bbox"

        assert len(demoF1Eval.evalImgs) == len(demoF1Eval.cocoGt.dataset["images"])
        for i, video in enumerate(demoF1Eval.cocoGt.dataset["images"]):
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            eval_img_dict = demoF1Eval.evalImgs[i]

            TPs = eval_img_dict.get("TPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FPs = eval_img_dict.get("FPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FNs = eval_img_dict.get("FNs", np.zeros(num_iou_thrs, dtype=np.int64))
            assert len(TPs) == len(FPs) == len(FNs) == num_iou_thrs
            # F1 = 2*TP / (2*TP + FP + FN), and we set F1 to 1.0 if denominator is 0
            denominator = 2 * TPs + FPs + FNs
            F1s = np.where(denominator > 0, 2 * TPs / np.maximum(denominator, 1), 1.0)
            local_results = {
                f"{result_prefix}_TP_50_95": float(TPs.mean()),
                f"{result_prefix}_FP_50_95": float(FPs.mean()),
                f"{result_prefix}_FN_50_95": float(FNs.mean()),
                f"{result_prefix}_F1_50_95": float(F1s.mean()),
                f"{result_prefix}_TP_50": float(TPs[iou_50_index]),
                f"{result_prefix}_FP_50": float(FPs[iou_50_index]),
                f"{result_prefix}_FN_50": float(FNs[iou_50_index]),
                f"{result_prefix}_F1_50": float(F1s[iou_50_index]),
                f"{result_prefix}_TP_75": float(TPs[iou_75_index]),
                f"{result_prefix}_FP_75": float(FPs[iou_75_index]),
                f"{result_prefix}_FN_75": float(FNs[iou_75_index]),
                f"{result_prefix}_F1_75": float(F1s[iou_75_index]),
            }
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
|
||||
|
||||
|
||||
class VideoTetaEvaluator(BasePredFileEvaluator):
    """Evaluate TETA metric using YouTubeVIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        tracker_name: str = "Sam3",
        nms_threshold: float = 0.5,
        nms_strategy: str = "none",  # "track", "frame", or "none"
        prob_thresh: float = 0.5,
        is_exhaustive: bool = False,
        use_mask: bool = False,
        num_parallel_cores: int = 8,
    ):
        """
        Args:
            gt_ann_file: Path to the YT-VIS format ground-truth JSON file.
            dataset_name: Name used as a prefix for the returned metric keys.
            tracker_name: Tracker sub-folder name expected by the TETA toolkit.
            nms_threshold: IoU threshold for the optional NMS preprocessing.
            nms_strategy: One of "track", "frame", or "none" (case-insensitive).
            prob_thresh: Predictions with score below this are dropped.
            is_exhaustive: If True, treat GT as exhaustively annotated
                (COCO-style dataset wrapper); otherwise federated (TAO-style).
            use_mask: If True, evaluate masks instead of boxes.
            num_parallel_cores: Worker count handed to the TETA evaluator.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.tracker_name = tracker_name
        self.nms_threshold = nms_threshold
        self.nms_strategy = nms_strategy.lower()  # Convert to lowercase for consistency
        self.prob_thresh = prob_thresh
        self.metric_prefix = "TETA"
        self.is_exhaustive = is_exhaustive
        self.use_mask = use_mask
        self.num_parallel_cores = num_parallel_cores

        # Verify NMS strategy is valid
        valid_strategies = ["track", "frame", "none"]
        print("current nms_strategy:", self.nms_strategy)
        if self.nms_strategy not in valid_strategies:
            raise ValueError(
                f"Invalid NMS strategy: {self.nms_strategy}. Must be one of {valid_strategies}"
            )

        print(f"Initialized VideoTetaEvaluator with NMS strategy: {self.nms_strategy}")
        print(f"Probability threshold set to: {self.prob_thresh}")
        print(f"Dataset exhaustivity set to: {self.is_exhaustive}")
        print(f"Tracker name set to: {self.tracker_name}")
        print(f"Dataset name set to: {self.dataset_name}")
        print(f"Use mask set to: {self.use_mask}")

    def process_predictions(self, pred_file: str, tmp_dir: str) -> str:
        """Process predictions with selected NMS strategy.

        Loads raw YT-VIS predictions from `pred_file`, drops tracks below the
        score threshold, optionally applies track- or frame-level NMS per
        video, and writes the result into `tmp_dir`.

        Returns:
            Path of the processed prediction JSON file.
        """
        with open(pred_file, "r") as f:
            raw_preds = json.load(f)
        print(f"Processing predictions with {self.nms_strategy} NMS strategy")

        # Filter by score threshold
        if self.prob_thresh > 0:
            raw_preds = [d for d in raw_preds if d["score"] >= self.prob_thresh]
            print(
                f"Filtered to {len(raw_preds)} predictions with score >= {self.prob_thresh}"
            )
        # Group predictions by video_id
        video_groups = defaultdict(list)
        for pred in raw_preds:
            video_groups[pred["video_id"]].append(pred)
        # Process based on NMS strategy; the NMS helpers mutate `video_groups`
        # in place.
        if self.nms_strategy == "track":
            process_track_level_nms(video_groups, nms_threshold=self.nms_threshold)
        elif self.nms_strategy == "frame":
            process_frame_level_nms(video_groups, nms_threshold=self.nms_threshold)
        elif self.nms_strategy == "none":
            print("Skipping NMS processing as strategy is set to 'none'")
            # No processing needed for "none" strategy
        # Save processed predictions (flatten the per-video grouping back to a
        # single list of tracks)
        processed_preds = [
            track for tracks in video_groups.values() for track in tracks
        ]
        processed_path = os.path.join(tmp_dir, "processed_preds.json")
        with open(processed_path, "w") as f:
            json.dump(processed_preds, f)

        print(f"Saved processed predictions to {processed_path}")
        return processed_path

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Main evaluation method.

        Converts GT and (NMS-processed) predictions to COCO-vid format in a
        temporary directory, runs the TETA toolkit, and flattens its output.

        Returns:
            Tuple of (aggregate TETA metrics keyed by
            "<dataset>_<bbox|mask>_<submetric>", per-(video, NP) results —
            always empty for this evaluator).
        """

        print(f"Evaluating TETA Metric with {self.nms_strategy.upper()} NMS strategy")
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Process predictions first
            processed_pred_file = self.process_predictions(pred_file, tmp_dir)

            # Convert GT to COCO-vid format
            gt_dir = os.path.join(tmp_dir, "gt")
            os.makedirs(gt_dir, exist_ok=True)
            gt_coco_path = os.path.join(gt_dir, "annotations.json")
            convert_ytbvis_to_cocovid_gt(self.gt_ann_file, gt_coco_path)

            # Convert processed predictions to COCO-vid format
            pred_dir = os.path.join(tmp_dir, "predictions")
            tracker_dir = os.path.join(pred_dir, self.tracker_name)
            os.makedirs(tracker_dir, exist_ok=True)
            pred_coco_path = os.path.join(tracker_dir, "track_results_cocofmt.json")
            convert_ytbvis_to_cocovid_pred(
                youtubevis_pred_path=processed_pred_file,
                converted_dataset_path=gt_coco_path,
                output_path=pred_coco_path,
            )
            # Configure TETA evaluator
            default_eval_config = config.get_default_eval_config()
            default_eval_config["PRINT_ONLY_COMBINED"] = True
            default_eval_config["DISPLAY_LESS_PROGRESS"] = True
            # NOTE(review): the toolkit default config spells this key
            # "OUTPUT_TEM_RAW_DATA" (no second "P"); confirm which spelling the
            # TETA Evaluator actually reads -- this assignment may be a no-op.
            default_eval_config["OUTPUT_TEMP_RAW_DATA"] = True
            default_eval_config["NUM_PARALLEL_CORES"] = self.num_parallel_cores
            default_dataset_config = config.get_default_dataset_config()
            default_dataset_config["TRACKERS_TO_EVAL"] = [self.tracker_name]
            default_dataset_config["GT_FOLDER"] = gt_dir
            default_dataset_config["OUTPUT_FOLDER"] = pred_dir
            default_dataset_config["TRACKER_SUB_FOLDER"] = tracker_dir
            default_dataset_config["USE_MASK"] = self.use_mask

            evaluator = Evaluator(default_eval_config)
            # COCO/TAO here are presumably the TETA-toolkit dataset wrappers
            # (exhaustive vs. federated annotations), not pycocotools.COCO --
            # confirm against this module's imports.
            if self.is_exhaustive:
                dataset_list = [COCO(default_dataset_config)]
                dataset_parsing_key = "COCO"
            else:
                dataset_list = [TAO(default_dataset_config)]
                dataset_parsing_key = "TAO"

            # Run evaluation
            eval_results, _ = evaluator.evaluate(
                dataset_list, [metrics.TETA(exhaustive=self.is_exhaustive)]
            )

            # Extract and format results; indices 0..9 of the TETA row are
            # flattened under the key names used below (teta, loc_a, assoc_a,
            # cls_a, loc_re, loc_pr, assoc_re, assoc_pr, cls_re, cls_pr).
            results = {
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_teta": float(
                    eval_results[dataset_parsing_key]["TETA"][0]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_a": float(
                    eval_results[dataset_parsing_key]["TETA"][1]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_a": float(
                    eval_results[dataset_parsing_key]["TETA"][2]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_a": float(
                    eval_results[dataset_parsing_key]["TETA"][3]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_re": float(
                    eval_results[dataset_parsing_key]["TETA"][4]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_loc_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][5]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_re": float(
                    eval_results[dataset_parsing_key]["TETA"][6]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_assoc_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][7]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_re": float(
                    eval_results[dataset_parsing_key]["TETA"][8]
                ),
                f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}_cls_pr": float(
                    eval_results[dataset_parsing_key]["TETA"][9]
                ),
            }

            # video-NP level results not supported for `VideoTetaEvaluator` yet
            video_np_level_results = {}
            return results, video_np_level_results
|
||||
|
||||
|
||||
class VideoPhraseHotaEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase HOTA with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
        compute_video_mot_hota: bool = False,
    ):
        """
        Args:
            gt_ann_file: Path to the YT-VIS format ground-truth JSON file.
            dataset_name: Name used as a prefix for the returned metric keys.
            prob_thresh: Predictions with score <= this threshold are dropped.
                NOTE(review): filtering here is strict (">"), while
                `VideoTetaEvaluator.process_predictions` uses ">=" -- confirm
                the asymmetry is intended.
            iou_types: Subset of ["bbox", "segm"] to evaluate; defaults to both.
            compute_video_mot_hota: If True, compute class-agnostic video-level
                MOT HOTA (predictions/GT of all categories merged) instead of
                per-phrase HOTA.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.metric_prefix = "phrase"
        # the list of metrics to collect from the HOTA evaluation results
        self.metric_to_collect = [
            "HOTA",
            "DetA",
            "AssA",
            "DetRe",
            "DetPr",
            "AssRe",
            "AssPr",
            "LocA",
            "OWTA",
        ]
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

        # If True, compute video MOT HOTA, aggregating predictions/GT from all categories.
        self.compute_video_mot_hota = compute_video_mot_hota

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run HOTA evaluation on `pred_file`.

        Returns:
            Tuple of (aggregate metrics keyed by
            "<dataset>_<bbox|mask>_<all|challenging>_<metric_prefix>_<metric>",
            per-(video_id, category_id) metric dicts).
        """
        # use the YT-VIS evaluation toolkit in TrackEval

        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # keep only predictions with score above the probability threshold
        dt = [d for d in dt if d["score"] > self.prob_thresh]
        for d in dt:
            # per-frame lists must be aligned across boxes/masks/areas
            assert len(d["areas"]) == len(d["bboxes"])
            assert len(d["areas"]) == len(d["segmentations"])
            # remove empty boxes (otherwise they will count as false positives for during
            # per-frame detection accuracy in HOTA evaluation)
            for t in range(len(d["bboxes"])):
                bbox = d["bboxes"][t]
                if d["areas"][t] == 0 or bbox is None or all(x == 0 for x in bbox):
                    d["segmentations"][t] = None
                    d["bboxes"][t] = None
                    d["areas"][t] = None
            # check that box occurence and mask occurence are consistent
            for bbox, mask, area in zip(d["bboxes"], d["segmentations"], d["areas"]):
                assert (area is None) == (bbox is None)
                assert (area is None) == (mask is None)
            # set all scores to 1.0 for HOTA evaluation (just like Demo F1, the exact score
            # value is not used in HOTA metrics; it will be treated as a detection prediction
            # as long as its score is above the threshold)
            d["score"] = 1.0

        # remap the GT and DT annotations for phrase HOTA evaluation
        gt = _fill_in_ann_height_width(gt)
        if not self.compute_video_mot_hota:
            # remap the GT and DT annotations for phrase HOTA evaluation
            gt, dt = self._remap_gt_dt(gt, dt)
        else:
            # Compute video-level MOT HOTA
            # Apply track-level NMS
            video_groups = defaultdict(list)
            for pred in dt:
                video_groups[pred["video_id"]].append(pred)
            process_track_level_nms(video_groups, nms_threshold=0.5)
            dt = [track for tracks in video_groups.values() for track in tracks]

            # Remap GT track ids for class-agnostic HOTA
            gt, dt = remap_gt_dt_class_agnostic(gt, dt)

        # run the HOTA evaluation using TrackEval on the remapped (video_id, category_id) pairs
        out_dict = {}
        video_np_level_results = {}
        for iou_type in self.iou_types:
            output_res, _ = run_ytvis_eval(
                args=[
                    "--METRICS",
                    "HOTA",
                    "--IOU_TYPE",
                    iou_type,
                    "--DATASET_NAME",
                    self.dataset_name,
                    "--USE_PARALLEL",
                    "True",
                    "--NUM_PARALLEL_CORES",
                    "8",
                    "--PLOT_CURVES",
                    "False",
                    "--LOG_ON_ERROR",
                    "None",
                    "--PRINT_ONLY_COMBINED",
                    "True",
                    "--OUTPUT_SUMMARY",
                    "False",
                    "--OUTPUT_DETAILED",
                    "False",
                    "--TIME_PROGRESS",
                    "False",
                    "--PRINT_CONFIG",
                    "False",
                ],
                gt_json=gt,
                dt_json=dt,
            )
            # collect per-(video, NP) numbers before aggregating
            self.extract_video_np_level_results(
                iou_type=iou_type,
                remapped_gt=gt,
                raw_results=output_res[self.dataset_name]["tracker"],
                video_np_level_results=video_np_level_results,
            )

            def _summarize_results(output_res, iou_type, field, suffix):
                # Average each collected HOTA sub-metric over its thresholds
                # and store it into `out_dict` under a flat result key.
                eval_res = output_res[self.dataset_name]["tracker"][field]
                result_prefix = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_{suffix}"
                for metric_name in self.metric_to_collect:
                    eval_res_hota = eval_res["cls_comb_cls_av"]["HOTA"]
                    result_key = f"{result_prefix}_{self.metric_prefix}_{metric_name}"
                    result_value = float(np.mean(eval_res_hota[metric_name]))
                    out_dict[result_key] = result_value

            _summarize_results(output_res, iou_type, "COMBINED_SEQ", "all")
            if "COMBINED_SEQ_CHALLENGING" in output_res[self.dataset_name]["tracker"]:
                _summarize_results(
                    output_res, iou_type, "COMBINED_SEQ_CHALLENGING", "challenging"
                )

        # `video_np_level_results` was populated by extract_video_np_level_results
        # above (subclasses may override it to a no-op)
        return out_dict, video_np_level_results

    def _remap_gt_dt(self, gt, dt):
        """Remap GT/DT so each (video, NP) pair becomes its own pseudo-video
        with a single shared category, as required for phrase HOTA."""
        # For phrase HOTA evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        # We further map all the categories to category_id=1 in HOTA evaluation toolkit
        # for phrase HOTA (similar to "useCat=False" for video phrase AP)
        remapped_category_id = 1
        gt["categories"] = [
            {
                "supercategory": "object",
                "id": remapped_category_id,
                "name": "_REMAPPED_FOR_PHRASE_METRICS_",
            }
        ]
        for ann in gt["annotations"]:
            ann["category_id"] = remapped_category_id
        for d in dt:
            d["category_id"] = remapped_category_id
        # To be compatible with the TrackEval YT-VIS evaluation toolkit, we need to give
        # unique filenames to each remapped video, so we add remapped video_id as prefix.
        for video in gt["videos"]:
            new_video_id = video["id"]
            video["file_names"] = [
                f"remapped_vid_{new_video_id:012d}/{name}"
                for name in video["file_names"]
            ]
        return gt, dt

    def extract_video_np_level_results(
        self, iou_type, remapped_gt, raw_results, video_np_level_results
    ):
        """Aggregate statistics for video-level metrics.

        For every remapped video (one per (video, NP) pair), reads the HOTA
        sub-metrics from `raw_results` and accumulates them into
        `video_np_level_results`, keyed by the original
        (video_id, category_id). Mutates `video_np_level_results` in place.
        """
        result_prefix = "mask" if iou_type == "segm" else "bbox"
        for video in remapped_gt["videos"]:
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            video_key = f"remapped_vid_{video['id']:012d}"
            results = raw_results[video_key]["_REMAPPED_FOR_PHRASE_METRICS_"]["HOTA"]

            local_results = {}
            for metric_name in self.metric_to_collect:
                result_key = f"{result_prefix}_{metric_name}"
                # mean over the HOTA alpha (IoU) thresholds
                local_results[result_key] = float(results[metric_name].mean())
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
|
||||
|
||||
|
||||
class VideoClassBasedHotaEvaluator(VideoPhraseHotaEvaluator):
    """Class-based (per-category) HOTA evaluation on YT-VIS style files.

    Reuses the phrase-HOTA machinery from the parent class but keeps the
    original category ids, so metrics are reported under the "class" prefix
    rather than "phrase".
    """

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
    ):
        super().__init__(
            gt_ann_file=gt_ann_file,
            dataset_name=dataset_name,
            prob_thresh=prob_thresh,
        )
        self.metric_prefix = "class"

    def _remap_gt_dt(self, gt, dt):
        """Identity remapping: class-based HOTA keeps original categories."""
        return gt, dt

    def extract_video_np_level_results(self, *args, **kwargs):
        """No-op: video-NP level results are not defined for class-based HOTA."""
        return None
|
||||
|
||||
|
||||
def _compress_rle(rle):
|
||||
"""Convert RLEs from uncompressed (integer list) to compressed (string) format."""
|
||||
if rle is None:
|
||||
return None
|
||||
if isinstance(rle["counts"], list):
|
||||
rle = pycocotools.mask.frPyObjects(rle, rle["size"][0], rle["size"][1])
|
||||
rle["counts"] = rle["counts"].decode()
|
||||
return rle
|
||||
|
||||
|
||||
def remap_video_category_pairs_to_unique_video_ids(
    gt_json, dt_json, add_negative_np_pairs=False
):
    """
    Give every (video_id, category_id) pair its own unique video id.

    Phrase AP / demo F1 evaluation on videos runs with `useCat=False`, so
    different NPs of the same video must be separated into distinct video ids
    to avoid mixing detections from different categories in computeIoU. This
    mirrors the image-side scheme, where each image-NP pair gets a remapped
    unique coco_image_id (see CustomCocoDetectionAPI.load_queries in
    modulated_detection_api.py).

    Both `gt_json` and `dt_json` are mutated in place and returned. When
    `add_negative_np_pairs` is True, pairs listed in
    gt_json["video_np_pairs"] that have no predictions/annotations are also
    assigned ids (needed for IL_MCC and CG-F1 evaluation).
    """
    videos_by_id = {video["id"]: video for video in gt_json["videos"]}

    # Gather every (video_id, category_id) pair seen in detections or GT.
    observed_pairs = {(pred["video_id"], pred["category_id"]) for pred in dt_json}
    observed_pairs |= {
        (ann["video_id"], ann["category_id"]) for ann in gt_json["annotations"]
    }

    # Deterministically assign 1-based new video ids to the sorted pairs.
    pair_to_new_id = {
        pair: new_id for new_id, pair in enumerate(sorted(observed_pairs), start=1)
    }

    # Optionally register the negative NP pairs as well.
    if add_negative_np_pairs:
        for vnp in gt_json["video_np_pairs"]:
            key = (vnp["video_id"], vnp["category_id"])
            if key not in pair_to_new_id:
                pair_to_new_id[key] = len(pair_to_new_id) + 1

    # Rewrite "video_id" on predictions and GT annotations in place.
    for pred in dt_json:
        pred["video_id"] = pair_to_new_id[(pred["video_id"], pred["category_id"])]
    for ann in gt_json["annotations"]:
        ann["video_id"] = pair_to_new_id[(ann["video_id"], ann["category_id"])]

    # Duplicate each video entry once per (video, category) pair, keeping the
    # original ids so sample-level eval metrics can be traced back to the
    # original video-NP pair.
    remapped_videos = []
    for (orig_video_id, orig_category_id), new_video_id in pair_to_new_id.items():
        entry = videos_by_id[orig_video_id].copy()
        entry["id"] = new_video_id
        entry["orig_video_id"] = orig_video_id
        entry["orig_category_id"] = orig_category_id
        remapped_videos.append(entry)
    gt_json["videos"] = remapped_videos

    return gt_json, dt_json
|
||||
|
||||
|
||||
def remap_gt_dt_class_agnostic(gt, dt):
    """
    For class-agnostic HOTA, merge all GT tracks for each video (across NPs),
    ensure unique track_ids, and set all category_id to 1.
    Also, add orig_video_id and orig_category_id for compatibility.

    Both `gt` and `dt` are mutated in place and returned.
    """
    # 1. Remap all GT track_ids to be unique per video
    gt_anns_by_video = defaultdict(list)
    for ann in gt["annotations"]:
        gt_anns_by_video[ann["video_id"]].append(ann)

    # Ensure unique track ids across tracks of all videos
    next_tid = 1
    for _, anns in gt_anns_by_video.items():
        # Map old track_ids to new unique ones
        old_to_new_tid = {}
        for ann in anns:
            old_tid = ann["id"]
            if old_tid not in old_to_new_tid:
                old_to_new_tid[old_tid] = next_tid
                next_tid += 1
            ann["id"] = old_to_new_tid[old_tid]
            # Set category_id to 1 for class-agnostic
            ann["category_id"] = 1

    # Set all GT categories to a single category
    gt["categories"] = [
        {
            "supercategory": "object",
            "id": 1,
            "name": "_REMAPPED_FOR_PHRASE_METRICS_",
        }
    ]

    # Add orig_video_id and orig_category_id to each video for compatibility
    anns_by_video = defaultdict(list)
    for ann in gt["annotations"]:
        anns_by_video[ann["video_id"]].append(ann)
    for video in gt["videos"]:
        video["orig_video_id"] = video["id"]
        # NOTE(review): ann["category_id"] was already overwritten to 1 in the
        # loop above, so `orig_cat` is always 1 (or None for videos with no
        # annotations) rather than the pre-remap category id. Confirm whether
        # downstream consumers rely on this value being 1.
        orig_cat = (
            anns_by_video[video["id"]][0]["category_id"]
            if anns_by_video[video["id"]]
            else None
        )
        video["orig_category_id"] = orig_cat
        # prefix filenames so each video is unique for the TrackEval toolkit
        video["file_names"] = [
            f"remapped_vid_{video['id']:012d}/{name}" for name in video["file_names"]
        ]

    # Set all DT category_id to 1
    for d in dt:
        d["category_id"] = 1
    return gt, dt
|
||||
|
||||
|
||||
def _fill_in_ann_height_width(gt_json):
|
||||
"""Fill in missing height/width in GT annotations from its video info."""
|
||||
video_id_to_video = {v["id"]: v for v in gt_json["videos"]}
|
||||
for ann in gt_json["annotations"]:
|
||||
if "height" not in ann or "width" not in ann:
|
||||
video = video_id_to_video[ann["video_id"]]
|
||||
if "height" not in ann:
|
||||
ann["height"] = video["height"]
|
||||
if "width" not in ann:
|
||||
ann["width"] = video["width"]
|
||||
|
||||
return gt_json
|
||||
5
sam3/eval/teta_eval_toolkit/__init__.py
Normal file
5
sam3/eval/teta_eval_toolkit/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
from . import config, datasets, metrics, utils
|
||||
from .eval import Evaluator
|
||||
69
sam3/eval/teta_eval_toolkit/_timing.py
Normal file
69
sam3/eval/teta_eval_toolkit/_timing.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
import inspect
|
||||
from functools import wraps
|
||||
from time import perf_counter
|
||||
|
||||
# Module-level switches and accumulators used by the @time decorator below.
DO_TIMING = False  # master switch: when False, @time is a plain passthrough
DISPLAY_LESS_PROGRESS = False  # when True, skip per-call printing for methods
timer_dict = {}  # cumulative elapsed seconds keyed by function/method name
counter = 0  # running count of timed top-level (non-method) calls, for display
|
||||
|
||||
|
||||
def time(f):
    """Decorator that optionally times `f` and records/prints statistics.

    When the module-level DO_TIMING flag is False (the default), the wrapped
    function runs unmodified. When True, each call is timed with
    `perf_counter`, cumulative per-name totals are kept in `timer_dict`, and
    a full summary table is printed once `Evaluator.evaluate` returns.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if DO_TIMING:
            # Run function with timing
            ts = perf_counter()
            result = f(*args, **kw)
            te = perf_counter()
            tt = te - ts

            # Get function name
            # NOTE(review): assumes f has at least one positional parameter;
            # arg_names[0] raises IndexError for zero-arg functions.
            arg_names = inspect.getfullargspec(f)[0]
            if arg_names[0] == "self" and DISPLAY_LESS_PROGRESS:
                return result
            elif arg_names[0] == "self":
                method_name = type(args[0]).__name__ + "." + f.__name__
            else:
                method_name = f.__name__

            # Record accumulative time in each function for analysis
            if method_name in timer_dict.keys():
                timer_dict[method_name] += tt
            else:
                timer_dict[method_name] = tt

            # If code is finished, display timing summary
            if method_name == "Evaluator.evaluate":
                print("")
                print("Timing analysis:")
                for key, value in timer_dict.items():
                    print("%-70s %2.4f sec" % (key, value))
            else:
                # Get function argument values for printing special arguments of interest
                arg_titles = ["tracker", "seq", "cls"]
                arg_vals = []
                for i, a in enumerate(arg_names):
                    if a in arg_titles:
                        # NOTE(review): assumes these arguments are passed
                        # positionally and are strings (joined below).
                        arg_vals.append(args[i])
                arg_text = "(" + ", ".join(arg_vals) + ")"

                # Display methods and functions with different indentation.
                if arg_names[0] == "self":
                    print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
                elif arg_names[0] == "test":
                    pass
                else:
                    global counter
                    counter += 1
                    print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))

            return result
        else:
            # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is true, run functions normally without timing.
            return f(*args, **kw)

    return wrap
|
||||
153
sam3/eval/teta_eval_toolkit/config.py
Normal file
153
sam3/eval/teta_eval_toolkit/config.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
"""Config."""
|
||||
import argparse
|
||||
import os
|
||||
|
||||
|
||||
def parse_configs():
    """Parse command line arguments into eval/dataset/metrics config dicts.

    Builds the merged default config, exposes every key as a `--KEY` CLI
    option, coerces supplied values back to the type of the default value,
    and splits the result into the three config groups.

    Returns:
        Tuple of (eval_config, dataset_config, metrics_config) dicts.
    """
    default_eval_config = get_default_eval_config()
    default_eval_config["DISPLAY_LESS_PROGRESS"] = True
    default_dataset_config = get_default_dataset_config()
    default_metrics_config = {"METRICS": ["TETA"]}
    config = {
        **default_eval_config,
        **default_dataset_config,
        **default_metrics_config,
    }
    parser = argparse.ArgumentParser()
    for setting in config.keys():
        # list- and None-valued settings accept multiple tokens on the CLI
        if type(config[setting]) == list or type(config[setting]) == type(None):
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args().__dict__
    for setting in args.keys():
        if args[setting] is not None:
            # Coerce the CLI string back to the type of the default value.
            if type(config[setting]) == type(True):
                if args[setting] == "True":
                    x = True
                elif args[setting] == "False":
                    x = False
                else:
                    raise Exception(
                        f"Command line parameter {setting} must be True/False"
                    )
            elif type(config[setting]) == type(1):
                x = int(args[setting])
            elif type(args[setting]) == type(None):
                x = None
            else:
                x = args[setting]
            config[setting] = x
    # Split the merged config back into its three groups by default-key
    # membership (keys shared between groups land in each matching group).
    eval_config = {k: v for k, v in config.items() if k in default_eval_config.keys()}
    dataset_config = {
        k: v for k, v in config.items() if k in default_dataset_config.keys()
    }
    metrics_config = {
        k: v for k, v in config.items() if k in default_metrics_config.keys()
    }

    return eval_config, dataset_config, metrics_config
|
||||
|
||||
|
||||
def get_default_eval_config():
    """Returns the default config values for evaluation (TETA Evaluator)."""
    code_path = get_code_path()
    default_config = {
        "USE_PARALLEL": True,
        "NUM_PARALLEL_CORES": 8,
        "BREAK_ON_ERROR": True,
        "RETURN_ON_ERROR": False,
        "LOG_ON_ERROR": os.path.join(code_path, "error_log.txt"),
        "PRINT_RESULTS": True,
        "PRINT_ONLY_COMBINED": True,
        "PRINT_CONFIG": True,
        "TIME_PROGRESS": True,
        "DISPLAY_LESS_PROGRESS": True,
        "OUTPUT_SUMMARY": True,
        "OUTPUT_EMPTY_CLASSES": True,
        # NOTE(review): VideoTetaEvaluator sets "OUTPUT_TEMP_RAW_DATA" (extra
        # "P") on this dict -- confirm which spelling the Evaluator reads.
        "OUTPUT_TEM_RAW_DATA": True,
        "OUTPUT_PER_SEQ_RES": True,
    }
    return default_config
|
||||
|
||||
|
||||
def get_default_dataset_config():
    """Default class config values for the TETA dataset wrappers (COCO/TAO)."""
    code_path = get_code_path()
    default_config = {
        "GT_FOLDER": os.path.join(
            code_path, "data/gt/tao/tao_training"
        ),  # Location of GT data
        "TRACKERS_FOLDER": os.path.join(
            code_path, "data/trackers/tao/tao_training"
        ),  # Trackers location
        "OUTPUT_FOLDER": None,  # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "TRACKERS_TO_EVAL": ['TETer'],  # Filenames of trackers to eval (if None, all in folder)
        "CLASSES_TO_EVAL": None,  # Classes to eval (if None, all classes)
        "SPLIT_TO_EVAL": "training",  # Valid: 'training', 'val'
        "PRINT_CONFIG": True,  # Whether to print current config
        "TRACKER_SUB_FOLDER": "data",  # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",  # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "TRACKER_DISPLAY_NAMES": None,  # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "MAX_DETECTIONS": 0,  # Number of maximal allowed detections per image (0 for unlimited)
        "USE_MASK": False,  # Whether to use mask data for evaluation
    }
    return default_config
|
||||
|
||||
|
||||
def init_config(config, default_config, name=None):
    """Fill a (possibly None) config dict with default values.

    If `config` is None, the default dict itself is returned. Otherwise
    `config` is updated in place with any missing entries from
    `default_config`. When `name` is given and the resulting config has
    PRINT_CONFIG set, the config is echoed to stdout.
    """
    if config is None:
        config = default_config
    else:
        for key, value in default_config.items():
            config.setdefault(key, value)
    if name and config["PRINT_CONFIG"]:
        print("\n%s Config:" % name)
        for key in config:
            print("%-20s : %-30s" % (key, config[key]))
    return config
|
||||
|
||||
|
||||
def update_config(config):
    """
    Parse the command-line arguments of a script and update the config values
    for every key specified in the arguments.

    Each config key is exposed as a `--KEY` option; supplied values are
    coerced back to the type of the current config value.

    :param config: the config to update
    :return: the updated config
    """
    parser = argparse.ArgumentParser()
    for setting in config.keys():
        # list- and None-valued settings accept multiple tokens on the CLI
        if type(config[setting]) == list or type(config[setting]) == type(None):
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    args = parser.parse_args().__dict__
    for setting in args.keys():
        if args[setting] is not None:
            # Coerce the CLI string back to the type of the current value.
            if type(config[setting]) == type(True):
                if args[setting] == "True":
                    x = True
                elif args[setting] == "False":
                    x = False
                else:
                    raise Exception(
                        # NOTE(review): message is missing a space before "must"
                        "Command line parameter " + setting + "must be True or False"
                    )
            elif type(config[setting]) == type(1):
                x = int(args[setting])
            elif type(args[setting]) == type(None):
                x = None
            else:
                x = args[setting]
            config[setting] = x
    return config
|
||||
|
||||
|
||||
def get_code_path():
    """Return the absolute path of the directory one level above this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(module_dir, os.pardir))
|
||||
5
sam3/eval/teta_eval_toolkit/datasets/__init__.py
Normal file
5
sam3/eval/teta_eval_toolkit/datasets/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
"""Datasets."""
|
||||
from .coco import COCO
|
||||
from .tao import TAO
|
||||
379
sam3/eval/teta_eval_toolkit/datasets/_base_dataset.py
Normal file
379
sam3/eval/teta_eval_toolkit/datasets/_base_dataset.py
Normal file
@@ -0,0 +1,379 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
import csv
|
||||
import io
|
||||
import os
|
||||
import traceback
|
||||
import zipfile
|
||||
from abc import ABC, abstractmethod
|
||||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing
|
||||
from ..utils import TrackEvalException
|
||||
|
||||
|
||||
class _BaseDataset(ABC):
    """Abstract base class for evaluation datasets.

    Concrete subclasses load ground-truth and tracker data per sequence,
    preprocess them per class, and define how detection similarity
    (box IoU, mask IoU, or euclidean distance) is computed.
    """

    @abstractmethod
    def __init__(self):
        # Attributes every concrete dataset is expected to populate.
        self.tracker_list = None
        self.seq_list = None
        self.class_list = None
        self.output_fol = None
        self.output_sub_fol = None
        self.should_classes_combine = True
        self.use_super_categories = False

    # Functions to implement:

    @abstractmethod
    def _load_raw_file(self, tracker, seq, is_gt):
        ...

    @_timing.time
    @abstractmethod
    def get_preprocessed_seq_data(self, raw_data, cls):
        ...

    @abstractmethod
    def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
        ...

    # Helper functions for all datasets:

    @classmethod
    def get_class_name(cls):
        """Return the name of the concrete dataset class."""
        return cls.__name__

    def get_name(self):
        """Return the dataset name (defaults to the class name)."""
        return self.get_class_name()

    def get_output_fol(self, tracker):
        """Return the output folder for a given tracker."""
        return os.path.join(self.output_fol, tracker, self.output_sub_fol)

    def get_display_name(self, tracker):
        """Can be overwritten if the trackers name (in files) is different to how it should be displayed.
        By default this method just returns the trackers name as is.
        """
        return tracker

    def get_eval_info(self):
        """Return info about the dataset needed for the Evaluator"""
        return self.tracker_list, self.seq_list, self.class_list

    @_timing.time
    def get_raw_seq_data(self, tracker, seq):
        """Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
        Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
        A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for
        the evaluation of each class.

        This returns a dict which contains the fields:
        [num_timesteps]: integer
        [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
        [similarity_scores]: list (for each timestep) of 2D NDArrays.
        [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).

        gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.

        Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
        independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
        masks vs 2D boxes vs 3D boxes).
        We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
        we don't wish to calculate this twice.
        We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
        calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
        """
        # Load raw data.
        raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
        raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
        raw_data = {**raw_tracker_data, **raw_gt_data}  # Merges dictionaries

        # Calculate similarities for each timestep.
        similarity_scores = []
        for _, (gt_dets_t, tracker_dets_t) in enumerate(
            zip(raw_data["gt_dets"], raw_data["tk_dets"])
        ):
            ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
            similarity_scores.append(ious)
        raw_data["similarity_scores"] = similarity_scores
        return raw_data

    @staticmethod
    def _load_simple_text_file(
        file,
        time_col=0,
        id_col=None,
        remove_negative_ids=False,
        valid_filter=None,
        crowd_ignore_filter=None,
        convert_filter=None,
        is_zipped=False,
        zip_file=None,
        force_delimiters=None,
    ):
        """Function that loads data which is in a commonly used text file format.
        Assumes each det is given by one row of a text file.
        There is no limit to the number or meaning of each column,
        however one column needs to give the timestep of each det (time_col) which is default col 0.

        The file dialect (deliminator, num cols, etc) is determined automatically.
        This function automatically separates dets by timestep,
        and is much faster than alternatives such as np.loadtext or pandas.

        If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
        These are not excluded from ignore data.

        valid_filter can be used to only include certain classes.
        It is a dict with ints as keys, and lists as values,
        such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
        If None, all classes are included.

        crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.

        convert_filter can be used to convert value read to another format.
        This is used most commonly to convert classes given as string to a class id.
        This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.

        Optionally, input files could be a zip of multiple text files for storage efficiency.

        Returns read_data and ignore_data.
        Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
        Note that all data is returned as strings, and must be converted to float/int later if needed.
        Note that timesteps will not be present in the returned dict keys if there are no dets for them
        """

        if remove_negative_ids and id_col is None:
            raise TrackEvalException(
                "remove_negative_ids is True, but id_col is not given."
            )
        if crowd_ignore_filter is None:
            crowd_ignore_filter = {}
        if convert_filter is None:
            convert_filter = {}
        try:
            if is_zipped:  # Either open file directly or within a zip.
                if zip_file is None:
                    raise TrackEvalException(
                        "is_zipped set to True, but no zip_file is given."
                    )
                archive = zipfile.ZipFile(os.path.join(zip_file), "r")
                fp = io.TextIOWrapper(archive.open(file, "r"))
            else:
                fp = open(file)
            read_data = {}
            crowd_ignore_data = {}
            fp.seek(0, os.SEEK_END)
            # check if file is empty
            if fp.tell():
                fp.seek(0)
                dialect = csv.Sniffer().sniff(
                    fp.readline(), delimiters=force_delimiters
                )  # Auto determine structure.
                dialect.skipinitialspace = (
                    True  # Deal with extra spaces between columns
                )
                fp.seek(0)
                reader = csv.reader(fp, dialect)
                for row in reader:
                    try:
                        # Deal with extra trailing spaces at the end of rows
                        # NOTE(review): `x in ""` is only True for x == "",
                        # so this drops exactly one empty trailing cell.
                        if row[-1] in "":
                            row = row[:-1]
                        timestep = str(int(float(row[time_col])))
                        # Read ignore regions separately.
                        is_ignored = False
                        for ignore_key, ignore_value in crowd_ignore_filter.items():
                            if row[ignore_key].lower() in ignore_value:
                                # Convert values in one column (e.g. string to id)
                                for (
                                    convert_key,
                                    convert_value,
                                ) in convert_filter.items():
                                    row[convert_key] = convert_value[
                                        row[convert_key].lower()
                                    ]
                                # Save data separated by timestep.
                                if timestep in crowd_ignore_data.keys():
                                    crowd_ignore_data[timestep].append(row)
                                else:
                                    crowd_ignore_data[timestep] = [row]
                                is_ignored = True
                        if (
                            is_ignored
                        ):  # if det is an ignore region, it cannot be a normal det.
                            continue
                        # Exclude some dets if not valid.
                        # NOTE(review): this `continue` only advances the inner
                        # filter loop, so rows failing valid_filter are NOT
                        # actually skipped here — confirm intended behavior.
                        if valid_filter is not None:
                            for key, value in valid_filter.items():
                                if row[key].lower() not in value:
                                    continue
                        if remove_negative_ids:
                            if int(float(row[id_col])) < 0:
                                continue
                        # Convert values in one column (e.g. string to id)
                        for convert_key, convert_value in convert_filter.items():
                            row[convert_key] = convert_value[row[convert_key].lower()]
                        # Save data separated by timestep.
                        if timestep in read_data.keys():
                            read_data[timestep].append(row)
                        else:
                            read_data[timestep] = [row]
                    except Exception:
                        exc_str_init = (
                            "In file %s the following line cannot be read correctly: \n"
                            % os.path.basename(file)
                        )
                        exc_str = " ".join([exc_str_init] + row)
                        raise TrackEvalException(exc_str)
            fp.close()
        except Exception:
            print("Error loading file: %s, printing traceback." % file)
            traceback.print_exc()
            raise TrackEvalException(
                "File %s cannot be read because it is either not present or invalidly formatted"
                % os.path.basename(file)
            )
        return read_data, crowd_ignore_data

    @staticmethod
    def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
        """Calculates the IOU (intersection over union) between two arrays of segmentation masks.
        If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
        arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
        If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly
        used to determine if detections are within crowd ignore region.
        :param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                       else pycocotools rle encoded format)
        :param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
                       else pycocotools rle encoded format)
        :param is_encoded: whether the input is in pycocotools rle encoded format
        :param do_ioa: whether to perform IoA computation
        :return: the IoU/IoA scores
        """

        # Only loaded when run to reduce minimum requirements
        from pycocotools import mask as mask_utils

        # use pycocotools for run length encoding of masks
        if not is_encoded:
            masks1 = mask_utils.encode(
                np.array(np.transpose(masks1, (1, 2, 0)), order="F")
            )
            masks2 = mask_utils.encode(
                np.array(np.transpose(masks2, (1, 2, 0)), order="F")
            )

        # use pycocotools for iou computation of rle encoded masks
        ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
        # pycocotools returns an empty result for empty inputs; reshape so
        # downstream code always sees a (len(masks1), len(masks2)) array.
        if len(masks1) == 0 or len(masks2) == 0:
            ious = np.asarray(ious).reshape(len(masks1), len(masks2))
        assert (ious >= 0 - np.finfo("float").eps).all()
        assert (ious <= 1 + np.finfo("float").eps).all()

        return ious

    @staticmethod
    def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
        """Calculates the IOU (intersection over union) between two arrays of boxes.
        Allows variable box formats ('xywh' and 'x0y0x1y1').
        If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
        used to determine if detections are within crowd ignore region.
        """
        # NOTE(review): `in "xywh"` is a substring test, so e.g. "x" or "yw"
        # would also match — confirm callers only pass the two full names.
        if box_format in "xywh":
            # layout: (x0, y0, w, h) — convert in place on copies to corner form
            bboxes1 = deepcopy(bboxes1)
            bboxes2 = deepcopy(bboxes2)

            bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
            bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
            bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
            bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
        elif box_format not in "x0y0x1y1":
            raise (TrackEvalException("box_format %s is not implemented" % box_format))

        # layout: (x0, y0, x1, y1)
        # Pairwise (N, M, 4) broadcast; intersection is clamped at 0.
        min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
            min_[..., 3] - max_[..., 1], 0
        )
        area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
            bboxes1[..., 3] - bboxes1[..., 1]
        )

        if do_ioa:
            # Intersection over the area of bboxes1 only.
            ioas = np.zeros_like(intersection)
            valid_mask = area1 > 0 + np.finfo("float").eps
            ioas[valid_mask, :] = (
                intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
            )

            return ioas
        else:
            area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
                bboxes2[..., 3] - bboxes2[..., 1]
            )
            union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
            # Guard degenerate boxes / empty unions before dividing.
            intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
            intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
            intersection[union <= 0 + np.finfo("float").eps] = 0
            union[union <= 0 + np.finfo("float").eps] = 1
            ious = intersection / union
            return ious

    @staticmethod
    def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
        """Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
        measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
        The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
        threshold corresponds to a 1m distance threshold for TPs.
        """
        dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
        sim = np.maximum(0, 1 - dist / zero_distance)
        return sim

    @staticmethod
    def _check_unique_ids(data, after_preproc=False):
        """Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
        gt_ids = data["gt_ids"]
        tracker_ids = data["tk_ids"]
        for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
            if len(tracker_ids_t) > 0:
                unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
                if np.max(counts) != 1:
                    duplicate_ids = unique_ids[counts > 1]
                    exc_str_init = (
                        "Tracker predicts the same ID more than once in a single timestep "
                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
                    )
                    exc_str = (
                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
                    )
                    # NOTE(review): exc_str was already built above, so this
                    # appended after_preproc note never reaches the raised
                    # message — looks like an ordering bug; confirm.
                    if after_preproc:
                        exc_str_init += (
                            "\n Note that this error occurred after preprocessing (but not before), "
                            "so ids may not be as in file, and something seems wrong with preproc."
                        )
                    raise TrackEvalException(exc_str)
            if len(gt_ids_t) > 0:
                unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
                if np.max(counts) != 1:
                    duplicate_ids = unique_ids[counts > 1]
                    exc_str_init = (
                        "Ground-truth has the same ID more than once in a single timestep "
                        "(seq: %s, frame: %i, ids:" % (data["seq"], t + 1)
                    )
                    exc_str = (
                        " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
                    )
                    # NOTE(review): same ordering issue as above — the note is
                    # appended after exc_str was built and is therefore lost.
                    if after_preproc:
                        exc_str_init += (
                            "\n Note that this error occurred after preprocessing (but not before), "
                            "so ids may not be as in file, and something seems wrong with preproc."
                        )
                    raise TrackEvalException(exc_str)
|
||||
637
sam3/eval/teta_eval_toolkit/datasets/coco.py
Normal file
637
sam3/eval/teta_eval_toolkit/datasets/coco.py
Normal file
@@ -0,0 +1,637 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
"""COCO Dataset."""
|
||||
import copy
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from .. import _timing, utils
|
||||
from ..config import get_default_dataset_config, init_config
|
||||
from ..utils import TrackEvalException
|
||||
from ._base_dataset import _BaseDataset
|
||||
|
||||
|
||||
class COCO(_BaseDataset):
|
||||
"""Tracking datasets in COCO format."""
|
||||
|
||||
def __init__(self, config=None):
|
||||
"""Initialize dataset, checking that all required files are present."""
|
||||
super().__init__()
|
||||
# Fill non-given config values with defaults
|
||||
self.config = init_config(config, get_default_dataset_config(), self.get_name())
|
||||
self.gt_fol = self.config["GT_FOLDER"]
|
||||
self.tracker_fol = self.config["TRACKERS_FOLDER"]
|
||||
self.should_classes_combine = True
|
||||
self.use_super_categories = False
|
||||
self.use_mask = self.config["USE_MASK"]
|
||||
|
||||
self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
|
||||
self.output_fol = self.config["OUTPUT_FOLDER"]
|
||||
if self.output_fol is None:
|
||||
self.output_fol = self.tracker_fol
|
||||
self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
|
||||
|
||||
if self.gt_fol.endswith(".json"):
|
||||
self.gt_data = json.load(open(self.gt_fol, "r"))
|
||||
else:
|
||||
gt_dir_files = [
|
||||
file for file in os.listdir(self.gt_fol) if file.endswith(".json")
|
||||
]
|
||||
if len(gt_dir_files) != 1:
|
||||
raise TrackEvalException(
|
||||
f"{self.gt_fol} does not contain exactly one json file."
|
||||
)
|
||||
|
||||
with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
|
||||
self.gt_data = json.load(f)
|
||||
|
||||
# fill missing video ids
|
||||
self._fill_video_ids_inplace(self.gt_data["annotations"])
|
||||
|
||||
# get sequences to eval and sequence information
|
||||
self.seq_list = [
|
||||
vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
|
||||
]
|
||||
self.seq_name2seqid = {
|
||||
vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
|
||||
}
|
||||
# compute mappings from videos to annotation data
|
||||
self.video2gt_track, self.video2gt_image = self._compute_vid_mappings(
|
||||
self.gt_data["annotations"]
|
||||
)
|
||||
# compute sequence lengths
|
||||
self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
|
||||
for img in self.gt_data["images"]:
|
||||
self.seq_lengths[img["video_id"]] += 1
|
||||
self.seq2images2timestep = self._compute_image_to_timestep_mappings()
|
||||
self.seq2cls = {
|
||||
vid["id"]: {
|
||||
"pos_cat_ids": list(
|
||||
{track["category_id"] for track in self.video2gt_track[vid["id"]]}
|
||||
),
|
||||
}
|
||||
for vid in self.gt_data["videos"]
|
||||
}
|
||||
|
||||
# Get classes to eval
|
||||
considered_vid_ids = [self.seq_name2seqid[vid] for vid in self.seq_list]
|
||||
seen_cats = set(
|
||||
[
|
||||
cat_id
|
||||
for vid_id in considered_vid_ids
|
||||
for cat_id in self.seq2cls[vid_id]["pos_cat_ids"]
|
||||
]
|
||||
)
|
||||
# only classes with ground truth are evaluated in TAO
|
||||
self.valid_classes = [
|
||||
cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
|
||||
]
|
||||
cls_name2clsid_map = {
|
||||
cls["name"]: cls["id"] for cls in self.gt_data["categories"]
|
||||
}
|
||||
|
||||
if self.config["CLASSES_TO_EVAL"]:
|
||||
self.class_list = [
|
||||
cls.lower() if cls.lower() in self.valid_classes else None
|
||||
for cls in self.config["CLASSES_TO_EVAL"]
|
||||
]
|
||||
if not all(self.class_list):
|
||||
valid_cls = ", ".join(self.valid_classes)
|
||||
raise TrackEvalException(
|
||||
"Attempted to evaluate an invalid class. Only classes "
|
||||
f"{valid_cls} are valid (classes present in ground truth"
|
||||
" data)."
|
||||
)
|
||||
else:
|
||||
self.class_list = [cls for cls in self.valid_classes]
|
||||
self.cls_name2clsid = {
|
||||
k: v for k, v in cls_name2clsid_map.items() if k in self.class_list
|
||||
}
|
||||
self.clsid2cls_name = {
|
||||
v: k for k, v in cls_name2clsid_map.items() if k in self.class_list
|
||||
}
|
||||
# get trackers to eval
|
||||
if self.config["TRACKERS_TO_EVAL"] is None:
|
||||
self.tracker_list = os.listdir(self.tracker_fol)
|
||||
else:
|
||||
self.tracker_list = self.config["TRACKERS_TO_EVAL"]
|
||||
|
||||
if self.config["TRACKER_DISPLAY_NAMES"] is None:
|
||||
self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
|
||||
elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
|
||||
len(self.config["TK_DISPLAY_NAMES"]) == len(self.tracker_list)
|
||||
):
|
||||
self.tracker_to_disp = dict(
|
||||
zip(self.tracker_list, self.config["TK_DISPLAY_NAMES"])
|
||||
)
|
||||
else:
|
||||
raise TrackEvalException(
|
||||
"List of tracker files and tracker display names do not match."
|
||||
)
|
||||
|
||||
self.tracker_data = {tracker: dict() for tracker in self.tracker_list}
|
||||
|
||||
for tracker in self.tracker_list:
|
||||
if self.tracker_sub_fol.endswith(".json"):
|
||||
with open(os.path.join(self.tracker_sub_fol)) as f:
|
||||
curr_data = json.load(f)
|
||||
else:
|
||||
tr_dir = os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
|
||||
tr_dir_files = [
|
||||
file for file in os.listdir(tr_dir) if file.endswith(".json")
|
||||
]
|
||||
if len(tr_dir_files) != 1:
|
||||
raise TrackEvalException(
|
||||
f"{tr_dir} does not contain exactly one json file."
|
||||
)
|
||||
with open(os.path.join(tr_dir, tr_dir_files[0])) as f:
|
||||
curr_data = json.load(f)
|
||||
|
||||
# limit detections if MAX_DETECTIONS > 0
|
||||
if self.config["MAX_DETECTIONS"]:
|
||||
curr_data = self._limit_dets_per_image(curr_data)
|
||||
|
||||
# fill missing video ids
|
||||
self._fill_video_ids_inplace(curr_data)
|
||||
|
||||
# make track ids unique over whole evaluation set
|
||||
self._make_tk_ids_unique(curr_data)
|
||||
|
||||
# get tracker sequence information
|
||||
curr_vids2tracks, curr_vids2images = self._compute_vid_mappings(curr_data)
|
||||
self.tracker_data[tracker]["vids_to_tracks"] = curr_vids2tracks
|
||||
self.tracker_data[tracker]["vids_to_images"] = curr_vids2images
|
||||
|
||||
def get_display_name(self, tracker):
|
||||
return self.tracker_to_disp[tracker]
|
||||
|
||||
    def _load_raw_file(self, tracker, seq, is_gt):
        """Load a file (gt or tracker) in the TAO format

        If is_gt, this returns a dict which contains the fields:
        [gt_ids, gt_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets]: list (for each timestep) of lists of detections.

        if not is_gt, this returns a dict which contains the fields:
        [tk_ids, tk_classes]:
            list (for each timestep) of 1D NDArrays (for each det).
        [tk_dets]: list (for each timestep) of lists of detections.
        """
        seq_id = self.seq_name2seqid[seq]
        # file location
        if is_gt:
            imgs = self.video2gt_image[seq_id]
        else:
            imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]

        # convert data to required format
        num_timesteps = self.seq_lengths[seq_id]
        img_to_timestep = self.seq2images2timestep[seq_id]
        data_keys = ["ids", "classes", "dets"]
        # if not is_gt:
        #     data_keys += ["tk_confidences"]
        # One slot per timestep; unfilled slots are backfilled as empty below.
        raw_data = {key: [None] * num_timesteps for key in data_keys}
        for img in imgs:
            # some tracker data contains images without any ground truth info,
            # these are ignored
            if img["id"] not in img_to_timestep:
                continue
            t = img_to_timestep[img["id"]]
            anns = img["annotations"]
            # NOTE(review): assumes anns is non-empty (IndexError otherwise)
            # and that every ann uses the same track-id key — confirm upstream.
            tk_str = utils.get_track_id_str(anns[0])

            if self.use_mask:
                # When using mask, extract segmentation data
                raw_data["dets"][t] = [ann.get("segmentation") for ann in anns]
            else:
                # When using bbox, extract bbox data
                raw_data["dets"][t] = np.atleast_2d([ann["bbox"] for ann in anns]).astype(
                    float
                )
            raw_data["ids"][t] = np.atleast_1d([ann[tk_str] for ann in anns]).astype(
                int
            )
            raw_data["classes"][t] = np.atleast_1d(
                [ann["category_id"] for ann in anns]
            ).astype(int)
            # if not is_gt:
            #     raw_data["tk_confidences"][t] = np.atleast_1d(
            #         [ann["score"] for ann in anns]
            #     ).astype(float)

        # Backfill timesteps with no annotations as empty arrays so all lists
        # are dense and aligned with num_timesteps.
        for t, d in enumerate(raw_data["dets"]):
            if d is None:
                raw_data["dets"][t] = np.empty((0, 4)).astype(float)
                raw_data["ids"][t] = np.empty(0).astype(int)
                raw_data["classes"][t] = np.empty(0).astype(int)
                # if not is_gt:
                #     raw_data["tk_confidences"][t] = np.empty(0)

        # Rename the generic keys to gt_*/tk_* depending on the source.
        if is_gt:
            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
        else:
            key_map = {"ids": "tk_ids", "classes": "tk_classes", "dets": "tk_dets"}
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)

        raw_data["num_timesteps"] = num_timesteps
        raw_data["seq"] = seq
        return raw_data
|
||||
|
||||
    def get_preprocessed_seq_data_thr(self, raw_data, cls, assignment=None):
        """Preprocess data for a single sequence for a single class.

        Inputs:
            raw_data: dict containing the data for the sequence already
                read in by get_raw_seq_data().
            cls: class to be evaluated.
            assignment: optional dict (per timestep) mapping gt_id -> tk_id;
                used to exclude tracks already assigned to other classes.
        Outputs:
            gt_ids:
                list (for each timestep) of ids of GT tracks
            tk_ids:
                list (for each timestep) of ids of predicted tracks (all for TP
                matching (Det + AssocA))
            tk_overlap_ids:
                list (for each timestep) of ids of predicted tracks that overlap
                with GTs
            tk_dets:
                list (for each timestep) of lists of detections that
                corresponding to the tk_ids
            tk_classes:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            tk_confidences:
                list (for each timestep) of lists of classes that corresponding
                to the tk_ids
            sim_scores:
                similarity score between gt_ids and tk_ids.
        """
        if cls != "all":
            cls_id = self.cls_name2clsid[cls]
        # NOTE(review): cls_id is referenced unconditionally in the second loop
        # below, so cls == "all" raises NameError for any non-empty sequence —
        # confirm whether the "all" path is ever used here.

        data_keys = [
            "gt_ids",
            "tk_ids",
            "gt_id_map",
            "tk_id_map",
            "gt_dets",
            "gt_classes",
            "gt_class_name",
            "tk_overlap_classes",
            "tk_overlap_ids",
            "tk_class_eval_tk_ids",
            "tk_dets",
            "tk_classes",
            # "tk_confidences",
            "tk_exh_ids",
            "sim_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tk_ids = []
        num_gt_dets = 0
        num_tk_cls_dets = 0
        num_tk_overlap_dets = 0
        overlap_ious_thr = 0.5
        loc_and_asso_tk_ids = []
        exh_class_tk_ids = []

        # First pass: collect, per timestep, the predicted track ids that
        # overlap a GT of this class (and those predicted with this class).
        for t in range(raw_data["num_timesteps"]):
            # only extract relevant dets for this class for preproc and eval
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)

            # select GT that is not in the evaluating classes
            if assignment is not None and assignment:
                all_gt_ids = list(assignment[t].keys())
                gt_ids_in = raw_data["gt_ids"][t][gt_class_mask]
                gt_ids_out = set(all_gt_ids) - set(gt_ids_in)
                tk_ids_out = set([assignment[t][key] for key in list(gt_ids_out)])

            # compute overlapped tracks and add their ids to overlap_tk_ids
            # NOTE(review): sim_scores is (re)bound here and reused after this
            # loop; it is undefined if num_timesteps == 0.
            sim_scores = raw_data["similarity_scores"]
            overlap_ids_masks = (sim_scores[t][gt_class_mask] >= overlap_ious_thr).any(
                axis=0
            )
            overlap_tk_ids_t = raw_data["tk_ids"][t][overlap_ids_masks]
            if assignment is not None and assignment:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t) - tk_ids_out)
            else:
                data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t))

            loc_and_asso_tk_ids += data["tk_overlap_ids"][t]

            data["tk_exh_ids"][t] = []
            if cls == "all":
                continue

            # add the track ids of exclusive annotated class to exh_class_tk_ids
            tk_exh_mask = np.atleast_1d(raw_data["tk_classes"][t] == cls_id)
            tk_exh_mask = tk_exh_mask.astype(bool)
            exh_class_tk_ids_t = raw_data["tk_ids"][t][tk_exh_mask]
            exh_class_tk_ids.append(exh_class_tk_ids_t)
            data["tk_exh_ids"][t] = exh_class_tk_ids_t

        # remove tk_ids that has been assigned to GT belongs to other classes.
        loc_and_asso_tk_ids = list(set(loc_and_asso_tk_ids))

        # Second pass: remove all unwanted unmatched tracker detections
        for t in range(raw_data["num_timesteps"]):
            # add gt to the data
            if cls == "all":
                gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
            else:
                gt_class_mask = np.atleast_1d(
                    raw_data["gt_classes"][t] == cls_id
                ).astype(bool)
            data["gt_classes"][t] = cls_id
            data["gt_class_name"][t] = cls

            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            if self.use_mask:
                # Mask dets are plain lists (RLE dicts), so filter by index.
                gt_dets = [raw_data['gt_dets'][t][ind] for ind in range(len(gt_class_mask)) if gt_class_mask[ind]]
            else:
                gt_dets = raw_data["gt_dets"][t][gt_class_mask]
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets

            # filter pred and only keep those that highly overlap with GTs
            # NOTE(review): assume_unique=True presumes per-timestep tk_ids
            # are unique — consistent with _check_unique_ids below.
            tk_mask = np.isin(
                raw_data["tk_ids"][t], np.array(loc_and_asso_tk_ids), assume_unique=True
            )
            tk_overlap_mask = np.isin(
                raw_data["tk_ids"][t],
                np.array(data["tk_overlap_ids"][t]),
                assume_unique=True,
            )

            tk_ids = raw_data["tk_ids"][t][tk_mask]
            if self.use_mask:
                tk_dets = [raw_data['tk_dets'][t][ind] for ind in range(len(tk_mask)) if
                           tk_mask[ind]]
            else:
                tk_dets = raw_data["tk_dets"][t][tk_mask]

            tracker_classes = raw_data["tk_classes"][t][tk_mask]

            # add overlap classes for computing the FP for Cls term
            tracker_overlap_classes = raw_data["tk_classes"][t][tk_overlap_mask]
            # tracker_confidences = raw_data["tk_confidences"][t][tk_mask]
            sim_scores_masked = sim_scores[t][gt_class_mask, :][:, tk_mask]

            # add filtered prediction to the data
            data["tk_classes"][t] = tracker_classes
            data["tk_overlap_classes"][t] = tracker_overlap_classes
            data["tk_ids"][t] = tk_ids
            data["tk_dets"][t] = tk_dets
            # data["tk_confidences"][t] = tracker_confidences
            data["sim_scores"][t] = sim_scores_masked
            data["tk_class_eval_tk_ids"][t] = set(
                list(data["tk_overlap_ids"][t]) + list(data["tk_exh_ids"][t])
            )

            # count total number of detections
            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            # the unique track ids are for association.
            unique_tk_ids += list(np.unique(data["tk_ids"][t]))

            num_tk_overlap_dets += len(data["tk_overlap_ids"][t])
            num_tk_cls_dets += len(data["tk_class_eval_tk_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])

        # re-label IDs such that there are no empty IDs
        # (ids become contiguous 0..N-1; *_id_map records new_id -> original)
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            data["gt_id_map"] = {}
            for gt_id in unique_gt_ids:
                new_gt_id = gt_id_map[gt_id].astype(int)
                data["gt_id_map"][new_gt_id] = gt_id

            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)

        if len(unique_tk_ids) > 0:
            unique_tk_ids = np.unique(unique_tk_ids)
            tk_id_map = np.nan * np.ones((np.max(unique_tk_ids) + 1))
            tk_id_map[unique_tk_ids] = np.arange(len(unique_tk_ids))

            data["tk_id_map"] = {}
            for track_id in unique_tk_ids:
                new_track_id = tk_id_map[track_id].astype(int)
                data["tk_id_map"][new_track_id] = track_id

            for t in range(raw_data["num_timesteps"]):
                if len(data["tk_ids"][t]) > 0:
                    data["tk_ids"][t] = tk_id_map[data["tk_ids"][t]].astype(int)
                if len(data["tk_overlap_ids"][t]) > 0:
                    data["tk_overlap_ids"][t] = tk_id_map[
                        data["tk_overlap_ids"][t]
                    ].astype(int)

        # record overview statistics.
        data["num_tk_cls_dets"] = num_tk_cls_dets
        data["num_tk_overlap_dets"] = num_tk_overlap_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tk_ids"] = len(unique_tk_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]

        self._check_unique_ids(data)

        return data
|
||||
|
||||
@_timing.time
|
||||
def get_preprocessed_seq_data(
|
||||
self, raw_data, cls, assignment=None, thresholds=[50, 75]
|
||||
):
|
||||
"""Preprocess data for a single sequence for a single class."""
|
||||
data = {}
|
||||
if thresholds is None:
|
||||
thresholds = [50, 75]
|
||||
elif isinstance(thresholds, int):
|
||||
thresholds = [thresholds]
|
||||
|
||||
for thr in thresholds:
|
||||
assignment_thr = None
|
||||
if assignment is not None:
|
||||
assignment_thr = assignment[thr]
|
||||
data[thr] = self.get_preprocessed_seq_data_thr(
|
||||
raw_data, cls, assignment_thr
|
||||
)
|
||||
|
||||
return data
|
||||
|
||||
def _calculate_similarities(self, gt_dets_t, tk_dets_t):
    """Return the similarity (IoU) matrix between GT and tracker detections.

    Dispatches to encoded-mask IoU when the dataset is configured for masks,
    and to box IoU otherwise.
    """
    if not self.use_mask:
        return self._calculate_box_ious(gt_dets_t, tk_dets_t)
    return self._calculate_mask_ious(
        gt_dets_t, tk_dets_t, is_encoded=True, do_ioa=False
    )
|
||||
|
||||
def _compute_vid_mappings(self, annotations):
    """Computes mappings from videos to corresponding tracks and images.

    Args:
        annotations: list of annotation dicts (GT or tracker results); each
            annotation gets an "area" field (bbox w * h) added in place.

    Returns:
        (vids_to_tracks, vids_to_imgs): dicts keyed by video ID. Each track
        is a dict with id/category_id/video_id/annotations/area; each image
        entry is a dict with id/annotations. Every video ID from the GT
        appears in both mappings, possibly with an empty list.
    """
    vids_to_tracks = {}
    vids_to_imgs = {}
    vid_ids = [vid["id"] for vid in self.gt_data["videos"]]

    # compute a mapping from image IDs to images (used for frame ordering)
    images = {image["id"]: image for image in self.gt_data["images"]}

    # Per-video dict indexes so each annotation is placed in O(1) instead
    # of scanning the video's track/image lists (was O(n^2) overall).
    track_index = defaultdict(dict)  # vid -> tid -> track dict
    img_index = defaultdict(dict)  # vid -> img_id -> image entry

    tk_str = utils.get_track_id_str(annotations[0])
    for ann in annotations:
        ann["area"] = ann["bbox"][2] * ann["bbox"][3]

        vid = ann["video_id"]
        tid = ann[tk_str]

        # fill in vids_to_tracks (category comes from the first annotation
        # of the track, matching the original behavior)
        track = track_index[vid].get(tid)
        if track is None:
            track_index[vid][tid] = {
                "id": tid,
                "category_id": ann["category_id"],
                "video_id": vid,
                "annotations": [ann],
            }
        else:
            track["annotations"].append(ann)

        # fill in vids_to_imgs
        img_id = ann["image_id"]
        img_entry = img_index[vid].get(img_id)
        if img_entry is None:
            img_index[vid][img_id] = {"id": img_id, "annotations": [ann]}
        else:
            img_entry["annotations"].append(ann)

    # Dicts preserve insertion order, so converting to lists keeps the
    # original first-seen ordering of tracks and images.
    for vid, tracks in track_index.items():
        vids_to_tracks[vid] = list(tracks.values())
    for vid, imgs in img_index.items():
        vids_to_imgs[vid] = list(imgs.values())

    # sort annotations by frame index and compute track area
    for vid, tracks in vids_to_tracks.items():
        for track in tracks:
            track["annotations"] = sorted(
                track["annotations"],
                key=lambda x: images[x["image_id"]]["frame_id"],
            )
            # compute average area
            track["area"] = sum(x["area"] for x in track["annotations"]) / len(
                track["annotations"]
            )

    # ensure all videos are present
    for vid_id in vid_ids:
        vids_to_tracks.setdefault(vid_id, [])
        vids_to_imgs.setdefault(vid_id, [])

    return vids_to_tracks, vids_to_imgs
|
||||
|
||||
def _compute_image_to_timestep_mappings(self):
|
||||
"""Computes a mapping from images to timestep in sequence."""
|
||||
images = {}
|
||||
for image in self.gt_data["images"]:
|
||||
images[image["id"]] = image
|
||||
|
||||
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
|
||||
for vid in seq_to_imgs_to_timestep:
|
||||
curr_imgs = [img["id"] for img in self.video2gt_image[vid]]
|
||||
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_id"])
|
||||
seq_to_imgs_to_timestep[vid] = {
|
||||
curr_imgs[i]: i for i in range(len(curr_imgs))
|
||||
}
|
||||
|
||||
return seq_to_imgs_to_timestep
|
||||
|
||||
def _limit_dets_per_image(self, annotations):
|
||||
"""Limits the number of detections for each image.
|
||||
|
||||
Adapted from https://github.com/TAO-Dataset/.
|
||||
"""
|
||||
max_dets = self.config["MAX_DETECTIONS"]
|
||||
img_ann = defaultdict(list)
|
||||
for ann in annotations:
|
||||
img_ann[ann["image_id"]].append(ann)
|
||||
|
||||
for img_id, _anns in img_ann.items():
|
||||
if len(_anns) <= max_dets:
|
||||
continue
|
||||
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
|
||||
img_ann[img_id] = _anns[:max_dets]
|
||||
|
||||
return [ann for anns in img_ann.values() for ann in anns]
|
||||
|
||||
def _fill_video_ids_inplace(self, annotations):
|
||||
"""Fills in missing video IDs inplace.
|
||||
|
||||
Adapted from https://github.com/TAO-Dataset/.
|
||||
"""
|
||||
missing_video_id = [x for x in annotations if "video_id" not in x]
|
||||
if missing_video_id:
|
||||
image_id_to_video_id = {
|
||||
x["id"]: x["video_id"] for x in self.gt_data["images"]
|
||||
}
|
||||
for x in missing_video_id:
|
||||
x["video_id"] = image_id_to_video_id[x["image_id"]]
|
||||
|
||||
@staticmethod
def _make_tk_ids_unique(annotations):
    """Make track IDs unique over the whole annotation set (in place).

    A track ID that appears in more than one video is split into fresh
    IDs, one per (track, video) pair, starting above the current maximum.
    Adapted from https://github.com/TAO-Dataset/.

    Args:
        annotations: list of annotation dicts sharing a track-id key.

    Returns:
        The number of track IDs that had to be re-assigned.
    """
    track_id_videos = {}
    track_ids_to_update = set()
    max_track_id = 0

    tk_str = utils.get_track_id_str(annotations[0])
    for ann in annotations:
        t = int(ann[tk_str])
        # Remember the first video each track ID was seen in; a later
        # appearance in another video marks the ID for re-assignment.
        if t not in track_id_videos:
            track_id_videos[t] = ann["video_id"]

        if ann["video_id"] != track_id_videos[t]:
            # track id is assigned to multiple videos
            track_ids_to_update.add(t)
        max_track_id = max(max_track_id, t)

    if track_ids_to_update:
        # Fix: removed leftover debug print("true").
        next_id = itertools.count(max_track_id + 1)
        new_tk_ids = defaultdict(lambda: next(next_id))
        for ann in annotations:
            # Fix: apply the same int() coercion as the detection loop so
            # the set-membership test below matches even when the raw
            # track IDs are strings.
            t = int(ann[tk_str])
            v = ann["video_id"]
            if t in track_ids_to_update:
                ann[tk_str] = new_tk_ids[t, v]
    return len(track_ids_to_update)
|
||||
659
sam3/eval/teta_eval_toolkit/datasets/tao.py
Normal file
659
sam3/eval/teta_eval_toolkit/datasets/tao.py
Normal file
@@ -0,0 +1,659 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
"""TAO Dataset."""
|
||||
import copy
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing
|
||||
from ..config import get_default_dataset_config, init_config
|
||||
from ..utils import TrackEvalException
|
||||
from ._base_dataset import _BaseDataset
|
||||
|
||||
|
||||
class TAO(_BaseDataset):
|
||||
"""Dataset class for TAO tracking"""
|
||||
|
||||
def __init__(self, config=None):
    """Initialize dataset, checking that all required files are present.

    Args:
        config: optional dict overriding the default dataset config.

    Raises:
        TrackEvalException: if a GT/tracker folder does not contain exactly
            one json file, an invalid class is requested, or tracker display
            names do not match the tracker list.
    """
    super().__init__()
    # Fill non-given config values with defaults
    self.config = init_config(config, get_default_dataset_config(), self.get_name())
    self.gt_fol = self.config["GT_FOLDER"]
    self.tracker_fol = self.config["TRACKERS_FOLDER"]
    self.should_classes_combine = True
    self.use_super_categories = False
    self.use_mask = self.config["USE_MASK"]

    self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
    self.output_fol = self.config["OUTPUT_FOLDER"]
    if self.output_fol is None:
        self.output_fol = self.tracker_fol
    self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]

    # GT_FOLDER may point directly at a json file, or at a directory that
    # contains exactly one json file.
    if self.gt_fol.endswith(".json"):
        # Fix: use a context manager instead of an unclosed open() call.
        with open(self.gt_fol, "r") as f:
            self.gt_data = json.load(f)
    else:
        gt_dir_files = [
            file for file in os.listdir(self.gt_fol) if file.endswith(".json")
        ]
        if len(gt_dir_files) != 1:
            raise TrackEvalException(
                f"{self.gt_fol} does not contain exactly one json file."
            )

        with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
            self.gt_data = json.load(f)

    # merge categories marked with a merged tag in TAO dataset
    self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])

    # get sequences to eval and sequence information
    self.seq_list = [
        vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
    ]
    self.seq_name2seqid = {
        vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
    }
    # compute mappings from videos to annotation data
    self.video2gt_track, self.video2gt_image = self._compute_vid_mappings(
        self.gt_data["annotations"]
    )
    # compute sequence lengths (number of images per video)
    self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
    for img in self.gt_data["images"]:
        self.seq_lengths[img["video_id"]] += 1
    self.seq2images2timestep = self._compute_image_to_timestep_mappings()
    # per-sequence class info: positive, negative, and non-exhaustively
    # labeled category IDs
    self.seq2cls = {
        vid["id"]: {
            "pos_cat_ids": list(
                {track["category_id"] for track in self.video2gt_track[vid["id"]]}
            ),
            "neg_cat_ids": vid["neg_category_ids"],
            "not_exh_labeled_cat_ids": vid["not_exhaustive_category_ids"],
        }
        for vid in self.gt_data["videos"]
    }

    # Get classes to eval
    considered_vid_ids = [self.seq_name2seqid[vid] for vid in self.seq_list]
    seen_cats = {
        cat_id
        for vid_id in considered_vid_ids
        for cat_id in self.seq2cls[vid_id]["pos_cat_ids"]
    }
    # only classes with ground truth are evaluated in TAO
    self.valid_classes = [
        cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
    ]
    cls_name2clsid_map = {
        cls["name"]: cls["id"] for cls in self.gt_data["categories"]
    }

    if self.config["CLASSES_TO_EVAL"]:
        self.class_list = [
            cls.lower() if cls.lower() in self.valid_classes else None
            for cls in self.config["CLASSES_TO_EVAL"]
        ]
        if not all(self.class_list):
            valid_cls = ", ".join(self.valid_classes)
            raise TrackEvalException(
                "Attempted to evaluate an invalid class. Only classes "
                f"{valid_cls} are valid (classes present in ground truth"
                " data)."
            )
    else:
        self.class_list = [cls for cls in self.valid_classes]
    self.cls_name2clsid = {
        k: v for k, v in cls_name2clsid_map.items() if k in self.class_list
    }
    self.clsid2cls_name = {
        v: k for k, v in cls_name2clsid_map.items() if k in self.class_list
    }
    # get trackers to eval
    # Fix: removed leftover debug print of TRACKERS_TO_EVAL.
    if self.config["TRACKERS_TO_EVAL"] is None:
        self.tracker_list = os.listdir(self.tracker_fol)
    else:
        self.tracker_list = self.config["TRACKERS_TO_EVAL"]

    if self.config["TRACKER_DISPLAY_NAMES"] is None:
        self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
    elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
        # Fix: this branch read self.config["TK_DISPLAY_NAMES"], a key the
        # rest of the class never uses; use TRACKER_DISPLAY_NAMES
        # consistently so custom display names do not raise KeyError.
        len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
    ):
        self.tracker_to_disp = dict(
            zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
        )
    else:
        raise TrackEvalException(
            "List of tracker files and tracker display names do not match."
        )

    self.tracker_data = {tracker: dict() for tracker in self.tracker_list}

    for tracker in self.tracker_list:
        # TRACKER_SUB_FOLDER may itself be a json file path; otherwise it
        # names a sub-folder (under tracker_fol/<tracker>) holding exactly
        # one json file.
        if self.tracker_sub_fol.endswith(".json"):
            with open(self.tracker_sub_fol) as f:
                curr_data = json.load(f)
        else:
            tr_dir = os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
            tr_dir_files = [
                file for file in os.listdir(tr_dir) if file.endswith(".json")
            ]
            if len(tr_dir_files) != 1:
                raise TrackEvalException(
                    f"{tr_dir} does not contain exactly one json file."
                )
            with open(os.path.join(tr_dir, tr_dir_files[0])) as f:
                curr_data = json.load(f)

        # limit detections if MAX_DETECTIONS > 0
        if self.config["MAX_DETECTIONS"]:
            curr_data = self._limit_dets_per_image(curr_data)

        # fill missing video ids
        self._fill_video_ids_inplace(curr_data)

        # make track ids unique over whole evaluation set
        self._make_tk_ids_unique(curr_data)

        # merge categories marked with a merged tag in TAO dataset
        self._merge_categories(curr_data)

        # get tracker sequence information
        curr_vids2tracks, curr_vids2images = self._compute_vid_mappings(curr_data)
        self.tracker_data[tracker]["vids_to_tracks"] = curr_vids2tracks
        self.tracker_data[tracker]["vids_to_images"] = curr_vids2images
||||
|
||||
def get_display_name(self, tracker):
    """Return the configured display name for *tracker*."""
    display_name = self.tracker_to_disp[tracker]
    return display_name
|
||||
|
||||
def _load_raw_file(self, tracker, seq, is_gt):
    """Load a file (gt or tracker) in the TAO format

    If is_gt, this returns a dict which contains the fields:
    [gt_ids, gt_classes]:
        list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets]: list (for each timestep) of lists of detections.

    if not is_gt, this returns a dict which contains the fields:
    [tk_ids, tk_classes, tk_confidences]:
        list (for each timestep) of 1D NDArrays (for each det).
    [tk_dets]: list (for each timestep) of lists of detections.
    """
    seq_id = self.seq_name2seqid[seq]
    # file location: the per-video image lists were precomputed in __init__
    if is_gt:
        imgs = self.video2gt_image[seq_id]
    else:
        imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]

    # convert data to required format: one slot per timestep, None when
    # the timestep has no detections (filled with empties below)
    num_timesteps = self.seq_lengths[seq_id]
    img_to_timestep = self.seq2images2timestep[seq_id]
    data_keys = ["ids", "classes", "dets"]
    if not is_gt:
        # "tk_confidences" already carries its final name, so it is not
        # renamed by key_map below
        data_keys += ["tk_confidences"]
    raw_data = {key: [None] * num_timesteps for key in data_keys}
    for img in imgs:
        # some tracker data contains images without any ground truth info,
        # these are ignored
        if img["id"] not in img_to_timestep:
            continue
        t = img_to_timestep[img["id"]]
        anns = img["annotations"]
        if self.use_mask:
            # When using mask, extract segmentation data
            # NOTE(review): .get() yields None for annotations without a
            # "segmentation" key — presumably downstream mask IoU tolerates
            # or never sees that case; confirm.
            raw_data["dets"][t] = [ann.get("segmentation") for ann in anns]
        else:
            # When using bbox, extract bbox data
            raw_data["dets"][t] = np.atleast_2d([ann["bbox"] for ann in anns]).astype(
                float
            )
        raw_data["ids"][t] = np.atleast_1d(
            [ann["track_id"] for ann in anns]
        ).astype(int)
        raw_data["classes"][t] = np.atleast_1d(
            [ann["category_id"] for ann in anns]
        ).astype(int)
        if not is_gt:
            raw_data["tk_confidences"][t] = np.atleast_1d(
                [ann["score"] for ann in anns]
            ).astype(float)

    # replace empty timesteps with empty arrays of the expected shapes
    # (the (0, 4) bbox shape is used even in mask mode, mirroring the
    # no-detection case)
    for t, d in enumerate(raw_data["dets"]):
        if d is None:
            raw_data["dets"][t] = np.empty((0, 4)).astype(float)
            raw_data["ids"][t] = np.empty(0).astype(int)
            raw_data["classes"][t] = np.empty(0).astype(int)
            if not is_gt:
                raw_data["tk_confidences"][t] = np.empty(0)

    # rename the generic keys into gt_* / tk_* variants
    if is_gt:
        key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
    else:
        key_map = {"ids": "tk_ids", "classes": "tk_classes", "dets": "tk_dets"}
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)

    # attach sequence-level metadata used by preprocessing/evaluation
    raw_data["num_timesteps"] = num_timesteps
    raw_data["neg_cat_ids"] = self.seq2cls[seq_id]["neg_cat_ids"]
    raw_data["not_exh_labeled_cls"] = self.seq2cls[seq_id][
        "not_exh_labeled_cat_ids"
    ]
    raw_data["seq"] = seq
    return raw_data
|
||||
|
||||
def get_preprocessed_seq_data_thr(self, raw_data, cls, assignment=None):
    """Preprocess data for a single sequence for a single class.

    Inputs:
        raw_data: dict containing the data for the sequence already
            read in by get_raw_seq_data().
        cls: class to be evaluated.
    Outputs:
        gt_ids:
            list (for each timestep) of ids of GT tracks
        tk_ids:
            list (for each timestep) of ids of predicted tracks (all for TP
            matching (Det + AssocA))
        tk_overlap_ids:
            list (for each timestep) of ids of predicted tracks that overlap
            with GTs
        tk_neg_ids:
            list (for each timestep) of ids of predicted tracks that with
            the class id on the negative list for the current sequence.
        tk_exh_ids:
            list (for each timestep) of ids of predicted tracks that do not
            overlap with existing GTs but have the class id on the
            exhaustive annotated class list for the current sequence.
        tk_dets:
            list (for each timestep) of lists of detections that
            corresponding to the tk_ids
        tk_classes:
            list (for each timestep) of lists of classes that corresponding
            to the tk_ids
        tk_confidences:
            list (for each timestep) of lists of classes that corresponding
            to the tk_ids
        sim_scores:
            similarity score between gt_ids and tk_ids.
    """
    # resolve the class name to its numeric ID ("all" keeps every class)
    if cls != "all":
        cls_id = self.cls_name2clsid[cls]

    data_keys = [
        "gt_ids",
        "tk_ids",
        "gt_id_map",
        "tk_id_map",
        "gt_dets",
        "gt_classes",
        "gt_class_name",
        "tk_overlap_classes",
        "tk_overlap_ids",
        "tk_neg_ids",
        "tk_exh_ids",
        "tk_class_eval_tk_ids",
        "tk_dets",
        "tk_classes",
        "tk_confidences",
        "sim_scores",
    ]
    data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
    unique_gt_ids = []
    unique_tk_ids = []
    num_gt_dets = 0
    num_tk_cls_dets = 0
    num_tk_overlap_dets = 0
    # IoU above which a predicted track counts as overlapping a GT
    overlap_ious_thr = 0.5
    # predicted track ids kept for localization/association evaluation
    loc_and_asso_tk_ids = []

    # First pass: per timestep, find predicted tracks that overlap GTs of
    # the evaluated class (optionally excluding tracks the external
    # assignment matched to GTs of OTHER classes).
    for t in range(raw_data["num_timesteps"]):
        # only extract relevant dets for this class for preproc and eval
        if cls == "all":
            gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
        else:
            gt_class_mask = np.atleast_1d(
                raw_data["gt_classes"][t] == cls_id
            ).astype(bool)

        # select GT that is not in the evaluating classes
        if assignment is not None and assignment:
            all_gt_ids = list(assignment[t].keys())
            gt_ids_in = raw_data["gt_ids"][t][gt_class_mask]
            gt_ids_out = set(all_gt_ids) - set(gt_ids_in)
            # tracks assigned to out-of-class GTs are excluded below
            tk_ids_out = set([assignment[t][key] for key in list(gt_ids_out)])

        # compute overlapped tracks and add their ids to overlap_tk_ids
        sim_scores = raw_data["similarity_scores"]
        overlap_ids_masks = (sim_scores[t][gt_class_mask] >= overlap_ious_thr).any(
            axis=0
        )
        overlap_tk_ids_t = raw_data["tk_ids"][t][overlap_ids_masks]
        if assignment is not None and assignment:
            data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t) - tk_ids_out)
        else:
            data["tk_overlap_ids"][t] = list(set(overlap_tk_ids_t))

        loc_and_asso_tk_ids += data["tk_overlap_ids"][t]

        # negative-list / exhaustive-list track ids are not populated here
        data["tk_exh_ids"][t] = []
        data["tk_neg_ids"][t] = []

        # NOTE(review): this guard is the last statement of the loop body,
        # so the `continue` is effectively a no-op — confirm against the
        # upstream TETA implementation whether more per-timestep work was
        # intended to be skipped for "all".
        if cls == "all":
            continue

    # remove tk_ids that has been assigned to GT belongs to other classes.
    loc_and_asso_tk_ids = list(set(loc_and_asso_tk_ids))

    # remove all unwanted unmatched tracker detections
    for t in range(raw_data["num_timesteps"]):
        # add gt to the data
        if cls == "all":
            gt_class_mask = np.ones_like(raw_data["gt_classes"][t]).astype(bool)
        else:
            gt_class_mask = np.atleast_1d(
                raw_data["gt_classes"][t] == cls_id
            ).astype(bool)
        # NOTE(review): for cls == "all" this references cls_id, which is
        # only bound for specific classes — presumably "all" is never passed
        # here; confirm with callers.
        data["gt_classes"][t] = cls_id
        data["gt_class_name"][t] = cls

        gt_ids = raw_data["gt_ids"][t][gt_class_mask]
        if self.use_mask:
            # mask detections are plain lists, so filter by index
            gt_dets = [raw_data['gt_dets'][t][ind] for ind in range(len(gt_class_mask)) if gt_class_mask[ind]]
        else:
            gt_dets = raw_data["gt_dets"][t][gt_class_mask]
        data["gt_ids"][t] = gt_ids
        data["gt_dets"][t] = gt_dets

        # filter pred and only keep those that highly overlap with GTs
        tk_mask = np.isin(
            raw_data["tk_ids"][t], np.array(loc_and_asso_tk_ids), assume_unique=True
        )
        tk_overlap_mask = np.isin(
            raw_data["tk_ids"][t],
            np.array(data["tk_overlap_ids"][t]),
            assume_unique=True,
        )

        tk_ids = raw_data["tk_ids"][t][tk_mask]
        if self.use_mask:
            tk_dets = [raw_data['tk_dets'][t][ind] for ind in range(len(tk_mask)) if
                       tk_mask[ind]]
        else:
            tk_dets = raw_data["tk_dets"][t][tk_mask]
        tracker_classes = raw_data["tk_classes"][t][tk_mask]

        # add overlap classes for computing the FP for Cls term
        tracker_overlap_classes = raw_data["tk_classes"][t][tk_overlap_mask]
        tracker_confidences = raw_data["tk_confidences"][t][tk_mask]
        # restrict the similarity matrix to the kept GT rows / track columns
        sim_scores_masked = sim_scores[t][gt_class_mask, :][:, tk_mask]

        # add filtered prediction to the data
        data["tk_classes"][t] = tracker_classes
        data["tk_overlap_classes"][t] = tracker_overlap_classes
        data["tk_ids"][t] = tk_ids
        data["tk_dets"][t] = tk_dets
        data["tk_confidences"][t] = tracker_confidences
        data["sim_scores"][t] = sim_scores_masked
        # union of ids relevant to the classification term
        data["tk_class_eval_tk_ids"][t] = set(
            list(data["tk_overlap_ids"][t])
            + list(data["tk_neg_ids"][t])
            + list(data["tk_exh_ids"][t])
        )

        # count total number of detections
        unique_gt_ids += list(np.unique(data["gt_ids"][t]))
        # the unique track ids are for association.
        unique_tk_ids += list(np.unique(data["tk_ids"][t]))

        num_tk_overlap_dets += len(data["tk_overlap_ids"][t])
        num_tk_cls_dets += len(data["tk_class_eval_tk_ids"][t])
        num_gt_dets += len(data["gt_ids"][t])

    # re-label IDs such that there are no empty IDs
    # (original -> contiguous 0..N-1; the *_id_map dicts keep the inverse
    # mapping new_id -> original id)
    if len(unique_gt_ids) > 0:
        unique_gt_ids = np.unique(unique_gt_ids)
        gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
        gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
        data["gt_id_map"] = {}
        for gt_id in unique_gt_ids:
            new_gt_id = gt_id_map[gt_id].astype(int)
            data["gt_id_map"][new_gt_id] = gt_id

        for t in range(raw_data["num_timesteps"]):
            if len(data["gt_ids"][t]) > 0:
                data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)

    if len(unique_tk_ids) > 0:
        unique_tk_ids = np.unique(unique_tk_ids)
        tk_id_map = np.nan * np.ones((np.max(unique_tk_ids) + 1))
        tk_id_map[unique_tk_ids] = np.arange(len(unique_tk_ids))

        data["tk_id_map"] = {}
        for track_id in unique_tk_ids:
            new_track_id = tk_id_map[track_id].astype(int)
            data["tk_id_map"][new_track_id] = track_id

        for t in range(raw_data["num_timesteps"]):
            if len(data["tk_ids"][t]) > 0:
                data["tk_ids"][t] = tk_id_map[data["tk_ids"][t]].astype(int)
            if len(data["tk_overlap_ids"][t]) > 0:
                data["tk_overlap_ids"][t] = tk_id_map[
                    data["tk_overlap_ids"][t]
                ].astype(int)

    # record overview statistics.
    data["num_tk_cls_dets"] = num_tk_cls_dets
    data["num_tk_overlap_dets"] = num_tk_overlap_dets
    data["num_gt_dets"] = num_gt_dets
    data["num_tk_ids"] = len(unique_tk_ids)
    data["num_gt_ids"] = len(unique_gt_ids)
    data["num_timesteps"] = raw_data["num_timesteps"]
    data["seq"] = raw_data["seq"]

    self._check_unique_ids(data)

    return data
|
||||
|
||||
@_timing.time
|
||||
def get_preprocessed_seq_data(
    self, raw_data, cls, assignment=None, thresholds=(50, 75)
):
    """Preprocess one sequence's raw data for a class at several IoU thresholds.

    Args:
        raw_data: dict produced by get_raw_seq_data() for one sequence.
        cls: class name to evaluate (or "all").
        assignment: optional dict mapping threshold -> per-timestep
            GT-id -> track-id assignment; the matching entry is forwarded
            to the per-threshold preprocessing.
        thresholds: iterable of integer IoU thresholds (percent), a single
            int, or None. NOTE: passing None selects only (50,), matching
            the original fallback, while omitting the argument selects
            (50, 75).

    Returns:
        dict mapping each threshold to its preprocessed per-threshold data.
    """
    # Fix: the default used to be the mutable list [50, 75]; a tuple avoids
    # accidental cross-call mutation while keeping identical behavior.
    data = {}
    if thresholds is None:
        thresholds = (50,)
    elif isinstance(thresholds, int):
        thresholds = (thresholds,)

    for thr in thresholds:
        assignment_thr = None
        if assignment is not None:
            assignment_thr = assignment[thr]
        data[thr] = self.get_preprocessed_seq_data_thr(
            raw_data, cls, assignment_thr
        )

    return data
|
||||
|
||||
def _calculate_similarities(self, gt_dets_t, tk_dets_t):
    """Return the similarity (IoU) matrix between GT and tracker detections.

    Dispatches to encoded-mask IoU when the dataset is configured for masks,
    and to box IoU otherwise.
    """
    if not self.use_mask:
        return self._calculate_box_ious(gt_dets_t, tk_dets_t)
    return self._calculate_mask_ious(
        gt_dets_t, tk_dets_t, is_encoded=True, do_ioa=False
    )
|
||||
|
||||
def _merge_categories(self, annotations):
|
||||
"""Merges categories with a merged tag.
|
||||
|
||||
Adapted from https://github.com/TAO-Dataset.
|
||||
"""
|
||||
merge_map = {}
|
||||
for category in self.gt_data["categories"]:
|
||||
if "merged" in category:
|
||||
for to_merge in category["merged"]:
|
||||
merge_map[to_merge["id"]] = category["id"]
|
||||
|
||||
for ann in annotations:
|
||||
ann["category_id"] = merge_map.get(ann["category_id"], ann["category_id"])
|
||||
|
||||
def _compute_vid_mappings(self, annotations):
|
||||
"""Computes mappings from videos to corresponding tracks and images."""
|
||||
vids_to_tracks = {}
|
||||
vids_to_imgs = {}
|
||||
vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
|
||||
|
||||
# compute an mapping from image IDs to images
|
||||
images = {}
|
||||
for image in self.gt_data["images"]:
|
||||
images[image["id"]] = image
|
||||
|
||||
for ann in annotations:
|
||||
ann["area"] = ann["bbox"][2] * ann["bbox"][3]
|
||||
|
||||
vid = ann["video_id"]
|
||||
if ann["video_id"] not in vids_to_tracks.keys():
|
||||
vids_to_tracks[ann["video_id"]] = list()
|
||||
if ann["video_id"] not in vids_to_imgs.keys():
|
||||
vids_to_imgs[ann["video_id"]] = list()
|
||||
|
||||
# fill in vids_to_tracks
|
||||
tid = ann["track_id"]
|
||||
exist_tids = [track["id"] for track in vids_to_tracks[vid]]
|
||||
try:
|
||||
index1 = exist_tids.index(tid)
|
||||
except ValueError:
|
||||
index1 = -1
|
||||
if tid not in exist_tids:
|
||||
curr_track = {
|
||||
"id": tid,
|
||||
"category_id": ann["category_id"],
|
||||
"video_id": vid,
|
||||
"annotations": [ann],
|
||||
}
|
||||
vids_to_tracks[vid].append(curr_track)
|
||||
else:
|
||||
vids_to_tracks[vid][index1]["annotations"].append(ann)
|
||||
|
||||
# fill in vids_to_imgs
|
||||
img_id = ann["image_id"]
|
||||
exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
|
||||
try:
|
||||
index2 = exist_img_ids.index(img_id)
|
||||
except ValueError:
|
||||
index2 = -1
|
||||
if index2 == -1:
|
||||
curr_img = {"id": img_id, "annotations": [ann]}
|
||||
vids_to_imgs[vid].append(curr_img)
|
||||
else:
|
||||
vids_to_imgs[vid][index2]["annotations"].append(ann)
|
||||
|
||||
# sort annotations by frame index and compute track area
|
||||
for vid, tracks in vids_to_tracks.items():
|
||||
for track in tracks:
|
||||
track["annotations"] = sorted(
|
||||
track["annotations"],
|
||||
key=lambda x: images[x["image_id"]]["frame_index"],
|
||||
)
|
||||
# compute average area
|
||||
track["area"] = sum(x["area"] for x in track["annotations"]) / len(
|
||||
track["annotations"]
|
||||
)
|
||||
|
||||
# ensure all videos are present
|
||||
for vid_id in vid_ids:
|
||||
if vid_id not in vids_to_tracks.keys():
|
||||
vids_to_tracks[vid_id] = []
|
||||
if vid_id not in vids_to_imgs.keys():
|
||||
vids_to_imgs[vid_id] = []
|
||||
|
||||
return vids_to_tracks, vids_to_imgs
|
||||
|
||||
def _compute_image_to_timestep_mappings(self):
|
||||
"""Computes a mapping from images to timestep in sequence."""
|
||||
images = {}
|
||||
for image in self.gt_data["images"]:
|
||||
images[image["id"]] = image
|
||||
|
||||
seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
|
||||
for vid in seq_to_imgs_to_timestep:
|
||||
curr_imgs = [img["id"] for img in self.video2gt_image[vid]]
|
||||
curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_index"])
|
||||
seq_to_imgs_to_timestep[vid] = {
|
||||
curr_imgs[i]: i for i in range(len(curr_imgs))
|
||||
}
|
||||
|
||||
return seq_to_imgs_to_timestep
|
||||
|
||||
def _limit_dets_per_image(self, annotations):
|
||||
"""Limits the number of detections for each image.
|
||||
|
||||
Adapted from https://github.com/TAO-Dataset/.
|
||||
"""
|
||||
max_dets = self.config["MAX_DETECTIONS"]
|
||||
img_ann = defaultdict(list)
|
||||
for ann in annotations:
|
||||
img_ann[ann["image_id"]].append(ann)
|
||||
|
||||
for img_id, _anns in img_ann.items():
|
||||
if len(_anns) <= max_dets:
|
||||
continue
|
||||
_anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
|
||||
img_ann[img_id] = _anns[:max_dets]
|
||||
|
||||
return [ann for anns in img_ann.values() for ann in anns]
|
||||
|
||||
def _fill_video_ids_inplace(self, annotations):
|
||||
"""Fills in missing video IDs inplace.
|
||||
|
||||
Adapted from https://github.com/TAO-Dataset/.
|
||||
"""
|
||||
missing_video_id = [x for x in annotations if "video_id" not in x]
|
||||
if missing_video_id:
|
||||
image_id_to_video_id = {
|
||||
x["id"]: x["video_id"] for x in self.gt_data["images"]
|
||||
}
|
||||
for x in missing_video_id:
|
||||
x["video_id"] = image_id_to_video_id[x["image_id"]]
|
||||
|
||||
@staticmethod
|
||||
def _make_tk_ids_unique(annotations):
|
||||
"""Makes track IDs unqiue over the whole annotation set.
|
||||
|
||||
Adapted from https://github.com/TAO-Dataset/.
|
||||
"""
|
||||
track_id_videos = {}
|
||||
track_ids_to_update = set()
|
||||
max_track_id = 0
|
||||
for ann in annotations:
|
||||
t = ann["track_id"]
|
||||
if t not in track_id_videos:
|
||||
track_id_videos[t] = ann["video_id"]
|
||||
|
||||
if ann["video_id"] != track_id_videos[t]:
|
||||
# track id is assigned to multiple videos
|
||||
track_ids_to_update.add(t)
|
||||
max_track_id = max(max_track_id, t)
|
||||
|
||||
if track_ids_to_update:
|
||||
print("true")
|
||||
next_id = itertools.count(max_track_id + 1)
|
||||
new_tk_ids = defaultdict(lambda: next(next_id))
|
||||
for ann in annotations:
|
||||
t = ann["track_id"]
|
||||
v = ann["video_id"]
|
||||
if t in track_ids_to_update:
|
||||
ann["track_id"] = new_tk_ids[t, v]
|
||||
return len(track_ids_to_update)
|
||||
275
sam3/eval/teta_eval_toolkit/eval.py
Normal file
275
sam3/eval/teta_eval_toolkit/eval.py
Normal file
@@ -0,0 +1,275 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
import copy
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
import traceback
|
||||
from functools import partial
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import numpy as np
|
||||
|
||||
from . import _timing, utils
|
||||
from .config import get_default_eval_config, init_config
|
||||
from .utils import TrackEvalException
|
||||
|
||||
|
||||
class Evaluator:
|
||||
"""Evaluator class for evaluating different metrics for each datasets."""
|
||||
|
||||
def __init__(self, config=None):
    """Build an evaluator, merging *config* over the default eval config."""
    self.config = init_config(config, get_default_eval_config(), "Eval")
    cfg = self.config
    # Timing output would interleave across workers, so only enable the
    # timing analysis for serial runs.
    if cfg["TIME_PROGRESS"] and not cfg["USE_PARALLEL"]:
        _timing.DO_TIMING = True
        if cfg["DISPLAY_LESS_PROGRESS"]:
            _timing.DISPLAY_LESS_PROGRESS = True
|
||||
|
||||
@_timing.time
|
||||
def evaluate(self, dataset_list, metrics_list):
|
||||
"""Evaluate a set of metrics on a set of datasets."""
|
||||
config = self.config
|
||||
metrics_list = metrics_list
|
||||
metric_names = utils.validate_metrics_list(metrics_list)
|
||||
dataset_names = [dataset.get_name() for dataset in dataset_list]
|
||||
output_res = {}
|
||||
output_msg = {}
|
||||
|
||||
for dataset, dname in zip(dataset_list, dataset_names):
|
||||
# Get dataset info about what to evaluate
|
||||
output_res[dname] = {}
|
||||
output_msg[dname] = {}
|
||||
tracker_list, seq_list, class_list = dataset.get_eval_info()
|
||||
print(
|
||||
f"\nEvaluating {len(tracker_list)} tracker(s) on "
|
||||
f"{len(seq_list)} sequence(s) for {len(class_list)} class(es)"
|
||||
f" on {dname} dataset using the following "
|
||||
f'metrics: {", ".join(metric_names)}\n'
|
||||
)
|
||||
|
||||
# Evaluate each tracker
|
||||
for tracker in tracker_list:
|
||||
try:
|
||||
output_res, output_msg = self.evaluate_tracker(
|
||||
tracker,
|
||||
dataset,
|
||||
dname,
|
||||
class_list,
|
||||
metrics_list,
|
||||
metric_names,
|
||||
seq_list,
|
||||
output_res,
|
||||
output_msg,
|
||||
)
|
||||
except Exception as err:
|
||||
output_res[dname][tracker] = None
|
||||
if type(err) == TrackEvalException:
|
||||
output_msg[dname][tracker] = str(err)
|
||||
else:
|
||||
output_msg[dname][tracker] = "Unknown error occurred."
|
||||
print("Tracker %s was unable to be evaluated." % tracker)
|
||||
print(err)
|
||||
traceback.print_exc()
|
||||
if config["LOG_ON_ERROR"] is not None:
|
||||
with open(config["LOG_ON_ERROR"], "a") as f:
|
||||
print(dname, file=f)
|
||||
print(tracker, file=f)
|
||||
print(traceback.format_exc(), file=f)
|
||||
print("\n\n\n", file=f)
|
||||
if config["BREAK_ON_ERROR"]:
|
||||
raise err
|
||||
elif config["RETURN_ON_ERROR"]:
|
||||
return output_res, output_msg
|
||||
|
||||
return output_res, output_msg
|
||||
|
||||
def evaluate_tracker(
|
||||
self,
|
||||
tracker,
|
||||
dataset,
|
||||
dname,
|
||||
class_list,
|
||||
metrics_list,
|
||||
metric_names,
|
||||
seq_list,
|
||||
output_res,
|
||||
output_msg,
|
||||
):
|
||||
"""Evaluate each sequence in parallel or in series."""
|
||||
print("\nEvaluating %s\n" % tracker)
|
||||
time_start = time.time()
|
||||
config = self.config
|
||||
if config["USE_PARALLEL"]:
|
||||
with Pool(config["NUM_PARALLEL_CORES"]) as pool:
|
||||
_eval_sequence = partial(
|
||||
eval_sequence,
|
||||
dataset=dataset,
|
||||
tracker=tracker,
|
||||
class_list=class_list,
|
||||
metrics_list=metrics_list,
|
||||
metric_names=metric_names,
|
||||
)
|
||||
results = pool.map(_eval_sequence, seq_list)
|
||||
res = dict(zip(seq_list, results))
|
||||
else:
|
||||
res = {}
|
||||
for curr_seq in sorted(seq_list):
|
||||
res[curr_seq] = eval_sequence(
|
||||
curr_seq, dataset, tracker, class_list, metrics_list, metric_names
|
||||
)
|
||||
|
||||
|
||||
# collecting combined cls keys (cls averaged, det averaged, super classes)
|
||||
cls_keys = []
|
||||
res["COMBINED_SEQ"] = {}
|
||||
# combine sequences for each class
|
||||
for c_cls in class_list:
|
||||
res["COMBINED_SEQ"][c_cls] = {}
|
||||
for metric, mname in zip(metrics_list, metric_names):
|
||||
curr_res = {
|
||||
seq_key: seq_value[c_cls][mname]
|
||||
for seq_key, seq_value in res.items()
|
||||
if seq_key != "COMBINED_SEQ"
|
||||
}
|
||||
# combine results over all sequences and then over all classes
|
||||
res["COMBINED_SEQ"][c_cls][mname] = metric.combine_sequences(curr_res)
|
||||
|
||||
# combine classes
|
||||
if dataset.should_classes_combine:
|
||||
if config["OUTPUT_PER_SEQ_RES"]:
|
||||
video_keys = res.keys()
|
||||
else:
|
||||
video_keys = ["COMBINED_SEQ"]
|
||||
for v_key in video_keys:
|
||||
cls_keys += ["average"]
|
||||
res[v_key]["average"] = {}
|
||||
for metric, mname in zip(metrics_list, metric_names):
|
||||
cls_res = {
|
||||
cls_key: cls_value[mname]
|
||||
for cls_key, cls_value in res[v_key].items()
|
||||
if cls_key not in cls_keys
|
||||
}
|
||||
res[v_key]["average"][
|
||||
mname
|
||||
] = metric.combine_classes_class_averaged(
|
||||
cls_res, ignore_empty=True
|
||||
)
|
||||
|
||||
# combine classes to super classes
|
||||
if dataset.use_super_categories:
|
||||
for cat, sub_cats in dataset.super_categories.items():
|
||||
cls_keys.append(cat)
|
||||
res["COMBINED_SEQ"][cat] = {}
|
||||
for metric, mname in zip(metrics_list, metric_names):
|
||||
cat_res = {
|
||||
cls_key: cls_value[mname]
|
||||
for cls_key, cls_value in res["COMBINED_SEQ"].items()
|
||||
if cls_key in sub_cats
|
||||
}
|
||||
res["COMBINED_SEQ"][cat][
|
||||
mname
|
||||
] = metric.combine_classes_det_averaged(cat_res)
|
||||
# Print and output results in various formats
|
||||
if config["TIME_PROGRESS"]:
|
||||
print(
|
||||
f"\nAll sequences for {tracker} finished in"
|
||||
f" {time.time() - time_start} seconds"
|
||||
)
|
||||
output_fol = dataset.get_output_fol(tracker)
|
||||
os.makedirs(output_fol, exist_ok=True)
|
||||
|
||||
# take a mean of each field of each thr
|
||||
if config["OUTPUT_PER_SEQ_RES"]:
|
||||
all_res = copy.deepcopy(res)
|
||||
summary_keys = res.keys()
|
||||
else:
|
||||
all_res = copy.deepcopy(res["COMBINED_SEQ"])
|
||||
summary_keys = ["COMBINED_SEQ"]
|
||||
thr_key_list = [50]
|
||||
for s_key in summary_keys:
|
||||
for metric, mname in zip(metrics_list, metric_names):
|
||||
if mname != "TETA":
|
||||
if s_key == "COMBINED_SEQ":
|
||||
metric.print_table(
|
||||
{"COMBINED_SEQ": res["COMBINED_SEQ"][cls_keys[0]][mname]},
|
||||
tracker,
|
||||
cls_keys[0],
|
||||
)
|
||||
continue
|
||||
|
||||
for c_cls in res[s_key].keys():
|
||||
for thr in thr_key_list:
|
||||
all_res[s_key][c_cls][mname][thr] = metric._summary_row(
|
||||
res[s_key][c_cls][mname][thr]
|
||||
)
|
||||
x = (
|
||||
np.array(list(all_res[s_key][c_cls]["TETA"].values()))
|
||||
.astype("float")
|
||||
.mean(axis=0)
|
||||
)
|
||||
all_res_summary = list(x.round(decimals=2).astype("str"))
|
||||
all_res[s_key][c_cls][mname]["ALL"] = all_res_summary
|
||||
if config["OUTPUT_SUMMARY"] and s_key == "COMBINED_SEQ":
|
||||
for t in thr_key_list:
|
||||
metric.print_summary_table(
|
||||
all_res[s_key][cls_keys[0]][mname][t],
|
||||
t,
|
||||
tracker,
|
||||
cls_keys[0],
|
||||
)
|
||||
|
||||
if config["OUTPUT_TEM_RAW_DATA"]:
|
||||
out_file = os.path.join(output_fol, "teta_summary_results.pth")
|
||||
pickle.dump(all_res, open(out_file, "wb"))
|
||||
print("Saved the TETA summary results.")
|
||||
|
||||
# output
|
||||
output_res[dname][mname] = all_res[s_key][cls_keys[0]][mname][t]
|
||||
output_msg[dname][tracker] = "Success"
|
||||
|
||||
return output_res, output_msg
|
||||
|
||||
|
||||
@_timing.time
def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
    """Function for evaluating a single sequence.

    Returns:
        dict mapping class name -> {metric name -> per-sequence result}.
    """
    raw_data = dataset.get_raw_seq_data(tracker, seq)
    seq_res = {}

    if "TETA" in metric_names:
        thresholds = [50]
        data_all_class = dataset.get_preprocessed_seq_data(
            raw_data, "all", thresholds=thresholds
        )
        teta = metrics_list[metric_names.index("TETA")]
        # Global TP assignment computed once over all classes, then reused
        # for each class's preprocessing below.
        assignment = teta.compute_global_assignment(data_all_class)

        # create a dict to save Cls_FP for each class in different thr.
        cls_fp = {
            key: {
                cls: np.zeros((len(np.arange(0.5, 0.99, 0.05)))) for cls in class_list
            }
            for key in thresholds
        }

    # NOTE(review): `assignment` and `thresholds` are only bound inside the
    # "TETA" branch above, so this loop would raise NameError if "TETA" is not
    # among metric_names — confirm TETA is always required here.
    for cls in class_list:
        seq_res[cls] = {}
        data = dataset.get_preprocessed_seq_data(raw_data, cls, assignment, thresholds)

        for metric, mname in zip(metrics_list, metric_names):
            if mname == "TETA":
                # TETA also accumulates per-class classification FPs.
                seq_res[cls][mname], cls_fp, _ = metric.eval_sequence(
                    data, cls, dataset.clsid2cls_name, cls_fp
                )
            else:
                seq_res[cls][mname] = metric.eval_sequence(data)

    if "TETA" in metric_names:
        # Fold the globally accumulated classification FPs back into each
        # class's per-threshold result.
        for thr in thresholds:
            for cls in class_list:
                seq_res[cls]["TETA"][thr]["Cls_FP"] += cls_fp[thr][cls]

    return seq_res
|
||||
4
sam3/eval/teta_eval_toolkit/metrics/__init__.py
Normal file
4
sam3/eval/teta_eval_toolkit/metrics/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
from .teta import TETA
|
||||
148
sam3/eval/teta_eval_toolkit/metrics/_base_metric.py
Normal file
148
sam3/eval/teta_eval_toolkit/metrics/_base_metric.py
Normal file
@@ -0,0 +1,148 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .. import _timing
|
||||
from ..utils import TrackEvalException
|
||||
|
||||
|
||||
class _BaseMetric(ABC):
    """Abstract base for metrics: field registry plus combine/print helpers."""

    @abstractmethod
    def __init__(self):
        # Field registries; concrete metrics populate these in their __init__.
        self.plottable = False
        self.integer_fields = []
        self.float_fields = []
        self.array_labels = []
        self.integer_array_fields = []
        self.float_array_fields = []
        self.fields = []
        self.summary_fields = []
        self.registered = False

    #####################################################################
    # Abstract functions for subclasses to implement

    @_timing.time
    @abstractmethod
    def eval_sequence(self, data):
        """Compute the metric for one preprocessed sequence."""
        ...

    @abstractmethod
    def combine_sequences(self, all_res):
        """Combine per-sequence results into a single result."""
        ...

    @abstractmethod
    def combine_classes_class_averaged(self, all_res, ignore_empty=False):
        """Combine per-class results by averaging over classes."""
        ...

    @abstractmethod
    def combine_classes_det_averaged(self, all_res):
        """Combine per-class results by averaging over detections."""
        ...

    def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
        """Plot results, only valid for metrics with self.plottable."""
        # Plottable subclasses are expected to override this; non-plottable
        # metrics silently do nothing.
        if self.plottable:
            raise NotImplementedError(
                f"plot_results is not implemented for metric {self.get_name()}"
            )
        else:
            pass

    #####################################################################
    # Helper functions which are useful for all metrics:

    @classmethod
    def get_name(cls):
        """Return the metric's display name (the class name)."""
        return cls.__name__

    @staticmethod
    def _combine_sum(all_res, field):
        """Combine sequence results via sum"""
        return sum([all_res[k][field] for k in all_res.keys()])

    @staticmethod
    def _combine_weighted_av(all_res, field, comb_res, weight_field):
        """Combine sequence results via weighted average."""
        # Denominator is clamped to >= 1 to avoid division by zero.
        return sum(
            [all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]
        ) / np.maximum(1.0, comb_res[weight_field])

    def print_table(self, table_res, tracker, cls):
        """Print table of results for all sequences."""
        print("")
        metric_name = self.get_name()
        self._row_print(
            [metric_name + ": " + tracker + "-" + cls] + self.summary_fields
        )
        for seq, results in sorted(table_res.items()):
            if seq == "COMBINED_SEQ":
                continue
            summary_res = self._summary_row(results)
            self._row_print([seq] + summary_res)
        # The combined row is always printed last.
        summary_res = self._summary_row(table_res["COMBINED_SEQ"])
        self._row_print(["COMBINED"] + summary_res)

    def _summary_row(self, results_):
        """Format one result dict into a list of display strings.

        Float (array) fields are shown as percentages; integer fields as ints.
        """
        vals = []
        for h in self.summary_fields:
            if h in self.float_array_fields:
                vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
            elif h in self.float_fields:
                vals.append("{0:1.5g}".format(100 * float(results_[h])))
            elif h in self.integer_fields:
                vals.append("{0:d}".format(int(results_[h])))
            else:
                raise NotImplementedError(
                    "Summary function not implemented for this field type."
                )
        return vals

    @staticmethod
    def _row_print(*argv):
        """Print results in evenly spaced rows, with more space in first row."""
        # Accept either a single list argument or varargs.
        if len(argv) == 1:
            argv = argv[0]
        to_print = "%-35s" % argv[0]
        for v in argv[1:]:
            to_print += "%-10s" % str(v)
        print(to_print)

    def summary_results(self, table_res):
        """Return a simple summary of final results for a tracker."""
        return dict(
            zip(self.summary_fields, self._summary_row(table_res["COMBINED_SEQ"]),)
        )

    def detailed_results(self, table_res):
        """Return detailed final results for a tracker."""
        # Get detailed field information
        detailed_fields = self.float_fields + self.integer_fields
        for h in self.float_array_fields + self.integer_array_fields:
            # One column per alpha threshold, plus an area-under-curve column.
            for alpha in [int(100 * x) for x in self.array_labels]:
                detailed_fields.append(h + "___" + str(alpha))
            detailed_fields.append(h + "___AUC")

        # Get detailed results
        detailed_results = {}
        for seq, res in table_res.items():
            detailed_row = self._detailed_row(res)
            if len(detailed_row) != len(detailed_fields):
                raise TrackEvalException(
                    f"Field names and data have different sizes "
                    f"({len(detailed_row)} and {len(detailed_fields)})"
                )
            detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
        return detailed_results

    def _detailed_row(self, res):
        """Flatten one result dict into a row matching detailed_results' fields."""
        detailed_row = []
        for h in self.float_fields + self.integer_fields:
            detailed_row.append(res[h])
        for h in self.float_array_fields + self.integer_array_fields:
            for i, _ in enumerate([int(100 * x) for x in self.array_labels]):
                detailed_row.append(res[h][i])
            # AUC column: mean over all alpha values.
            detailed_row.append(np.mean(res[h]))
        return detailed_row
|
||||
399
sam3/eval/teta_eval_toolkit/metrics/teta.py
Normal file
399
sam3/eval/teta_eval_toolkit/metrics/teta.py
Normal file
@@ -0,0 +1,399 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
"""Track Every Thing Accuracy metric."""
|
||||
|
||||
import numpy as np
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from .. import _timing
|
||||
from ._base_metric import _BaseMetric
|
||||
|
||||
EPS = np.finfo("float").eps # epsilon
|
||||
|
||||
|
||||
class TETA(_BaseMetric):
    """TETA metric.

    Tracks localization (Loc), association (Assoc) and classification (Cls)
    statistics per alpha threshold and combines them into the final TETA score.
    """

    def __init__(self, exhaustive=False, config=None):
        """Initialize metric.

        Args:
            exhaustive: if True, unmatched class-eval detections are counted
                as classification false positives.
            config: unused here; kept for a uniform metric constructor signature.
        """
        super().__init__()
        self.plottable = True
        # Localization alphas span [0, 0.95] in steps of 0.05; classification
        # stats are only kept for alphas >= 0.5 (10 fewer entries).
        self.array_labels = np.arange(0.0, 0.99, 0.05)
        self.cls_array_labels = np.arange(0.5, 0.99, 0.05)

        self.integer_array_fields = [
            "Loc_TP",
            "Loc_FN",
            "Loc_FP",
            "Cls_TP",
            "Cls_FN",
            "Cls_FP",
        ]
        self.float_array_fields = (
            ["TETA", "LocA", "AssocA", "ClsA"]
            + ["LocRe", "LocPr"]
            + ["AssocRe", "AssocPr"]
            + ["ClsRe", "ClsPr"]
        )
        self.fields = self.float_array_fields + self.integer_array_fields
        self.summary_fields = self.float_array_fields
        self.exhaustive = exhaustive

    def compute_global_assignment(self, data_thr, alpha=0.5):
        """Compute global assignment of TP.

        Returns:
            dict: thr -> timestep -> {gt original id: matched tracker original id}.
        """
        res = {
            thr: {t: {} for t in range(data_thr[thr]["num_timesteps"])}
            for thr in data_thr
        }

        for thr in data_thr:
            data = data_thr[thr]
            # return empty result if tracker or gt sequence is empty
            # NOTE(review): this returns from inside the thr loop, skipping any
            # remaining thresholds — confirm this early-out is intended.
            if data["num_tk_overlap_dets"] == 0 or data["num_gt_dets"] == 0:
                return res

            # global alignment score
            ga_score, _, _ = self.compute_global_alignment_score(data)

            # calculate scores for each timestep
            for t, (gt_ids_t, tk_ids_t) in enumerate(
                zip(data["gt_ids"], data["tk_ids"])
            ):
                # get matches optimizing for TETA
                # Single alpha is passed, so only index [0] of the per-alpha
                # match lists is used below.
                amatch_rows, amatch_cols = self.compute_matches(
                    data, t, ga_score, gt_ids_t, tk_ids_t, alpha=alpha
                )
                gt_ids = [data["gt_id_map"][tid] for tid in gt_ids_t[amatch_rows[0]]]
                matched_ids = [
                    data["tk_id_map"][tid] for tid in tk_ids_t[amatch_cols[0]]
                ]
                res[thr][t] = dict(zip(gt_ids, matched_ids))

        return res

    def eval_sequence_single_thr(self, data, cls, cid2clsname, cls_fp_thr, thr):
        """Computes TETA metric for one threshold for one sequence."""
        res = {}
        # NOTE(review): class_info_list is never appended to below; it is
        # always returned empty.
        class_info_list = []
        for field in self.float_array_fields + self.integer_array_fields:
            if field.startswith("Cls"):
                res[field] = np.zeros(len(self.cls_array_labels), dtype=float)
            else:
                res[field] = np.zeros((len(self.array_labels)), dtype=float)

        # return empty result if tracker or gt sequence is empty
        if data["num_tk_overlap_dets"] == 0:
            # All GT detections count as localization false negatives.
            res["Loc_FN"] = data["num_gt_dets"] * np.ones(
                (len(self.array_labels)), dtype=float
            )
            if self.exhaustive:
                cls_fp_thr[cls] = data["num_tk_cls_dets"] * np.ones(
                    (len(self.cls_array_labels)), dtype=float
                )
            res = self._compute_final_fields(res)
            return res, cls_fp_thr, class_info_list

        if data["num_gt_dets"] == 0:
            if self.exhaustive:
                cls_fp_thr[cls] = data["num_tk_cls_dets"] * np.ones(
                    (len(self.cls_array_labels)), dtype=float
                )
            res = self._compute_final_fields(res)
            return res, cls_fp_thr, class_info_list

        # global alignment score
        ga_score, gt_id_count, tk_id_count = self.compute_global_alignment_score(data)
        # One (num_gt_ids x num_tk_ids) match-count matrix per alpha.
        matches_counts = [np.zeros_like(ga_score) for _ in self.array_labels]

        # calculate scores for each timestep
        for t, (gt_ids_t, tk_ids_t, tk_overlap_ids_t, tk_cls_ids_t) in enumerate(
            zip(
                data["gt_ids"],
                data["tk_ids"],
                data["tk_overlap_ids"],
                data["tk_class_eval_tk_ids"],
            )
        ):
            # deal with the case that there are no gt_det/tk_det in a timestep
            if len(gt_ids_t) == 0:
                if self.exhaustive:
                    cls_fp_thr[cls] += len(tk_cls_ids_t)
                continue

            # get matches optimizing for TETA
            amatch_rows, amatch_cols = self.compute_matches(
                data, t, ga_score, gt_ids_t, tk_ids_t, list(self.array_labels)
            )

            # map overlap_ids to original ids.
            if len(tk_overlap_ids_t) != 0:
                # Locate each overlap id's column in tk_ids_t via a sorted lookup.
                sorter = np.argsort(tk_ids_t)
                indexes = sorter[
                    np.searchsorted(tk_ids_t, tk_overlap_ids_t, sorter=sorter)
                ]
                sim_t = data["sim_scores"][t][:, indexes]
                # Candidate localization FPs: overlap dets whose similarity with
                # any GT clears the threshold (thr is in percent).
                fpl_candidates = tk_overlap_ids_t[(sim_t >= (thr / 100)).any(axis=0)]
                fpl_candidates_ori_ids_t = np.array(
                    [data["tk_id_map"][tid] for tid in fpl_candidates]
                )
            else:
                fpl_candidates_ori_ids_t = []

            if self.exhaustive:
                cls_fp_thr[cls] += len(tk_cls_ids_t) - len(tk_overlap_ids_t)

            # calculate and accumulate basic statistics
            for a, alpha in enumerate(self.array_labels):
                match_row, match_col = amatch_rows[a], amatch_cols[a]
                num_matches = len(match_row)
                matched_ori_ids = set(
                    [data["tk_id_map"][tid] for tid in tk_ids_t[match_col]]
                )
                match_tk_cls = data["tk_classes"][t][match_col]
                wrong_tk_cls = match_tk_cls[match_tk_cls != data["gt_classes"][t]]

                num_class_and_det_matches = np.sum(
                    match_tk_cls == data["gt_classes"][t]
                )

                if alpha >= 0.5:
                    # Offset of 10 maps the index in array_labels (start 0.0,
                    # step 0.05) to cls_array_labels (start 0.5): 0.5/0.05 == 10.
                    for cid in wrong_tk_cls:
                        if cid in cid2clsname:
                            cname = cid2clsname[cid]
                            cls_fp_thr[cname][a - 10] += 1
                    res["Cls_TP"][a - 10] += num_class_and_det_matches
                    res["Cls_FN"][a - 10] += num_matches - num_class_and_det_matches

                res["Loc_TP"][a] += num_matches
                res["Loc_FN"][a] += len(gt_ids_t) - num_matches
                # FP = candidate overlap dets that were not matched to any GT.
                res["Loc_FP"][a] += len(set(fpl_candidates_ori_ids_t) - matched_ori_ids)

                if num_matches > 0:
                    matches_counts[a][gt_ids_t[match_row], tk_ids_t[match_col]] += 1

        # calculate AssocA, AssocRe, AssocPr
        self.compute_association_scores(res, matches_counts, gt_id_count, tk_id_count)

        # calculate final scores
        res = self._compute_final_fields(res)
        return res, cls_fp_thr, class_info_list

    def compute_global_alignment_score(self, data):
        """Computes global alignment score.

        Returns:
            (ga_score, gt_id_count, tk_id_count) where ga_score is a Jaccard
            alignment matrix over (gt id, tracker id) pairs.
        """
        num_matches = np.zeros((data["num_gt_ids"], data["num_tk_ids"]))
        gt_id_count = np.zeros((data["num_gt_ids"], 1))
        tk_id_count = np.zeros((1, data["num_tk_ids"]))

        # loop through each timestep and accumulate global track info.
        for t, (gt_ids_t, tk_ids_t) in enumerate(zip(data["gt_ids"], data["tk_ids"])):
            # count potential matches between ids in each time step
            # these are normalized, weighted by match similarity
            sim = data["sim_scores"][t]
            sim_iou_denom = sim.sum(0, keepdims=True) + sim.sum(1, keepdims=True) - sim
            sim_iou = np.zeros_like(sim)
            # Guard against division by (near-)zero denominators.
            mask = sim_iou_denom > (0 + EPS)
            sim_iou[mask] = sim[mask] / sim_iou_denom[mask]
            num_matches[gt_ids_t[:, None], tk_ids_t[None, :]] += sim_iou

            # calculate total number of dets for each gt_id and tk_id.
            gt_id_count[gt_ids_t] += 1
            tk_id_count[0, tk_ids_t] += 1

        # Calculate overall Jaccard alignment score between IDs
        ga_score = num_matches / (gt_id_count + tk_id_count - num_matches)
        return ga_score, gt_id_count, tk_id_count

    def compute_matches(self, data, t, ga_score, gt_ids, tk_ids, alpha):
        """Compute matches based on alignment score.

        Args:
            alpha: a single threshold or a list of thresholds.

        Returns:
            Two lists (row indices, column indices) with one entry per alpha.
        """
        sim = data["sim_scores"][t]
        # Weight per-frame similarity by the global track alignment score so
        # the assignment prefers globally consistent matches.
        score_mat = ga_score[gt_ids[:, None], tk_ids[None, :]] * sim
        # Hungarian algorithm to find best matches
        match_rows, match_cols = linear_sum_assignment(-score_mat)

        if not isinstance(alpha, list):
            alpha = [alpha]
        alpha_match_rows, alpha_match_cols = [], []
        for a in alpha:
            # Keep only matches whose raw similarity clears this alpha.
            matched_mask = sim[match_rows, match_cols] >= a - EPS
            alpha_match_rows.append(match_rows[matched_mask])
            alpha_match_cols.append(match_cols[matched_mask])
        return alpha_match_rows, alpha_match_cols

    def compute_association_scores(self, res, matches_counts, gt_id_count, tk_id_count):
        """Calculate association scores for each alpha.

        First calculate scores per gt_id/tk_id combo,
        and then average over the number of detections.
        """
        for a, _ in enumerate(self.array_labels):
            matches_count = matches_counts[a]
            ass_a = matches_count / np.maximum(
                1, gt_id_count + tk_id_count - matches_count
            )
            res["AssocA"][a] = np.sum(matches_count * ass_a) / np.maximum(
                1, res["Loc_TP"][a]
            )
            ass_re = matches_count / np.maximum(1, gt_id_count)
            res["AssocRe"][a] = np.sum(matches_count * ass_re) / np.maximum(
                1, res["Loc_TP"][a]
            )
            ass_pr = matches_count / np.maximum(1, tk_id_count)
            res["AssocPr"][a] = np.sum(matches_count * ass_pr) / np.maximum(
                1, res["Loc_TP"][a]
            )

    @_timing.time
    def eval_sequence(self, data, cls, cls_id_name_mapping, cls_fp):
        """Evaluate a single sequence across all thresholds."""
        res = {}
        class_info_dict = {}

        for thr in data:
            res[thr], cls_fp[thr], cls_info = self.eval_sequence_single_thr(
                data[thr], cls, cls_id_name_mapping, cls_fp[thr], thr
            )
            class_info_dict[thr] = cls_info

        return res, cls_fp, class_info_dict

    def combine_sequences(self, all_res):
        """Combines metrics across all sequences."""
        data = {}
        res = {}

        # Thresholds are taken from any one sequence; default to [50] if empty.
        if all_res:
            thresholds = list(list(all_res.values())[0].keys())
        else:
            thresholds = [50]
        for thr in thresholds:
            data[thr] = {}
            for seq_key in all_res:
                data[thr][seq_key] = all_res[seq_key][thr]
        for thr in thresholds:
            res[thr] = self._combine_sequences_thr(data[thr])

        return res

    def _combine_sequences_thr(self, all_res):
        """Combines sequences over each threshold."""
        res = {}
        for field in self.integer_array_fields:
            res[field] = self._combine_sum(all_res, field)
        # Association scores are averaged weighted by localization TPs.
        for field in ["AssocRe", "AssocPr", "AssocA"]:
            res[field] = self._combine_weighted_av(
                all_res, field, res, weight_field="Loc_TP"
            )
        res = self._compute_final_fields(res)
        return res

    def combine_classes_class_averaged(self, all_res, ignore_empty=False):
        """Combines metrics across all classes by averaging over classes.

        If 'ignore_empty' is True, then it only sums over classes
        with at least one gt or predicted detection.
        """
        data = {}
        res = {}
        if all_res:
            thresholds = list(list(all_res.values())[0].keys())
        else:
            thresholds = [50]
        for thr in thresholds:
            data[thr] = {}
            for cls_key in all_res:
                data[thr][cls_key] = all_res[cls_key][thr]
        for thr in data:
            res[thr] = self._combine_classes_class_averaged_thr(
                data[thr], ignore_empty=ignore_empty
            )
        return res

    def _combine_classes_class_averaged_thr(self, all_res, ignore_empty=False):
        """Combines classes over each threshold."""
        res = {}

        def check_empty(val):
            """Returns True if empty."""
            # A class is empty when it has no localization TP/FN/FP at any alpha.
            return not (val["Loc_TP"] + val["Loc_FN"] + val["Loc_FP"] > 0 + EPS).any()

        for field in self.integer_array_fields:
            if ignore_empty:
                res_field = {k: v for k, v in all_res.items() if not check_empty(v)}
            else:
                res_field = {k: v for k, v in all_res.items()}
            res[field] = self._combine_sum(res_field, field)

        for field in self.float_array_fields:
            if ignore_empty:
                res_field = [v[field] for v in all_res.values() if not check_empty(v)]
            else:
                res_field = [v[field] for v in all_res.values()]
            res[field] = np.mean(res_field, axis=0)
        return res

    def combine_classes_det_averaged(self, all_res):
        """Combines metrics across all classes by averaging over detections."""
        data = {}
        res = {}
        if all_res:
            thresholds = list(list(all_res.values())[0].keys())
        else:
            thresholds = [50]
        for thr in thresholds:
            data[thr] = {}
            for cls_key in all_res:
                data[thr][cls_key] = all_res[cls_key][thr]
        for thr in data:
            res[thr] = self._combine_classes_det_averaged_thr(data[thr])
        return res

    def _combine_classes_det_averaged_thr(self, all_res):
        """Combines detections over each threshold."""
        res = {}
        for field in self.integer_array_fields:
            res[field] = self._combine_sum(all_res, field)
        for field in ["AssocRe", "AssocPr", "AssocA"]:
            res[field] = self._combine_weighted_av(
                all_res, field, res, weight_field="Loc_TP"
            )
        res = self._compute_final_fields(res)
        return res

    @staticmethod
    def _compute_final_fields(res):
        """Calculate final metric values.

        This function is used both for both per-sequence calculation,
        and in combining values across sequences.
        """
        # LocA
        res["LocRe"] = res["Loc_TP"] / np.maximum(1, res["Loc_TP"] + res["Loc_FN"])
        res["LocPr"] = res["Loc_TP"] / np.maximum(1, res["Loc_TP"] + res["Loc_FP"])
        res["LocA"] = res["Loc_TP"] / np.maximum(
            1, res["Loc_TP"] + res["Loc_FN"] + res["Loc_FP"]
        )

        # ClsA
        res["ClsRe"] = res["Cls_TP"] / np.maximum(1, res["Cls_TP"] + res["Cls_FN"])
        res["ClsPr"] = res["Cls_TP"] / np.maximum(1, res["Cls_TP"] + res["Cls_FP"])
        res["ClsA"] = res["Cls_TP"] / np.maximum(
            1, res["Cls_TP"] + res["Cls_FN"] + res["Cls_FP"]
        )

        # Classification scores are collapsed to scalars (mean over alphas);
        # TETA below therefore adds a scalar ClsA to the LocA/AssocA arrays
        # via numpy broadcasting.
        res["ClsRe"] = np.mean(res["ClsRe"])
        res["ClsPr"] = np.mean(res["ClsPr"])
        res["ClsA"] = np.mean(res["ClsA"])

        res["TETA"] = (res["LocA"] + res["AssocA"] + res["ClsA"]) / 3

        return res

    def print_summary_table(self, thr_res, thr, tracker, cls):
        """Prints summary table of results."""
        print("")
        metric_name = self.get_name()
        self._row_print(
            [f"{metric_name}{str(thr)}: {tracker}-{cls}"] + self.summary_fields
        )
        self._row_print(["COMBINED"] + thr_res)
|
||||
46
sam3/eval/teta_eval_toolkit/utils.py
Normal file
46
sam3/eval/teta_eval_toolkit/utils.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
import csv
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
def validate_metrics_list(metrics_list):
    """Return the class names of the given metrics after validating them.

    Ensures that metric class names are unique across the list, and that no
    two metric classes declare a field with the same name.

    Raises:
        TrackEvalException: on duplicate metric names or duplicate fields.
    """
    names = [m.get_name() for m in metrics_list]
    # Duplicate metric names would make results ambiguous.
    if len(set(names)) != len(names):
        raise TrackEvalException(
            "Code being run with multiple metrics of the same name"
        )
    # Flatten every metric's declared fields and check for collisions.
    all_fields = [field for metric in metrics_list for field in metric.fields]
    if len(set(all_fields)) != len(all_fields):
        raise TrackEvalException(
            "Code being run with multiple metrics with fields of the same name"
        )
    return names
|
||||
|
||||
|
||||
def get_track_id_str(ann):
    """Get name of track ID in annotation.

    Checks the candidate keys in priority order and returns the first one
    present in ``ann``.

    Args:
        ann: a single annotation dict.

    Returns:
        The key name under which the track/instance ID is stored.

    Raises:
        AssertionError: if none of the known ID keys is present.
    """
    # Priority order matters: "track_id" wins over "instance_id", which wins
    # over "scalabel_id".
    for key in ("track_id", "instance_id", "scalabel_id"):
        if key in ann:
            return key
    # Raise explicitly instead of `assert False`: bare asserts are stripped
    # under `python -O`. AssertionError (with the original message) is kept so
    # existing callers catching it still work.
    raise AssertionError("No track/instance ID.")
|
||||
|
||||
|
||||
class TrackEvalException(Exception):
    """Custom exception for catching expected errors."""

    # No extra behavior; exists so callers can distinguish expected evaluation
    # failures from unexpected ones.
    ...
|
||||
146
sam3/eval/ytvis_coco_wrapper.py
Normal file
146
sam3/eval/ytvis_coco_wrapper.py
Normal file
@@ -0,0 +1,146 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
|
||||
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import pycocotools.mask as mask_util
|
||||
from pycocotools.coco import COCO
|
||||
from typing_extensions import override
|
||||
|
||||
|
||||
class YTVIS(COCO):
|
||||
"""
|
||||
Helper class for reading YT-VIS annotations
|
||||
"""
|
||||
|
||||
    @override
    def __init__(self, annotation_file: str = None, ignore_gt_cats: bool = True):
        """
        Args:
            annotation_file: Path to the annotation file
            ignore_gt_cats: If True, we ignore the ground truth categories and replace them with a dummy "object" category. This is useful for Phrase AP evaluation.
        """
        # Must be set before super().__init__: the parent constructor builds
        # the index via createIndex(), which reads this flag.
        self.ignore_gt_cats = ignore_gt_cats
        super().__init__(annotation_file=annotation_file)
|
||||
|
||||
    @override
    def createIndex(self):
        # We rename some keys to match the COCO format before creating the index.
        if "annotations" in self.dataset:
            for ann in self.dataset["annotations"]:
                # YT-VIS annotations reference a video; COCO indexes "image_id".
                if "video_id" in ann:
                    ann["image_id"] = int(ann.pop("video_id"))
                if self.ignore_gt_cats:
                    # Collapse all GT categories into the dummy "object" id (-1).
                    ann["category_id"] = -1
                else:
                    ann["category_id"] = int(ann["category_id"])
                if "bboxes" in ann:
                    # note that in some datasets we load under this YTVIS class,
                    # some "bboxes" could be None for when the GT object is invisible,
                    # so we replace them with [0, 0, 0, 0]
                    ann["bboxes"] = [
                        bbox if bbox is not None else [0, 0, 0, 0]
                        for bbox in ann["bboxes"]
                    ]
                if "areas" in ann:
                    # similar to "bboxes", some areas could be None for when the GT
                    # object is invisible, so we replace them with 0
                    areas = [a if a is not None else 0 for a in ann["areas"]]
                    # Compute average area of tracklet
                    ann["area"] = np.mean(areas)
        if "videos" in self.dataset:
            for vid in self.dataset["videos"]:
                vid["id"] = int(vid["id"])
            # COCO indexes "images"; videos play that role for video datasets.
            self.dataset["images"] = self.dataset.pop("videos")

        if self.ignore_gt_cats:
            # Single dummy category matching the -1 id assigned above.
            self.dataset["categories"] = [
                {"supercategory": "object", "id": -1, "name": "object"}
            ]
        else:
            for cat in self.dataset["categories"]:
                cat["id"] = int(cat["id"])
        super().createIndex()
|
||||
|
||||
@override
def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
    """
    Same interface as COCO.getAnnIds.

    Note: area filtering (areaRng) is applied to each annotation's "area"
    field, which createIndex/loadRes set to the *average* area of the
    tracklet across the video — not a per-frame area.
    """
    if len(areaRng) > 0:
        logging.warning(
            "Note that we filter out objects based on their *average* area across the video, not per frame area"
        )

    # Bug fix: `areaRng` was previously dropped here, so the warning above
    # was issued but no area filtering actually happened. Forward it to the
    # parent so the documented average-area filtering takes effect.
    return super().getAnnIds(
        imgIds=imgIds, catIds=catIds, areaRng=areaRng, iscrowd=iscrowd
    )
|
||||
|
||||
@override
def showAnns(self, anns, draw_bbox=False):
    """Annotation visualization is not supported for video datasets; always raises."""
    raise NotImplementedError("Showing annotations is not supported")
|
||||
|
||||
@override
def loadRes(self, resFile):
    """
    Load prediction results and return a new YTVIS object holding them.

    Adapted from COCO.loadRes to support tracklets/masklets: results carry
    per-frame "bboxes"/"segmentations"/"areas" lists instead of single
    per-image entries, and each result additionally gets an "area" field
    holding the tracklet's average area (used for area-based mAP).

    Args:
        resFile: Path to a JSON result file, or an already-loaded list of
            result dicts. (numpy-array results are not supported — see
            loadNumpyAnnotations.)

    Returns:
        A YTVIS instance with the results indexed as annotations.
    """
    res = YTVIS(ignore_gt_cats=self.ignore_gt_cats)
    res.dataset["images"] = [img for img in self.dataset["images"]]

    if isinstance(resFile, str):
        with open(resFile) as f:
            anns = json.load(f)
    elif isinstance(resFile, np.ndarray):
        anns = self.loadNumpyAnnotations(resFile)
    else:
        anns = resFile
    assert isinstance(anns, list), "results is not an array of objects"
    annsImgIds = [ann["image_id"] for ann in anns]
    assert set(annsImgIds) == (
        set(annsImgIds) & set(self.getImgIds())
    ), "Results do not correspond to current coco set"
    # Robustness fix: an empty result list used to crash with IndexError on
    # `anns[0]` below; return a valid YTVIS object with no annotations instead.
    if len(anns) == 0:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        res.dataset["annotations"] = []
        res.createIndex()
        return res
    if "bboxes" in anns[0] and anns[0]["bboxes"] != []:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # Invisible frames may carry None boxes; substitute degenerate boxes.
            bbs = [(bb if bb is not None else [0, 0, 0, 0]) for bb in ann["bboxes"]]
            xxyy = [[bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] for bb in bbs]
            if "segmentations" not in ann:
                # Fall back to a rectangular polygon per frame built from the box.
                ann["segmentations"] = [
                    [[x1, y1, x1, y2, x2, y2, x2, y1]] for (x1, x2, y1, y2) in xxyy
                ]
            ann["areas"] = [bb[2] * bb[3] for bb in bbs]
            # NOTE: We also compute average area of a tracklet across video, allowing us to compute area based mAP.
            ann["area"] = np.mean(ann["areas"])
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "segmentations" in anns[0]:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # Derive per-frame boxes from the RLE masks.
            ann["bboxes"] = [
                mask_util.toBbox(segm) for segm in ann["segmentations"]
            ]
            if "areas" not in ann:
                ann["areas"] = [
                    mask_util.area(segm) for segm in ann["segmentations"]
                ]
            # NOTE: We also compute average area of a tracklet across video, allowing us to compute area based mAP.
            ann["area"] = np.mean(ann["areas"])
            ann["id"] = id + 1
            ann["iscrowd"] = 0

    res.dataset["annotations"] = anns
    res.createIndex()
    return res
|
||||
|
||||
@override
def download(self, tarDir=None, imgIds=[]):
    """Downloading video data is not supported; always raises."""
    raise NotImplementedError
|
||||
|
||||
@override
def loadNumpyAnnotations(self, data):
    """Numpy-array results are not supported for video datasets; always raises."""
    raise NotImplementedError("We don't support numpy annotations for now")
|
||||
|
||||
@override
def annToRLE(self, ann):
    """Not supported: segmentations must already be stored in RLE format; always raises."""
    raise NotImplementedError("We expect masks to be already in RLE format")
|
||||
|
||||
@override
def annToMask(self, ann):
    """Not supported: segmentations must already be stored in RLE format; always raises."""
    raise NotImplementedError("We expect masks to be already in RLE format")
|
||||
411
sam3/eval/ytvis_eval.py
Normal file
411
sam3/eval/ytvis_eval.py
Normal file
@@ -0,0 +1,411 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
import copy
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from operator import xor
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import pycocotools.mask as mask_util
|
||||
import torch
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
from sam3.eval.cgf1_eval import CGF1Eval
|
||||
from sam3.eval.coco_eval_offline import convert_to_xywh
|
||||
from sam3.model.box_ops import box_xywh_inter_union
|
||||
from sam3.train.masks_ops import rle_encode
|
||||
from sam3.train.utils import distributed as dist
|
||||
from typing_extensions import override
|
||||
|
||||
try:
|
||||
import rapidjson as json
|
||||
except ModuleNotFoundError:
|
||||
import json
|
||||
|
||||
from iopath.common.file_io import g_pathmgr
|
||||
|
||||
|
||||
class YTVISevalMixin:
    """
    Identical to COCOeval but adapts computeIoU to compute IoU between tracklets/masklets.

    Subclasses must define a `sort_inds_by_scores_in_iou` attribute (see
    the assert in computeIoU): True for class mAP / phrase AP (COCOeval-style
    score sorting), False for demo F1 (Hungarian matching downstream).
    """

    @override
    def _prepare(self):
        """
        Copied from cocoeval.py but doesn't convert masks to RLEs (we assume they already are RLEs)
        """
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # set ignore flag
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            # NOTE: as in upstream cocoeval.py, this second assignment
            # overwrites any pre-existing "ignore" flag with the iscrowd value.
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        # Bucket GTs/DTs by (image_id, category_id); here image_id is really
        # a video id (see YTVIS.createIndex renaming).
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
        for dt in dts:
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results

    def computeIoU(self, imgId, catId):
        """
        Compute IoU between tracklets. Copied from cocoeval.py but adapted for videos (in YT-VIS format)

        Returns a (num_dets, num_gts) array of tracklet/masklet IoUs, or []
        if either side is empty.
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 or len(dt) == 0:
            return []

        # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
        # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
        assert hasattr(self, "sort_inds_by_scores_in_iou"), (
            "subclasses that inherits YTVISevalMixin should set `self.sort_inds_by_scores_in_iou` "
            "(True for class mAP and phrase AP, False for demo F1)"
        )
        if self.sort_inds_by_scores_in_iou:
            # mergesort is stable, matching upstream COCOeval tie-breaking.
            inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
            dt = [dt[i] for i in inds]
            if len(dt) > p.maxDets[-1]:
                dt = dt[0 : p.maxDets[-1]]

        if p.iouType == "segm":
            g = [g["segmentations"] for g in gt]
            d = [d["segmentations"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bboxes"] for g in gt]
            d = [d["bboxes"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        def iou_tracklets(preds, gts):
            # Tracklet IoU: sum per-frame intersections and unions over time,
            # then divide once (a time-aggregated IoU, not a mean of per-frame IoUs).
            preds = torch.tensor(preds)
            gts = torch.tensor(gts)
            inter, union = box_xywh_inter_union(
                preds.unsqueeze(1), gts.unsqueeze(0)
            )  # Num preds x Num GTS x Num frames
            inter = inter.sum(-1)
            union = union.sum(-1)
            assert (
                union > 0
            ).all(), (
                "There exists a tracklet with zero GTs across time. This is suspicious"
            )
            return inter / union

        def iou_masklets(preds, gts):
            # Masklet IoU between one prediction and one GT: accumulate RLE
            # intersection/union areas across frames; empty frames (falsy RLEs)
            # on only one side still contribute to the union.
            inter = 0
            union = 0
            for p_i, gt_i in zip(preds, gts):
                if p_i and gt_i:
                    # Compute areas of intersection and union
                    inter += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=True)
                    )
                    union += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=False)
                    )
                elif gt_i:
                    union += mask_util.area(gt_i)
                elif p_i:
                    union += mask_util.area(p_i)
            if union > 0:
                iou = inter / union
                assert iou >= 0 and iou <= 1, "Encountered an error in IoU computation"
            else:
                # Both masklets empty everywhere: defined as a perfect match.
                assert np.isclose(inter, 0) and np.isclose(
                    union, 0
                ), "Encountered an error in IoU computation"
                iou = 1
            return iou

        if p.iouType == "segm":
            ious = [[iou_masklets(d_i, g_i) for g_i in g] for d_i in d]
        else:
            ious = iou_tracklets(d, g)
        return np.array(ious)
|
||||
|
||||
|
||||
class YTVISeval(YTVISevalMixin, COCOeval):
    """COCOeval adapted to YT-VIS videos, used for class mAP and phrase AP."""

    # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
    sort_inds_by_scores_in_iou = True
|
||||
|
||||
|
||||
class VideoDemoF1Eval(YTVISevalMixin, CGF1Eval):
    """CGF1Eval adapted to YT-VIS videos, used for demo F1 evaluation."""

    # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
    sort_inds_by_scores_in_iou = False
|
||||
|
||||
|
||||
class YTVISResultsWriter:
    """
    Gather and dumps predictions in YT-VIS format.
    Expected flow of API calls: reset() -> N * update() -> compute_synced()
    """

    def __init__(
        self,
        dump_file: str,
        postprocessor,
        gather_pred_via_filesys=False,
        pred_file_evaluators: Optional[List] = None,
        save_per_frame_scores: bool = False,
        write_eval_metrics_file: bool = True,
        eval_metrics_file_suffix: str = ".sam3_eval_metrics",
    ):
        """
        Args:
            dump_file: Path of the JSON file the gathered predictions are written to.
            postprocessor: Object whose process_results(...) turns raw model
                outputs into per-video prediction dicts (see update/prepare).
            gather_pred_via_filesys: If True, gather cross-GPU predictions via
                the filesystem instead of collective all_gather.
            pred_file_evaluators: Evaluation hooks run on the dumped prediction
                file in compute_synced().
            save_per_frame_scores: If True, also store per-frame scores per track.
            write_eval_metrics_file: If True, write metrics to a sidecar file
                next to dump_file (dump_file + eval_metrics_file_suffix).
            eval_metrics_file_suffix: Suffix for the sidecar metrics file.
        """
        self.dump_file = dump_file
        # Accumulated list of YT-VIS result dicts across update() calls.
        self.dump = []
        self.postprocessor = postprocessor
        self.gather_pred_via_filesys = gather_pred_via_filesys
        # Only rank 0 creates the output directory.
        if dist.is_main_process():
            dirname = os.path.dirname(self.dump_file)
            if not os.path.exists(dirname):
                os.makedirs(dirname, exist_ok=True)
                logging.info(f"Creating folder: {dirname}")

        # the evaluation hooks to be applied to the prediction files
        self.pred_file_evaluators = pred_file_evaluators or []
        self.save_per_frame_scores = save_per_frame_scores
        # in addition to the prediction file, we also write the evaluation metrics
        # for easier debugging and analysis (stored in another eval_metrics_file
        # so that we can keep the dumped prediction file under YT-VIS format)
        self.write_eval_metrics_file = write_eval_metrics_file
        if self.write_eval_metrics_file:
            self.eval_metrics_file = self.dump_file + eval_metrics_file_suffix
            os.makedirs(os.path.dirname(self.eval_metrics_file), exist_ok=True)

    def _dump_vid_preds(self, results):
        """Append a deep copy of per-video results to the in-memory buffer."""
        dumped_results = copy.deepcopy(results)
        self.dump.extend(dumped_results)

    def prepare(self, predictions):
        """
        Convert per-video predictions into a flat list of YT-VIS result dicts.

        Args:
            predictions: Mapping of video_id -> prediction dict with keys
                "boxes", "scores", "labels" and exactly one of "masks"
                (dense tensor) or "masks_rle" (per-frame RLE lists);
                "per_frame_scores" is required when save_per_frame_scores is set.

        Returns:
            List of dicts with keys video_id / category_id / bboxes / score /
            segmentations / areas (and optionally per_frame_scores), one per track.
        """
        ytvis_results = []
        for video_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue
            for k in ["boxes", "scores", "labels"]:
                assert (
                    k in prediction
                ), f"Expected predictions to have `{k}` key, available keys are {prediction.keys()}"
            if self.save_per_frame_scores:
                assert (
                    "per_frame_scores" in prediction
                ), f"Expected predictions to have `per_frame_scores` key, available keys are {prediction.keys()}"
            # Exactly one mask representation must be present.
            assert xor(
                "masks" in prediction, "masks_rle" in prediction
            ), f"Expected predictions to have either `masks` key or `masks_rle` key, available keys are {prediction.keys()}"

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            if "masks" in prediction:
                # Dense masks: drop the singleton channel dim, then RLE-encode.
                masks = prediction["masks"].squeeze(2)
                assert (
                    masks.ndim == 4
                ), "Expected masks to be of shape(N_preds,T_frames,H,W)"

                # Per-track, per-frame pixel counts.
                areas = [mask.flatten(1).sum(1).tolist() for mask in masks]
                rles = [rle_encode(masklet) for masklet in masks]

                # memory clean
                del masks
                del prediction["masks"]
            elif "masks_rle" in prediction:
                # Pre-encoded RLEs: pop the embedded "area" field per frame
                # (None RLE -> empty frame with area 0).
                rles = prediction.pop("masks_rle")
                areas = [
                    [0 if rle is None else rle.pop("area") for rle in rles_per_obj]
                    for rles_per_obj in rles
                ]
            else:
                raise ValueError(
                    "Expected either `masks` or `masks_rle` key in the predictions."
                )

            # One YT-VIS result dict per predicted track.
            new_results = [
                {
                    "video_id": video_id,
                    "category_id": track_label,
                    "bboxes": track_boxes,
                    "score": track_score,
                    "segmentations": track_masks,
                    "areas": track_areas,
                }
                for (
                    track_boxes,
                    track_masks,
                    track_areas,
                    track_score,
                    track_label,
                ) in zip(boxes, rles, areas, scores, labels)
            ]
            # Optionally, save per-frame scores
            if self.save_per_frame_scores:
                per_frame_scores = prediction["per_frame_scores"].tolist()
                for res, track_per_frame_scores in zip(new_results, per_frame_scores):
                    res["per_frame_scores"] = track_per_frame_scores

            ytvis_results.extend(new_results)

        return ytvis_results

    def set_sync_device(self, device: torch.device):
        """Record the device used for cross-process synchronization."""
        self._sync_device = device

    def update(self, *args, **kwargs):
        """Postprocess one batch of model outputs and buffer the results."""
        predictions = self.postprocessor.process_results(*args, **kwargs)
        results = self.prepare(predictions)
        self._dump_vid_preds(results)

    def _dump_preds(self):
        """
        Write the buffered predictions to dump_file (rank 0 only) and clear
        the buffer on every rank. Returns the dumped file path on rank 0,
        None on other ranks.
        """
        if not dist.is_main_process():
            self.dump = []
            gc.collect()
            return
        dumped_file = Path(self.dump_file)
        logging.info(f"YTVIS evaluator: Dumping predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(self.dump, f)
        self.dump = []
        gc.collect()
        return str(dumped_file)

    def synchronize_between_processes(self):
        """Gather predictions from all ranks into self.dump, deduplicated."""
        logging.info("YT-VIS evaluator: Synchronizing between processes")
        dump_dict = self._dedup_pre_gather(self.dump)
        if self.gather_pred_via_filesys:
            dump_dict_all_gpus = dist.gather_to_rank_0_via_filesys(dump_dict)
        else:
            dump_dict_all_gpus = dist.all_gather(dump_dict, force_cpu=True)
        self.dump = self._dedup_post_gather(dump_dict_all_gpus)
        logging.info(f"Gathered all {len(self.dump)} predictions")

    def _dedup_pre_gather(self, predictions):
        """
        Organize the predictions as a dict-of-list using (video_id, category_id) as keys
        for deduplication after gathering them across GPUs.

        During evaluation, PyTorch data loader under `drop_last: False` would wrap
        around the dataset length to be a multiple of world size (GPU num) and duplicate
        the remaining batches. This causes the same test sample to appear simultaneously
        in multiple GPUs, resulting in duplicated predictions being saved into prediction
        files. These duplicates are then counted as false positives under detection mAP
        metrics (since a ground truth can be matched with only one prediction).

        For example, if there are 4 GPUs and 6 samples [A1, A2, B1, B2, C1, C2], the data
        loader (under `drop_last: False`) would load it by wrapping it around like
        `[A1, A2, B1, B2, C1, C2, *A1*, *A2*]` to make a multiple of 4 and then split it as

        - GPU 0: A1, C1
        - GPU 1: A2, C2
        - GPU 3: B1, **A1**
        - GPU 4: B2, **A2**
        (as in DistributedSampler in https://github.com/pytorch/pytorch/blob/521588519da9f4876d90ddd7a17c10d0eca89dc6/torch/utils/data/distributed.py#L116-L124)

        so the predictions on A1 and A2 will occur twice in the final gathered outputs
        in the prediction file (and counted as false positives). This also affects our
        YT-VIS official val evaluation, but to a lesser extent than YT-VIS dev since
        the latter is much smaller and more susceptible to false positives.

        So we to deduplicate this. The tricky part is that we cannot deduplicate them
        simply using video id, given that we are sharding the classes in each video
        across multiple batches (with 20 prompts per batch) in our "orig_cats" eval dbs.

        The solution is to deduplicate based on (video_id, category_id) tuple as keys.
        We organize the predictions as a dict-of-list using (video_id, category_id) as
        keys on each GPU, with the list of masklets under this (video_id, category_id)
        on this GPU as values. Then, we all-gather this dict-of-list across GPUs and
        if a key (video_id, category_id) appears in multiple GPUs, we only take the
        prediction masklet list from one GPU.
        """
        prediction_dict = defaultdict(list)
        for p in predictions:
            prediction_dict[(p["video_id"], p["category_id"])].append(p)
        return prediction_dict

    def _dedup_post_gather(self, list_of_prediction_dict):
        """
        Deduplicate the predictions from all GPUs. See `_dedup_pre_gather` for details.
        """
        dedup_prediction_dict = {}
        duplication_keys = []
        for prediction_dict in list_of_prediction_dict:
            for k, v in prediction_dict.items():
                # First GPU to provide a (video_id, category_id) key wins;
                # later occurrences are recorded and skipped.
                if k not in dedup_prediction_dict:
                    dedup_prediction_dict[k] = v
                else:
                    duplication_keys.append(k)

        logging.info(
            f"skipped {len(duplication_keys)} duplicated predictions in YTVISResultsWriter "
            f"with the following (video_id, category_id) tuples: {duplication_keys}"
        )
        # Flatten the dict-of-lists back into a single prediction list.
        dedup_predictions = sum(dedup_prediction_dict.values(), [])
        return dedup_predictions

    def compute_synced(
        self,
    ):
        """
        Gather predictions across ranks, dump them to dump_file, run the
        evaluation hooks on the dumped file (rank 0 only), and return the
        dataset-level metrics dict ({"": 0.0} on non-main ranks or when
        there are no evaluators).
        """
        self.synchronize_between_processes()
        dumped_file = self._dump_preds()
        if not dist.is_main_process():
            return {"": 0.0}

        # run evaluation hooks on the prediction file
        meters = {}
        all_video_np_level_results = defaultdict(dict)
        for evaluator in self.pred_file_evaluators:
            gc.collect()
            results, video_np_level_results = evaluator.evaluate(dumped_file)
            meters.update(results)
            # Merge per-(video, noun-phrase) metrics from each evaluator.
            for (video_id, category_id), res in video_np_level_results.items():
                all_video_np_level_results[(video_id, category_id)].update(res)

        gc.collect()
        if self.write_eval_metrics_file:
            # convert the nested dict of {(video_id, category_id): per_sample_metric_dict}
            # to a list of per-sample metric dicts (with video_id and category_id) for JSON,
            # as JSON doesn't allow using tuples like (video_id, category_id) as dict keys
            video_np_level_metrics = [
                {"video_id": video_id, "category_id": category_id, **res}
                for (video_id, category_id), res in all_video_np_level_results.items()
            ]
            eval_metrics = {
                "dataset_level_metrics": meters,
                "video_np_level_metrics": video_np_level_metrics,
            }
            with g_pathmgr.open(self.eval_metrics_file, "w") as f:
                json.dump(eval_metrics, f)
            logging.info(
                f"YTVIS evaluator: Dumped evaluation metrics to {self.eval_metrics_file}"
            )

        if len(meters) == 0:
            meters = {"": 0.0}
        return meters

    def compute(self):
        """Non-synced compute is a no-op; real work happens in compute_synced()."""
        return {"": 0.0}

    def reset(self, *args, **kwargs):
        """Clear the buffered predictions (start of a new evaluation pass)."""
        self.dump = []
|
||||
Reference in New Issue
Block a user