Initial commit
fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
279
sam3/train/configs/eval_base.yaml
Normal file
279
sam3/train/configs/eval_base.yaml
Normal file
@@ -0,0 +1,279 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# This config is the base configuration for all evaluations. Amongst other things, it defines:
|
||||
# - the model
|
||||
# - the image transforms
|
||||
# - the post processors
|
||||
# - cluster configuration (only relevant for slurm-based evals, ignored otherwise)
|
||||
#
|
||||
# Most of the parameters should be kept as-is. The main modifications you may want to make are:
|
||||
# - the cluster configuration, to adjust partitions/qos to your system
|
||||
# - the flag gather_pred_via_filesys if your RAM is tight
|
||||
# - num_val_workers if your number of cores is small (should be roughly number of cores / number of gpus)
|
||||
# - the paths below
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
# If you leave the checkpoint path to null, the model will be downloaded from hugging-face. Otherwise provide a path
|
||||
checkpoint_path: null
|
||||
# the experiments will be subfolders of this
|
||||
base_experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
|
||||
|
||||
# base path to the annotation folder for gold (refer to the readmes on how to download)
|
||||
base_annotation_path: <YOUR_GOLD_GT_DIR>
|
||||
|
||||
# base path to the annotation folder for silver (refer to the readmes on how to download)
|
||||
base_annotation_path_silver: <YOUR_SILVER_GT_DIR>
|
||||
|
||||
# path to the metaclip images, used for SA-Co gold (refer to the readme for instructions). Can be null if you don't intend on evaluating on this dataset.
|
||||
metaclip_img_path: <YOUR_METACLIP_IMG_DIR>
|
||||
|
||||
# path to the sa1b images, used for SA-Co gold (refer to the readme for instructions). Can be null if you don't intend on evaluating on this dataset.
|
||||
sa1b_img_path: <YOUR_SA1B_IMG_DIR>
|
||||
|
||||
# path to the SA-Co/silver images
|
||||
silver_img_path: <YOUR_SILVER_IMG_DIR>
|
||||
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
base_val_transform:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
######## transforms for validation (begin) ########
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: False
|
||||
######## transforms for validation (end) ########
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
loss: null
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
input_box_embedding_dim: ${add:${scratch.d_model},2}
|
||||
|
||||
# Box processing
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 #infinite detections
|
||||
use_original_ids: false
|
||||
use_original_sizes_box: false
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
box_postprocessor_thresholded:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 #infinite detections
|
||||
use_original_ids: false
|
||||
use_original_sizes_box: false
|
||||
detection_threshold: 0.3
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
mask_postprocessor_thresholded:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 #infinite detections
|
||||
iou_type: "segm"
|
||||
use_original_ids: false
|
||||
use_original_sizes_box: false
|
||||
use_original_sizes_mask: true
|
||||
convert_mask_to_rle: True
|
||||
detection_threshold: 0.3
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
max_ann_per_img: 200
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
train_batch_size: 1
|
||||
val_batch_size: 1
|
||||
num_train_workers: 0
|
||||
num_val_workers: 10 # change this depending on the number of cpu cores available
|
||||
max_data_epochs: 20
|
||||
target_epoch_size: 1500
|
||||
hybrid_repeats: 1
|
||||
context_length: 2
|
||||
|
||||
# All reduce - this controls how the predictions are sent back to node 0.
|
||||
# If you have a lot of ram, CPU gather is faster. Otherwise, we provide a fallback through filesystem (eg NFS)
|
||||
# Switch to true if you get cpu ooms during gather.
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# Learning rate and scheduler parameters (unused for eval)
|
||||
lr_scale: 0.1
|
||||
lr_transformer: ${times:8e-4,${scratch.lr_scale}}
|
||||
lr_vision_backbone: ${times:2.5e-4,${scratch.lr_scale}}
|
||||
lr_language_backbone: ${times:5e-5,${scratch.lr_scale}}
|
||||
lrd_vision_backbone: 0.9 # (lower for in-domain and higher for ood)
|
||||
wd: 0.1
|
||||
scheduler_timescale: 20
|
||||
scheduler_warmup: 20
|
||||
scheduler_cooldown: 20
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val: null
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpu  # NOTE(review): was "cpus" — not a valid torch device string; confirm against build_sam3_image_model
|
||||
eval_mode: true
|
||||
enable_segmentation: true # Warning: Enable this if using segmentation.
|
||||
checkpoint_path: ${paths.checkpoint_path}
|
||||
|
||||
meters:
|
||||
val: null
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
optimizer:
|
||||
_target_: torch.optim.AdamW
|
||||
|
||||
gradient_clip:
|
||||
_target_: sam3.train.optim.optimizer.GradientClipper
|
||||
max_norm: 0.1
|
||||
norm_type: 2
|
||||
|
||||
param_group_modifiers:
|
||||
- _target_: sam3.train.optim.optimizer.layer_decay_param_modifier
|
||||
_partial_: True
|
||||
layer_decay_value: ${scratch.lrd_vision_backbone}
|
||||
apply_to: 'backbone.vision_backbone.trunk'
|
||||
overrides:
|
||||
- pattern: '*pos_embed*'
|
||||
value: 1.0
|
||||
|
||||
options:
|
||||
lr:
|
||||
- scheduler: # transformer and class_embed
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_transformer}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_vision_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.vision_backbone.*'
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_language_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.language_backbone.*'
|
||||
|
||||
weight_decay:
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: ${scratch.wd}
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: 0.0
|
||||
param_names:
|
||||
- '*bias*'
|
||||
module_cls_names: ['torch.nn.LayerNorm']
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 4
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
|
||||
submitit:
|
||||
account: null # Add your SLURM account if use_cluster == 1
|
||||
partition: null
|
||||
qos: null # Add your QoS if use_cluster == 1
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,66 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/gold_attributes/
|
||||
coco_gt: ${paths.base_annotation_path}/gold_attributes_merged_a_release_test.json
|
||||
coco_gts:
|
||||
- ${paths.base_annotation_path}/gold_attributes_merged_a_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_attributes_merged_b_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_attributes_merged_c_release_test.json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.metaclip_img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: gold_attributes
|
||||
|
||||
meters:
|
||||
val:
|
||||
gold_attributes: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/gold_attributes
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,66 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/gold_crowded/
|
||||
coco_gt: ${paths.base_annotation_path}/gold_crowded_merged_a_release_test.json
|
||||
coco_gts:
|
||||
- ${paths.base_annotation_path}/gold_crowded_merged_a_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_crowded_merged_b_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_crowded_merged_c_release_test.json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.metaclip_img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: gold_crowded
|
||||
|
||||
meters:
|
||||
val:
|
||||
gold_crowded: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/gold_crowded
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,66 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/gold_fg_food/
|
||||
coco_gt: ${paths.base_annotation_path}/gold_fg_food_merged_a_release_test.json
|
||||
coco_gts:
|
||||
- ${paths.base_annotation_path}/gold_fg_food_merged_a_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_fg_food_merged_b_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_fg_food_merged_c_release_test.json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.metaclip_img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: gold_fg_food
|
||||
|
||||
meters:
|
||||
val:
|
||||
gold_fg_food: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/gold_fg_food
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,66 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/gold_fg_sports_equipment/
|
||||
coco_gt: ${paths.base_annotation_path}/gold_fg_sports_equipment_merged_a_release_test.json
|
||||
coco_gts:
|
||||
- ${paths.base_annotation_path}/gold_fg_sports_equipment_merged_a_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_fg_sports_equipment_merged_b_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_fg_sports_equipment_merged_c_release_test.json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.metaclip_img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: gold_fg_sports_equipment
|
||||
|
||||
meters:
|
||||
val:
|
||||
gold_fg_sports_equipment: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/gold_fg_sports_equipment
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,66 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/gold_metaclip_nps/
|
||||
coco_gt: ${paths.base_annotation_path}/gold_metaclip_merged_a_release_test.json
|
||||
coco_gts:
|
||||
- ${paths.base_annotation_path}/gold_metaclip_merged_a_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_metaclip_merged_b_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_metaclip_merged_c_release_test.json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.metaclip_img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: gold_metaclip_nps
|
||||
|
||||
meters:
|
||||
val:
|
||||
gold_metaclip_nps: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/gold_metaclip_nps
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,66 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/gold_sa1b_nps/
|
||||
coco_gt: ${paths.base_annotation_path}/gold_sa1b_merged_a_release_test.json
|
||||
coco_gts:
|
||||
- ${paths.base_annotation_path}/gold_sa1b_merged_a_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_sa1b_merged_b_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_sa1b_merged_c_release_test.json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.sa1b_img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: gold_sa1b_nps
|
||||
|
||||
meters:
|
||||
val:
|
||||
gold_sa1b_nps: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/gold_sa1b_nps
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,66 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/gold_wiki_common/
|
||||
coco_gt: ${paths.base_annotation_path}/gold_wiki_common_merged_a_release_test.json
|
||||
coco_gts:
|
||||
- ${paths.base_annotation_path}/gold_wiki_common_merged_a_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_wiki_common_merged_b_release_test.json
|
||||
- ${paths.base_annotation_path}/gold_wiki_common_merged_c_release_test.json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.metaclip_img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: gold_wiki_common
|
||||
|
||||
meters:
|
||||
val:
|
||||
gold_wiki_common: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/gold_wiki_common
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gts}
|
||||
iou_type: "segm"
|
||||
255
sam3/train/configs/odinw13/odinw_text_and_visual.yaml
Normal file
255
sam3/train/configs/odinw13/odinw_text_and_visual.yaml
Normal file
@@ -0,0 +1,255 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
# python sam3/train/train.py -c configs/odinw_text_only.yaml --use-cluster 1 --partition ${PARTITION} --account ${ACCOUNT} --qos ${QoS}
|
||||
|
||||
paths:
|
||||
odinw_data_root: <YOUR_DATA_DIR>
|
||||
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
supercategory_tuple: ${all_odinw_supercategories.${string:${submitit.job_array.task_index}}}
|
||||
# Validation transforms pipeline
|
||||
val_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution}
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: False
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.TextQueryToVisual
|
||||
keep_text_queries: true # Note: set this to false if you only want visual
|
||||
probability: 1.0 # always
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
enable_segmentation: True
|
||||
# Box processing
|
||||
use_presence_eval: True
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
# Normalization parameters
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
val_batch_size: 2
|
||||
num_val_workers: 0
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
max_epochs: 1
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
prompts: ${odinw35_prompts.${supercategory_tuple.name}}
|
||||
include_negatives: true
|
||||
category_chunk_size: 20 # Note: Since we are doing AP +ve we need to include all categories!
|
||||
_partial_: true
|
||||
img_folder: ${paths.odinw_data_root}/${supercategory_tuple.val.img_folder}
|
||||
ann_file:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
transforms: ${val_transforms}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: 1
|
||||
dict_key: odinw35
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpus
|
||||
eval_mode: true # Set to false if training
|
||||
enable_segmentation: ${scratch.enable_segmentation} # Warning: Enable this if using segmentation.
|
||||
|
||||
meters:
|
||||
val:
|
||||
odinw35:
|
||||
detection:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "bbox"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/roboflow/${supercategory_tuple.name}
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.original_box_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 100
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.coco_eval_offline.CocoEvaluatorOfflineWithPredFileEvaluators
|
||||
gt_path:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
tide: False
|
||||
iou_type: "bbox"
|
||||
positive_split: true
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/${supercategory_tuple.name}
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 1
|
||||
gpus_per_node: 2
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
|
||||
job_array:
|
||||
num_tasks: 13
|
||||
task_index: 0
|
||||
|
||||
# ============================================================================
|
||||
# ODinW13 Supercategories
|
||||
# ============================================================================
|
||||
|
||||
all_odinw_supercategories:
|
||||
- name: AerialMaritimeDrone_large
|
||||
val:
|
||||
img_folder: AerialMaritimeDrone/large/test/
|
||||
json: AerialMaritimeDrone/large/test/annotations_without_background.json
|
||||
- name: Aquarium
|
||||
val:
|
||||
img_folder: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/
|
||||
json: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/annotations_without_background.json
|
||||
- name: CottontailRabbits
|
||||
val:
|
||||
img_folder: CottontailRabbits/test/
|
||||
json: CottontailRabbits/test/annotations_without_background.json
|
||||
- name: EgoHands_generic
|
||||
val:
|
||||
img_folder: EgoHands/generic/test/
|
||||
json: EgoHands/generic/test/annotations_without_background.json
|
||||
- name: NorthAmericaMushrooms
|
||||
val:
|
||||
img_folder: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/
|
||||
json: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/annotations_without_background.json
|
||||
- name: Packages
|
||||
val:
|
||||
img_folder: Packages/Raw/test/
|
||||
json: Packages/Raw/test/annotations_without_background.json
|
||||
- name: PascalVOC
|
||||
val:
|
||||
img_folder: PascalVOC/valid/
|
||||
json: PascalVOC/valid/annotations_without_background.json
|
||||
- name: Raccoon
|
||||
val:
|
||||
img_folder: Raccoon/Raccoon.v2-raw.coco/test/
|
||||
json: Raccoon/Raccoon.v2-raw.coco/test/annotations_without_background.json
|
||||
- name: ShellfishOpenImages
|
||||
val:
|
||||
img_folder: ShellfishOpenImages/raw/test/
|
||||
json: ShellfishOpenImages/raw/test/annotations_without_background.json
|
||||
- name: VehiclesOpenImages
|
||||
val:
|
||||
img_folder: VehiclesOpenImages/416x416/test/
|
||||
json: VehiclesOpenImages/416x416/test/annotations_without_background.json
|
||||
- name: pistols
|
||||
val:
|
||||
img_folder: pistols/export/
|
||||
json: pistols/export/test_annotations_without_background.json
|
||||
- name: pothole
|
||||
val:
|
||||
img_folder: pothole/test/
|
||||
json: pothole/test/annotations_without_background.json
|
||||
- name: thermalDogsAndPeople
|
||||
val:
|
||||
img_folder: thermalDogsAndPeople/test/
|
||||
json: thermalDogsAndPeople/test/annotations_without_background.json
|
||||
|
||||
|
||||
odinw35_prompts:
|
||||
AerialMaritimeDrone_large: '[{"id": 1, "name": "boat", "supercategory": "movable-objects"},
|
||||
{"id": 2, "name": "car", "supercategory": "movable-objects"}, {"id": 3, "name": "dock",
|
||||
"supercategory": "movable-objects"}, {"id": 4, "name": "jet ski", "supercategory": "movable-objects"},
|
||||
{"id": 5, "name": "boat lift", "supercategory": "movable-objects"}]'
|
||||
Aquarium: null
|
||||
CottontailRabbits: null
|
||||
EgoHands_generic: null
|
||||
NorthAmericaMushrooms: '[{''id'': 1, ''name'':
|
||||
''chicken of the woods'', ''supercategory'': ''mushroom''}, {''id'': 2, ''name'': ''chanterelle'', ''supercategory'': ''mushroom''}]'
|
||||
Packages: null
|
||||
PascalVOC: null
|
||||
Raccoon: null
|
||||
ShellfishOpenImages: null
|
||||
VehiclesOpenImages: null
|
||||
pistols: null
|
||||
pothole: null
|
||||
thermalDogsAndPeople: null
|
||||
253
sam3/train/configs/odinw13/odinw_text_only.yaml
Normal file
253
sam3/train/configs/odinw13/odinw_text_only.yaml
Normal file
@@ -0,0 +1,253 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
# python sam3/train/train.py -c configs/odinw_text_only.yaml --use-cluster 1 --partition ${PARTITION} --account ${ACCOUNT} --qos ${QoS}
|
||||
|
||||
paths:
|
||||
odinw_data_root: <YOUR_DATA_DIR>
|
||||
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
|
||||
supercategory_tuple: ${all_odinw_supercategories.${string:${submitit.job_array.task_index}}}
|
||||
# Validation transforms pipeline
|
||||
val_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution}
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: False
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
enable_segmentation: True
|
||||
# Box processing
|
||||
use_presence_eval: True
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
# Normalization parameters
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
val_batch_size: 2
|
||||
num_val_workers: 0
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
max_epochs: 1
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
prompts: ${odinw35_prompts.${supercategory_tuple.name}}
|
||||
include_negatives: true
|
||||
category_chunk_size: 20 # Note: Since we are doing AP +ve we need to include all categories!
|
||||
_partial_: true
|
||||
img_folder: ${paths.odinw_data_root}/${supercategory_tuple.val.img_folder}
|
||||
ann_file:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
transforms: ${val_transforms}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: 1
|
||||
dict_key: odinw35
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpus
|
||||
eval_mode: true # Set to false if training
|
||||
enable_segmentation: ${scratch.enable_segmentation} # Warning: Enable this if using segmentation.
|
||||
|
||||
meters:
|
||||
val:
|
||||
odinw35:
|
||||
detection:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "bbox"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/odinw/${supercategory_tuple.name}
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.original_box_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 100
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.coco_eval_offline.CocoEvaluatorOfflineWithPredFileEvaluators
|
||||
gt_path:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
tide: False
|
||||
iou_type: "bbox"
|
||||
positive_split: False
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/${supercategory_tuple.name}
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 1
|
||||
gpus_per_node: 2
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
|
||||
job_array:
|
||||
num_tasks: 13
|
||||
task_index: 0
|
||||
|
||||
# ============================================================================
|
||||
# ODinW13 Supercategories
|
||||
# ============================================================================
|
||||
|
||||
all_odinw_supercategories:
|
||||
- name: AerialMaritimeDrone_large
|
||||
val:
|
||||
img_folder: AerialMaritimeDrone/large/test/
|
||||
json: AerialMaritimeDrone/large/test/annotations_without_background.json
|
||||
- name: Aquarium
|
||||
val:
|
||||
img_folder: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/
|
||||
json: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/annotations_without_background.json
|
||||
- name: CottontailRabbits
|
||||
val:
|
||||
img_folder: CottontailRabbits/test/
|
||||
json: CottontailRabbits/test/annotations_without_background.json
|
||||
- name: EgoHands_generic
|
||||
val:
|
||||
img_folder: EgoHands/generic/test/
|
||||
json: EgoHands/generic/test/annotations_without_background.json
|
||||
- name: NorthAmericaMushrooms
|
||||
val:
|
||||
img_folder: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/
|
||||
json: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/annotations_without_background.json
|
||||
- name: Packages
|
||||
val:
|
||||
img_folder: Packages/Raw/test/
|
||||
json: Packages/Raw/test/annotations_without_background.json
|
||||
- name: PascalVOC
|
||||
val:
|
||||
img_folder: PascalVOC/valid/
|
||||
json: PascalVOC/valid/annotations_without_background.json
|
||||
- name: Raccoon
|
||||
val:
|
||||
img_folder: Raccoon/Raccoon.v2-raw.coco/test/
|
||||
json: Raccoon/Raccoon.v2-raw.coco/test/annotations_without_background.json
|
||||
- name: ShellfishOpenImages
|
||||
val:
|
||||
img_folder: ShellfishOpenImages/raw/test/
|
||||
json: ShellfishOpenImages/raw/test/annotations_without_background.json
|
||||
- name: VehiclesOpenImages
|
||||
val:
|
||||
img_folder: VehiclesOpenImages/416x416/test/
|
||||
json: VehiclesOpenImages/416x416/test/annotations_without_background.json
|
||||
- name: pistols
|
||||
val:
|
||||
img_folder: pistols/export/
|
||||
json: pistols/export/test_annotations_without_background.json
|
||||
- name: pothole
|
||||
val:
|
||||
img_folder: pothole/test/
|
||||
json: pothole/test/annotations_without_background.json
|
||||
- name: thermalDogsAndPeople
|
||||
val:
|
||||
img_folder: thermalDogsAndPeople/test/
|
||||
json: thermalDogsAndPeople/test/annotations_without_background.json
|
||||
|
||||
|
||||
odinw35_prompts:
|
||||
AerialMaritimeDrone_large: '[{"id": 1, "name": "boat", "supercategory": "movable-objects"},
|
||||
{"id": 2, "name": "car", "supercategory": "movable-objects"}, {"id": 3, "name": "dock",
|
||||
"supercategory": "movable-objects"}, {"id": 4, "name": "jet ski", "supercategory": "movable-objects"},
|
||||
{"id": 5, "name": "boat lift", "supercategory": "movable-objects"}]'
|
||||
Aquarium: null
|
||||
CottontailRabbits: null
|
||||
EgoHands_generic: null
|
||||
NorthAmericaMushrooms: '[{''id'': 1, ''name'':
|
||||
''chicken of the woods'', ''supercategory'': ''mushroom''}, {''id'': 2, ''name'': ''chanterelle'', ''supercategory'': ''mushroom''}]'
|
||||
Packages: null
|
||||
PascalVOC: null
|
||||
Raccoon: null
|
||||
ShellfishOpenImages: null
|
||||
VehiclesOpenImages: null
|
||||
pistols: null
|
||||
pothole: null
|
||||
thermalDogsAndPeople: null
|
||||
253
sam3/train/configs/odinw13/odinw_text_only_positive.yaml
Normal file
253
sam3/train/configs/odinw13/odinw_text_only_positive.yaml
Normal file
@@ -0,0 +1,253 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
# python sam3/train/train.py -c configs/odinw_text_only.yaml --use-cluster 1 --partition ${PARTITION} --account ${ACCOUNT} --qos ${QoS}
|
||||
|
||||
paths:
|
||||
odinw_data_root: <YOUR_DATA_DIR>
|
||||
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
|
||||
supercategory_tuple: ${all_odinw_supercategories.${string:${submitit.job_array.task_index}}}
|
||||
# Validation transforms pipeline
|
||||
val_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution}
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: False
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
enable_segmentation: True
|
||||
# Box processing
|
||||
use_presence_eval: True
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
# Normalization parameters
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
val_batch_size: 2
|
||||
num_val_workers: 0
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
max_epochs: 1
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
prompts: ${odinw35_prompts.${supercategory_tuple.name}}
|
||||
include_negatives: true
|
||||
category_chunk_size: 20 # Note: Since we are doing AP +ve we need to include all categories!
|
||||
_partial_: true
|
||||
img_folder: ${paths.odinw_data_root}/${supercategory_tuple.val.img_folder}
|
||||
ann_file:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
transforms: ${val_transforms}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: 1
|
||||
dict_key: odinw35
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpus
|
||||
eval_mode: true # Set to false if training
|
||||
enable_segmentation: ${scratch.enable_segmentation} # Warning: Enable this if using segmentation.
|
||||
|
||||
meters:
|
||||
val:
|
||||
odinw35:
|
||||
detection:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "bbox"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/roboflow/${supercategory_tuple.name}
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.original_box_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 100
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.coco_eval_offline.CocoEvaluatorOfflineWithPredFileEvaluators
|
||||
gt_path:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
tide: False
|
||||
iou_type: "bbox"
|
||||
positive_split: true
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/${supercategory_tuple.name}
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 1
|
||||
gpus_per_node: 2
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
|
||||
job_array:
|
||||
num_tasks: 13
|
||||
task_index: 0
|
||||
|
||||
# ============================================================================
|
||||
# ODinW13 Supercategories
|
||||
# ============================================================================
|
||||
|
||||
all_odinw_supercategories:
|
||||
- name: AerialMaritimeDrone_large
|
||||
val:
|
||||
img_folder: AerialMaritimeDrone/large/test/
|
||||
json: AerialMaritimeDrone/large/test/annotations_without_background.json
|
||||
- name: Aquarium
|
||||
val:
|
||||
img_folder: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/
|
||||
json: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/annotations_without_background.json
|
||||
- name: CottontailRabbits
|
||||
val:
|
||||
img_folder: CottontailRabbits/test/
|
||||
json: CottontailRabbits/test/annotations_without_background.json
|
||||
- name: EgoHands_generic
|
||||
val:
|
||||
img_folder: EgoHands/generic/test/
|
||||
json: EgoHands/generic/test/annotations_without_background.json
|
||||
- name: NorthAmericaMushrooms
|
||||
val:
|
||||
img_folder: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/
|
||||
json: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/annotations_without_background.json
|
||||
- name: Packages
|
||||
val:
|
||||
img_folder: Packages/Raw/test/
|
||||
json: Packages/Raw/test/annotations_without_background.json
|
||||
- name: PascalVOC
|
||||
val:
|
||||
img_folder: PascalVOC/valid/
|
||||
json: PascalVOC/valid/annotations_without_background.json
|
||||
- name: Raccoon
|
||||
val:
|
||||
img_folder: Raccoon/Raccoon.v2-raw.coco/test/
|
||||
json: Raccoon/Raccoon.v2-raw.coco/test/annotations_without_background.json
|
||||
- name: ShellfishOpenImages
|
||||
val:
|
||||
img_folder: ShellfishOpenImages/raw/test/
|
||||
json: ShellfishOpenImages/raw/test/annotations_without_background.json
|
||||
- name: VehiclesOpenImages
|
||||
val:
|
||||
img_folder: VehiclesOpenImages/416x416/test/
|
||||
json: VehiclesOpenImages/416x416/test/annotations_without_background.json
|
||||
- name: pistols
|
||||
val:
|
||||
img_folder: pistols/export/
|
||||
json: pistols/export/test_annotations_without_background.json
|
||||
- name: pothole
|
||||
val:
|
||||
img_folder: pothole/test/
|
||||
json: pothole/test/annotations_without_background.json
|
||||
- name: thermalDogsAndPeople
|
||||
val:
|
||||
img_folder: thermalDogsAndPeople/test/
|
||||
json: thermalDogsAndPeople/test/annotations_without_background.json
|
||||
|
||||
|
||||
odinw35_prompts:
|
||||
AerialMaritimeDrone_large: '[{"id": 1, "name": "boat", "supercategory": "movable-objects"},
|
||||
{"id": 2, "name": "car", "supercategory": "movable-objects"}, {"id": 3, "name": "dock",
|
||||
"supercategory": "movable-objects"}, {"id": 4, "name": "jet ski", "supercategory": "movable-objects"},
|
||||
{"id": 5, "name": "boat lift", "supercategory": "movable-objects"}]'
|
||||
Aquarium: null
|
||||
CottontailRabbits: null
|
||||
EgoHands_generic: null
|
||||
NorthAmericaMushrooms: '[{''id'': 1, ''name'':
|
||||
''chicken of the woods'', ''supercategory'': ''mushroom''}, {''id'': 2, ''name'': ''chanterelle'', ''supercategory'': ''mushroom''}]'
|
||||
Packages: null
|
||||
PascalVOC: null
|
||||
Raccoon: null
|
||||
ShellfishOpenImages: null
|
||||
VehiclesOpenImages: null
|
||||
pistols: null
|
||||
pothole: null
|
||||
thermalDogsAndPeople: null
|
||||
591
sam3/train/configs/odinw13/odinw_text_only_train.yaml
Normal file
591
sam3/train/configs/odinw13/odinw_text_only_train.yaml
Normal file
@@ -0,0 +1,591 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
# python sam3/train/train.py -c configs/odinw_text_only.yaml --use-cluster 1 --partition ${PARTITION} --account ${ACCOUNT} --qos ${QoS}
|
||||
|
||||
paths:
|
||||
odinw_data_root: <YOUR_DATA_DIR>
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
|
||||
odinw_train:
|
||||
train_file: fewshot_train_shot10_seed300
|
||||
num_images: null
|
||||
supercategory_tuple: ${all_odinw_supercategories.${string:${submitit.job_array.task_index}}}
|
||||
# Training transforms pipeline
|
||||
train_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterCrowds
|
||||
- _target_: sam3.train.transforms.point_sampling.RandomizeInputBbox
|
||||
box_noise_std: 0.1
|
||||
box_noise_max: 20
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_scales
|
||||
size: ${scratch.resolution}
|
||||
min_size: 480
|
||||
rounded: false
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: ${scratch.consistent_transform}
|
||||
- _target_: sam3.train.transforms.basic_for_api.PadToSizeAPI
|
||||
size: ${scratch.resolution}
|
||||
consistent_transform: ${scratch.consistent_transform}
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterEmptyTargets
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.train_norm_mean}
|
||||
std: ${scratch.train_norm_std}
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterEmptyTargets
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterFindQueriesWithTooManyOut
|
||||
max_num_objects: ${scratch.max_ann_per_img}
|
||||
|
||||
# Validation transforms pipeline
|
||||
val_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution}
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: False
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# loss config (no mask loss)
|
||||
loss:
|
||||
_target_: sam3.train.loss.sam3_loss.Sam3LossWrapper
|
||||
matcher: ${scratch.matcher}
|
||||
o2m_weight: 2.0
|
||||
o2m_matcher:
|
||||
_target_: sam3.train.matcher.BinaryOneToManyMatcher
|
||||
alpha: 0.3
|
||||
threshold: 0.4
|
||||
topk: 4
|
||||
use_o2m_matcher_on_o2m_aux: ${scratch.use_o2m_matcher_on_o2m_aux}
|
||||
loss_fns_find:
|
||||
- _target_: sam3.train.loss.loss_fns.Boxes
|
||||
weight_dict:
|
||||
loss_bbox: 5.0
|
||||
loss_giou: 2.0
|
||||
- _target_: sam3.train.loss.loss_fns.IABCEMdetr
|
||||
weak_loss: False
|
||||
weight_dict:
|
||||
loss_ce: ${scratch.loss_ce_weight} # Change
|
||||
presence_loss: ${scratch.presence_weight} # Change
|
||||
pos_weight: ${scratch.iabce_pos_weight}
|
||||
alpha: ${scratch.iabce_alpha}
|
||||
gamma: 2
|
||||
use_presence: True # Change
|
||||
pos_focal: ${scratch.iabce_pos_focal}
|
||||
pad_n_queries: ${scratch.num_queries}
|
||||
pad_scale_pos: ${scratch.instance_query_loss_pad_scale_pos}
|
||||
|
||||
loss_fn_semantic_seg: null
|
||||
scale_by_find_batch_size: ${scratch.scale_by_find_batch_size}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
enable_segmentation: False
|
||||
use_act_checkpoint_geo_encoder: True
|
||||
input_geometry_encoder:
|
||||
_target_: sam3.model.geometry_encoders.SequenceGeometryEncoder
|
||||
pos_enc: ${scratch.pos_embed}
|
||||
encode_boxes_as_points: False
|
||||
points_direct_project: True
|
||||
points_pool: True
|
||||
points_pos_enc: True
|
||||
boxes_direct_project: True
|
||||
boxes_pool: True
|
||||
boxes_pos_enc: True
|
||||
d_model: ${scratch.d_model}
|
||||
num_layers: 3
|
||||
use_act_ckpt: ${scratch.use_act_checkpoint_geo_encoder}
|
||||
layer:
|
||||
_target_: sam3.model.encoder.TransformerEncoderLayer
|
||||
activation: "relu"
|
||||
d_model: ${scratch.d_model}
|
||||
dim_feedforward: 2048
|
||||
dropout: ${scratch.encoder_dropout}
|
||||
pos_enc_at_attn: false
|
||||
pre_norm: True
|
||||
pos_enc_at_cross_attn_queries: false
|
||||
pos_enc_at_cross_attn_keys: true
|
||||
self_attention:
|
||||
_target_: sam3.model.attention.MultiheadAttention
|
||||
attn_type: Vanilla
|
||||
num_heads: 8
|
||||
dropout: ${scratch.encoder_dropout}
|
||||
embed_dim: ${scratch.d_model}
|
||||
batch_first: False
|
||||
cross_attention:
|
||||
_target_: sam3.model.attention.MultiheadAttention
|
||||
attn_type: Vanilla
|
||||
num_heads: 8
|
||||
dropout: ${scratch.encoder_dropout}
|
||||
embed_dim: ${scratch.d_model}
|
||||
batch_first: False
|
||||
add_cls: true
|
||||
add_post_encode_proj: True
|
||||
|
||||
boxRPB: "log"
|
||||
dac: True
|
||||
use_early_fusion: true
|
||||
o2m_mask: false
|
||||
num_feature_levels: 1 # > 1 not implemented
|
||||
encoder_dropout: 0.1
|
||||
decoder_dropout: 0.1
|
||||
|
||||
tokenizer_ve:
|
||||
_target_: sam3.model.tokenizer_ve.SimpleTokenizer
|
||||
bpe_path: ${paths.bpe_path}
|
||||
|
||||
|
||||
freeze_text_tower: False
|
||||
freeze_image_tower: NoFreeze
|
||||
vis_backbone_dp: 0.0
|
||||
# Activation checkpointing (Save memory)
|
||||
use_act_checkpoint_vision_backbone: True
|
||||
use_act_checkpoint_text_backbone: True
|
||||
use_act_checkpoint_encoder: True
|
||||
use_act_checkpoint_decoder: True
|
||||
|
||||
loss: null
|
||||
# Loss parameters
|
||||
num_queries: 200
|
||||
presence_weight: 20.0
|
||||
loss_ce_weight: 20.0
|
||||
iabce_pos_weight: 5.0
|
||||
iabce_pos_focal: false
|
||||
iabce_alpha: 0.25
|
||||
instance_query_loss_pad_scale_pos: 1.0
|
||||
use_o2m_matcher_on_o2m_aux: false
|
||||
|
||||
# Model parameters
|
||||
use_instance_query: true
|
||||
d_model: 256
|
||||
pos_embed:
|
||||
_target_: sam3.model.position_encoding.PositionEmbeddingSine
|
||||
num_pos_feats: ${scratch.d_model}
|
||||
normalize: true
|
||||
scale: null
|
||||
temperature: 10000
|
||||
|
||||
# Box processing
|
||||
use_presence_eval: True
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
|
||||
# Matcher configuration
|
||||
matcher:
|
||||
_target_: sam3.train.matcher.BinaryHungarianMatcherV2
|
||||
focal: true
|
||||
cost_class: 2.0
|
||||
cost_bbox: 5.0
|
||||
cost_giou: 2.0
|
||||
alpha: 0.25
|
||||
gamma: 2
|
||||
stable: False
|
||||
scale_by_find_batch_size: True
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
consistent_transform: False
|
||||
max_ann_per_img: 200
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
train_batch_size: 1
|
||||
val_batch_size: 1
|
||||
num_train_workers: 0
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 40
|
||||
target_epoch_size: 1500
|
||||
hybrid_repeats: 1
|
||||
context_length: 2
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# Learning rate and scheduler parameters
|
||||
lr_scale: 0.1
|
||||
lr_transformer: ${times:8e-4,${scratch.lr_scale}}
|
||||
lr_vision_backbone: ${times:2.5e-4,${scratch.lr_scale}}
|
||||
lr_language_backbone: ${times:5e-5,${scratch.lr_scale}}
|
||||
lrd_vision_backbone: 0.9
|
||||
wd: 0.1
|
||||
scheduler_timescale: 20
|
||||
scheduler_warmup: 20
|
||||
scheduler_cooldown: 20
|
||||
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
# _target_: sam3.train.trainer.Trainer
|
||||
# skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: train
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all: ${odinw_train.loss}
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
limit_ids: ${odinw_train.num_images}
|
||||
transforms: ${odinw_train.train_transforms}
|
||||
load_segmentation: ${scratch.enable_segmentation}
|
||||
max_ann_per_img: 500000
|
||||
multiplier: 1
|
||||
max_train_queries: 50000
|
||||
max_val_queries: 50000
|
||||
training: true
|
||||
use_caching: False
|
||||
img_folder: ${paths.odinw_data_root}/${odinw_train.supercategory_tuple.train.img_folder}
|
||||
ann_file:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${odinw_train.supercategory_tuple.train.json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
prompts: ${odinw35_prompts.${odinw_train.supercategory_tuple.name}} #${odinw_train.supercategory_tuple.name}
|
||||
_partial_: true
|
||||
shuffle: True
|
||||
batch_size: ${scratch.train_batch_size}
|
||||
num_workers: ${scratch.num_train_workers}
|
||||
pin_memory: False
|
||||
drop_last: True
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: all
|
||||
with_seg_masks: ${scratch.enable_segmentation}
|
||||
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
load_segmentation: ${scratch.enable_segmentation}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
prompts: ${odinw35_prompts.${odinw_train.supercategory_tuple.name}}
|
||||
include_negatives: true
|
||||
category_chunk_size: 20 # Note: Since we are doing AP +ve we need to include all categories!
|
||||
_partial_: true
|
||||
img_folder: ${paths.odinw_data_root}/${odinw_train.supercategory_tuple.val.img_folder}
|
||||
ann_file:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${odinw_train.supercategory_tuple.val.json}
|
||||
transforms: ${odinw_train.val_transforms}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: 1
|
||||
dict_key: odinw35
|
||||
with_seg_masks: ${scratch.enable_segmentation}
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpus
|
||||
eval_mode: false # Set to false if training
|
||||
enable_segmentation: ${scratch.enable_segmentation} # Warning: Enable this if using segmentation.
|
||||
|
||||
meters:
|
||||
val:
|
||||
odinw35:
|
||||
detection:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "bbox"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/odinw/${odinw_train.supercategory_tuple.name}
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.original_box_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 100
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.coco_eval_offline.CocoEvaluatorOfflineWithPredFileEvaluators
|
||||
gt_path:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${odinw_train.supercategory_tuple.val.json}
|
||||
tide: False
|
||||
iou_type: "bbox"
|
||||
positive_split: False
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
optimizer:
|
||||
_target_: torch.optim.AdamW
|
||||
|
||||
gradient_clip:
|
||||
_target_: sam3.train.optim.optimizer.GradientClipper
|
||||
max_norm: 0.1
|
||||
norm_type: 2
|
||||
|
||||
param_group_modifiers:
|
||||
- _target_: sam3.train.optim.optimizer.layer_decay_param_modifier
|
||||
_partial_: True
|
||||
layer_decay_value: ${scratch.lrd_vision_backbone}
|
||||
apply_to: 'backbone.vision_backbone.trunk'
|
||||
overrides:
|
||||
- pattern: '*pos_embed*'
|
||||
value: 1.0
|
||||
|
||||
options:
|
||||
lr:
|
||||
- scheduler: # transformer and class_embed
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_transformer}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_vision_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.vision_backbone.*'
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_language_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.language_backbone.*'
|
||||
|
||||
weight_decay:
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: ${scratch.wd}
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: 0.0
|
||||
param_names:
|
||||
- '*bias*'
|
||||
module_cls_names: ['torch.nn.LayerNorm']
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/${odinw_train.supercategory_tuple.name}
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 1
|
||||
gpus_per_node: 2
|
||||
experiment_log_dir: null #${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
|
||||
# task_index: 2
|
||||
# Uncomment for job array configuration
|
||||
job_array:
|
||||
num_tasks: 13
|
||||
task_index: 0
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# ODinW13 Supercategories
|
||||
# ============================================================================
|
||||
|
||||
all_odinw_supercategories:
|
||||
- name: AerialMaritimeDrone_large
|
||||
val:
|
||||
img_folder: AerialMaritimeDrone/large/test/
|
||||
json: AerialMaritimeDrone/large/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: AerialMaritimeDrone/large/train/
|
||||
json: AerialMaritimeDrone/large/train/${odinw_train.train_file}.json
|
||||
- name: Aquarium
|
||||
val:
|
||||
img_folder: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/
|
||||
json: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: Aquarium/Aquarium Combined.v2-raw-1024.coco/train/
|
||||
json: Aquarium/Aquarium Combined.v2-raw-1024.coco/train/${odinw_train.train_file}.json
|
||||
- name: CottontailRabbits
|
||||
val:
|
||||
img_folder: CottontailRabbits/test/
|
||||
json: CottontailRabbits/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: CottontailRabbits/train/
|
||||
json: CottontailRabbits/train/${odinw_train.train_file}.json
|
||||
- name: EgoHands_generic
|
||||
val:
|
||||
img_folder: EgoHands/generic/test/
|
||||
json: EgoHands/generic/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: EgoHands/generic/train/
|
||||
json: EgoHands/generic/train/${odinw_train.train_file}.json
|
||||
- name: NorthAmericaMushrooms
|
||||
val:
|
||||
img_folder: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/
|
||||
json: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/train/
|
||||
json: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/train/${odinw_train.train_file}.json
|
||||
- name: Packages
|
||||
val:
|
||||
img_folder: Packages/Raw/test/
|
||||
json: Packages/Raw/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: Packages/Raw/train/
|
||||
json: Packages/Raw/train/${odinw_train.train_file}.json
|
||||
- name: PascalVOC
|
||||
val:
|
||||
img_folder: PascalVOC/valid/
|
||||
json: PascalVOC/valid/annotations_without_background.json
|
||||
train:
|
||||
img_folder: PascalVOC/train/
|
||||
json: PascalVOC/train/${odinw_train.train_file}.json
|
||||
- name: Raccoon
|
||||
val:
|
||||
img_folder: Raccoon/Raccoon.v2-raw.coco/test/
|
||||
json: Raccoon/Raccoon.v2-raw.coco/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: Raccoon/Raccoon.v2-raw.coco/train/
|
||||
json: Raccoon/Raccoon.v2-raw.coco/train/${odinw_train.train_file}.json
|
||||
- name: ShellfishOpenImages
|
||||
val:
|
||||
img_folder: ShellfishOpenImages/raw/test/
|
||||
json: ShellfishOpenImages/raw/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: ShellfishOpenImages/raw/train/
|
||||
json: ShellfishOpenImages/raw/train/${odinw_train.train_file}.json
|
||||
- name: VehiclesOpenImages
|
||||
val:
|
||||
img_folder: VehiclesOpenImages/416x416/test/
|
||||
json: VehiclesOpenImages/416x416/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: VehiclesOpenImages/416x416/train/
|
||||
json: VehiclesOpenImages/416x416/train/${odinw_train.train_file}.json
|
||||
- name: pistols
|
||||
val:
|
||||
img_folder: pistols/export/
|
||||
json: pistols/export/test_annotations_without_background.json
|
||||
train:
|
||||
img_folder: pistols/export/
|
||||
json: pistols/export/${odinw_train.train_file}.json
|
||||
- name: pothole
|
||||
val:
|
||||
img_folder: pothole/test/
|
||||
json: pothole/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: pothole/train/
|
||||
json: pothole/train/${odinw_train.train_file}.json
|
||||
- name: thermalDogsAndPeople
|
||||
val:
|
||||
img_folder: thermalDogsAndPeople/test/
|
||||
json: thermalDogsAndPeople/test/annotations_without_background.json
|
||||
train:
|
||||
img_folder: thermalDogsAndPeople/train/
|
||||
json: thermalDogsAndPeople/train/${odinw_train.train_file}.json
|
||||
|
||||
|
||||
odinw35_prompts:
|
||||
AerialMaritimeDrone_large: '[{"id": 1, "name": "boat", "supercategory": "movable-objects"},
|
||||
{"id": 2, "name": "car", "supercategory": "movable-objects"}, {"id": 3, "name": "dock",
|
||||
"supercategory": "movable-objects"}, {"id": 4, "name": "jet ski", "supercategory": "movable-objects"},
|
||||
{"id": 5, "name": "boat lift", "supercategory": "movable-objects"}]'
|
||||
Aquarium: null
|
||||
CottontailRabbits: null
|
||||
EgoHands_generic: null
|
||||
NorthAmericaMushrooms: '[{''id'': 1, ''name'':
|
||||
''chicken of the woods'', ''supercategory'': ''mushroom''}, {''id'': 2, ''name'': ''chanterelle'', ''supercategory'': ''mushroom''}]'
|
||||
Packages: null
|
||||
PascalVOC: null
|
||||
Raccoon: null
|
||||
ShellfishOpenImages: null
|
||||
VehiclesOpenImages: null
|
||||
pistols: null
|
||||
pothole: null
|
||||
thermalDogsAndPeople: null
|
||||
256
sam3/train/configs/odinw13/odinw_visual_only.yaml
Normal file
256
sam3/train/configs/odinw13/odinw_visual_only.yaml
Normal file
@@ -0,0 +1,256 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
# python sam3/train/train.py -c configs/odinw_text_only.yaml --use-cluster 1 --partition ${PARTITION} --account ${ACCOUNT} --qos ${QoS}
|
||||
|
||||
paths:
|
||||
odinw_data_root: <YOUR_DATA_DIR>
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
|
||||
supercategory_tuple: ${all_odinw_supercategories.${string:${submitit.job_array.task_index}}}
|
||||
# Validation transforms pipeline
|
||||
val_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution}
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: False
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.TextQueryToVisual
|
||||
keep_text_queries: false # Note: set this to false if you only want visual
|
||||
probability: 1.0 # always
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
enable_segmentation: True
|
||||
# Box processing
|
||||
use_presence_eval: True
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
# Normalization parameters
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
val_batch_size: 2
|
||||
num_val_workers: 0
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
max_epochs: 1
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
prompts: ${odinw35_prompts.${supercategory_tuple.name}}
|
||||
include_negatives: true
|
||||
category_chunk_size: 20 # Note: Since we are doing AP +ve we need to include all categories!
|
||||
_partial_: true
|
||||
img_folder: ${paths.odinw_data_root}/${supercategory_tuple.val.img_folder}
|
||||
ann_file:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
transforms: ${val_transforms}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: 1
|
||||
dict_key: odinw35
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpus
|
||||
eval_mode: true # Set to false if training
|
||||
enable_segmentation: ${scratch.enable_segmentation} # Warning: Enable this if using segmentation.
|
||||
|
||||
meters:
|
||||
val:
|
||||
odinw35:
|
||||
detection:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "bbox"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/roboflow/${supercategory_tuple.name}
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.original_box_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 100
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.coco_eval_offline.CocoEvaluatorOfflineWithPredFileEvaluators
|
||||
gt_path:
|
||||
_target_: sam3.eval.coco_reindex.reindex_coco_to_temp
|
||||
input_json_path: ${paths.odinw_data_root}/${supercategory_tuple.val.json}
|
||||
tide: False
|
||||
iou_type: "bbox"
|
||||
positive_split: true
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/${supercategory_tuple.name}
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 1
|
||||
gpus_per_node: 2
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
|
||||
job_array:
|
||||
num_tasks: 13
|
||||
task_index: 0
|
||||
|
||||
# ============================================================================
|
||||
# ODinW13 Supercategories
|
||||
# ============================================================================
|
||||
|
||||
all_odinw_supercategories:
|
||||
- name: AerialMaritimeDrone_large
|
||||
val:
|
||||
img_folder: AerialMaritimeDrone/large/test/
|
||||
json: AerialMaritimeDrone/large/test/annotations_without_background.json
|
||||
- name: Aquarium
|
||||
val:
|
||||
img_folder: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/
|
||||
json: Aquarium/Aquarium Combined.v2-raw-1024.coco/test/annotations_without_background.json
|
||||
- name: CottontailRabbits
|
||||
val:
|
||||
img_folder: CottontailRabbits/test/
|
||||
json: CottontailRabbits/test/annotations_without_background.json
|
||||
- name: EgoHands_generic
|
||||
val:
|
||||
img_folder: EgoHands/generic/test/
|
||||
json: EgoHands/generic/test/annotations_without_background.json
|
||||
- name: NorthAmericaMushrooms
|
||||
val:
|
||||
img_folder: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/
|
||||
json: NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/test/annotations_without_background.json
|
||||
- name: Packages
|
||||
val:
|
||||
img_folder: Packages/Raw/test/
|
||||
json: Packages/Raw/test/annotations_without_background.json
|
||||
- name: PascalVOC
|
||||
val:
|
||||
img_folder: PascalVOC/valid/
|
||||
json: PascalVOC/valid/annotations_without_background.json
|
||||
- name: Raccoon
|
||||
val:
|
||||
img_folder: Raccoon/Raccoon.v2-raw.coco/test/
|
||||
json: Raccoon/Raccoon.v2-raw.coco/test/annotations_without_background.json
|
||||
- name: ShellfishOpenImages
|
||||
val:
|
||||
img_folder: ShellfishOpenImages/raw/test/
|
||||
json: ShellfishOpenImages/raw/test/annotations_without_background.json
|
||||
- name: VehiclesOpenImages
|
||||
val:
|
||||
img_folder: VehiclesOpenImages/416x416/test/
|
||||
json: VehiclesOpenImages/416x416/test/annotations_without_background.json
|
||||
- name: pistols
|
||||
val:
|
||||
img_folder: pistols/export/
|
||||
json: pistols/export/test_annotations_without_background.json
|
||||
- name: pothole
|
||||
val:
|
||||
img_folder: pothole/test/
|
||||
json: pothole/test/annotations_without_background.json
|
||||
- name: thermalDogsAndPeople
|
||||
val:
|
||||
img_folder: thermalDogsAndPeople/test/
|
||||
json: thermalDogsAndPeople/test/annotations_without_background.json
|
||||
|
||||
|
||||
odinw35_prompts:
|
||||
AerialMaritimeDrone_large: '[{"id": 1, "name": "boat", "supercategory": "movable-objects"},
|
||||
{"id": 2, "name": "car", "supercategory": "movable-objects"}, {"id": 3, "name": "dock",
|
||||
"supercategory": "movable-objects"}, {"id": 4, "name": "jet ski", "supercategory": "movable-objects"},
|
||||
{"id": 5, "name": "boat lift", "supercategory": "movable-objects"}]'
|
||||
Aquarium: null
|
||||
CottontailRabbits: null
|
||||
EgoHands_generic: null
|
||||
NorthAmericaMushrooms: '[{''id'': 1, ''name'':
|
||||
''chicken of the woods'', ''supercategory'': ''mushroom''}, {''id'': 2, ''name'': ''chanterelle'', ''supercategory'': ''mushroom''}]'
|
||||
Packages: null
|
||||
PascalVOC: null
|
||||
Raccoon: null
|
||||
ShellfishOpenImages: null
|
||||
VehiclesOpenImages: null
|
||||
pistols: null
|
||||
pothole: null
|
||||
thermalDogsAndPeople: null
|
||||
539
sam3/train/configs/roboflow_v100/roboflow_v100_eval.yaml
Normal file
539
sam3/train/configs/roboflow_v100/roboflow_v100_eval.yaml
Normal file
@@ -0,0 +1,539 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
roboflow_vl_100_root: <YOUR_DATASET_DIR>
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
# Roboflow dataset configuration
|
||||
roboflow_train:
|
||||
num_images: 100 # Note: This is the number of images used for training. If null, all images are used.
|
||||
supercategory: ${all_roboflow_supercategories.${string:${submitit.job_array.task_index}}}
|
||||
|
||||
# Training transforms pipeline
|
||||
train_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterCrowds
|
||||
- _target_: sam3.train.transforms.point_sampling.RandomizeInputBbox
|
||||
box_noise_std: 0.1
|
||||
box_noise_max: 20
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_scales
|
||||
size: ${scratch.resolution}
|
||||
min_size: 480
|
||||
rounded: false
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: ${scratch.consistent_transform}
|
||||
- _target_: sam3.train.transforms.basic_for_api.PadToSizeAPI
|
||||
size: ${scratch.resolution}
|
||||
consistent_transform: ${scratch.consistent_transform}
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterEmptyTargets
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.train_norm_mean}
|
||||
std: ${scratch.train_norm_std}
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterEmptyTargets
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterFindQueriesWithTooManyOut
|
||||
max_num_objects: ${scratch.max_ann_per_img}
|
||||
|
||||
# Validation transforms pipeline
|
||||
val_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution}
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: False
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.train_norm_mean}
|
||||
std: ${scratch.train_norm_std}
|
||||
|
||||
# loss config (no mask loss)
|
||||
loss:
|
||||
_target_: sam3.train.loss.sam3_loss.Sam3LossWrapper
|
||||
matcher: ${scratch.matcher}
|
||||
o2m_weight: 2.0
|
||||
o2m_matcher:
|
||||
_target_: sam3.train.matcher.BinaryOneToManyMatcher
|
||||
alpha: 0.3
|
||||
threshold: 0.4
|
||||
topk: 4
|
||||
use_o2m_matcher_on_o2m_aux: false # Another option is true
|
||||
loss_fns_find:
|
||||
- _target_: sam3.train.loss.loss_fns.Boxes
|
||||
weight_dict:
|
||||
loss_bbox: 5.0
|
||||
loss_giou: 2.0
|
||||
- _target_: sam3.train.loss.loss_fns.IABCEMdetr
|
||||
weak_loss: False
|
||||
weight_dict:
|
||||
loss_ce: 20.0 # Another option is 100.0
|
||||
presence_loss: 20.0
|
||||
pos_weight: 10.0 # Another option is 5.0
|
||||
alpha: 0.25
|
||||
gamma: 2
|
||||
use_presence: True # Change
|
||||
pos_focal: false
|
||||
pad_n_queries: 200
|
||||
pad_scale_pos: 1.0
|
||||
|
||||
loss_fn_semantic_seg: null
|
||||
scale_by_find_batch_size: ${scratch.scale_by_find_batch_size}
|
||||
|
||||
|
||||
# NOTE: Loss to be used for training in case of segmentation
|
||||
# loss:
|
||||
# _target_: sam3.train.loss.sam3_loss.Sam3LossWrapper
|
||||
# matcher: ${scratch.matcher}
|
||||
# o2m_weight: 2.0
|
||||
# o2m_matcher:
|
||||
# _target_: sam3.train.matcher.BinaryOneToManyMatcher
|
||||
# alpha: 0.3
|
||||
# threshold: 0.4
|
||||
# topk: 4
|
||||
# use_o2m_matcher_on_o2m_aux: false
|
||||
# loss_fns_find:
|
||||
# - _target_: sam3.train.loss.loss_fns.Boxes
|
||||
# weight_dict:
|
||||
# loss_bbox: 5.0
|
||||
# loss_giou: 2.0
|
||||
# - _target_: sam3.train.loss.loss_fns.IABCEMdetr
|
||||
# weak_loss: False
|
||||
# weight_dict:
|
||||
# loss_ce: 20.0 # Another option is 100.0
|
||||
# presence_loss: 20.0
|
||||
# pos_weight: 10.0 # Another option is 5.0
|
||||
# alpha: 0.25
|
||||
# gamma: 2
|
||||
# use_presence: True # Change
|
||||
# pos_focal: false
|
||||
# pad_n_queries: 200
|
||||
# pad_scale_pos: 1.0
|
||||
# - _target_: sam3.train.loss.loss_fns.Masks
|
||||
# focal_alpha: 0.25
|
||||
# focal_gamma: 2.0
|
||||
# weight_dict:
|
||||
# loss_mask: 200.0
|
||||
# loss_dice: 10.0
|
||||
# compute_aux: false
|
||||
# loss_fn_semantic_seg:
|
||||
# _target_: sam3.losses.loss_fns.SemanticSegCriterion
|
||||
# presence_head: True
|
||||
# presence_loss: False # Change
|
||||
# focal: True
|
||||
# focal_alpha: 0.6
|
||||
# focal_gamma: 2.0
|
||||
# downsample: False
|
||||
# weight_dict:
|
||||
# loss_semantic_seg: 20.0
|
||||
# loss_semantic_presence: 1.0
|
||||
# loss_semantic_dice: 30.0
|
||||
# scale_by_find_batch_size: ${scratch.scale_by_find_batch_size}
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
enable_segmentation: False # NOTE: Set this to True to enable segmentation (masks and mask losses)
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
pos_embed:
|
||||
_target_: sam3.model.position_encoding.PositionEmbeddingSine
|
||||
num_pos_feats: ${scratch.d_model}
|
||||
normalize: true
|
||||
scale: null
|
||||
temperature: 10000
|
||||
|
||||
# Box processing
|
||||
use_presence_eval: True
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
# Matcher configuration
|
||||
matcher:
|
||||
_target_: sam3.train.matcher.BinaryHungarianMatcherV2
|
||||
focal: true # with `focal: true` it is equivalent to BinaryFocalHungarianMatcher
|
||||
cost_class: 2.0
|
||||
cost_bbox: 5.0
|
||||
cost_giou: 2.0
|
||||
alpha: 0.25
|
||||
gamma: 2
|
||||
stable: False
|
||||
scale_by_find_batch_size: True
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
consistent_transform: False
|
||||
max_ann_per_img: 200
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
num_train_workers: 10
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
target_epoch_size: 1500
|
||||
hybrid_repeats: 1
|
||||
context_length: 2
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# Learning rate and scheduler parameters
|
||||
lr_scale: 0.1
|
||||
lr_transformer: ${times:8e-4,${scratch.lr_scale}}
|
||||
lr_vision_backbone: ${times:2.5e-4,${scratch.lr_scale}}
|
||||
lr_language_backbone: ${times:5e-5,${scratch.lr_scale}}
|
||||
lrd_vision_backbone: 0.9
|
||||
wd: 0.1
|
||||
scheduler_timescale: 20
|
||||
scheduler_warmup: 20
|
||||
scheduler_cooldown: 20
|
||||
|
||||
val_batch_size: 1
|
||||
collate_fn_val:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: roboflow100
|
||||
with_seg_masks: ${scratch.enable_segmentation} # Note: Set this to true if using segmentation masks!
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
train_batch_size: 1
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: all
|
||||
with_seg_masks: ${scratch.enable_segmentation} # Note: Set this to true if using segmentation masks!
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: 20
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
gradient_accumulation_steps: ${scratch.gradient_accumulation_steps}
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all: ${roboflow_train.loss}
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
limit_ids: ${roboflow_train.num_images}
|
||||
transforms: ${roboflow_train.train_transforms}
|
||||
load_segmentation: ${scratch.enable_segmentation}
|
||||
max_ann_per_img: 500000
|
||||
multiplier: 1
|
||||
max_train_queries: 50000
|
||||
max_val_queries: 50000
|
||||
training: true
|
||||
use_caching: False
|
||||
img_folder: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/train/
|
||||
ann_file: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/train/_annotations.coco.json
|
||||
|
||||
shuffle: True
|
||||
batch_size: ${scratch.train_batch_size}
|
||||
num_workers: ${scratch.num_train_workers}
|
||||
pin_memory: True
|
||||
drop_last: True
|
||||
collate_fn: ${scratch.collate_fn}
|
||||
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
load_segmentation: ${scratch.enable_segmentation}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
include_negatives: true
|
||||
category_chunk_size: 2 # Note: You can increase this based on the memory of your GPU.
|
||||
_partial_: true
|
||||
img_folder: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/test/
|
||||
ann_file: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/test/_annotations.coco.json
|
||||
transforms: ${roboflow_train.val_transforms}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn: ${scratch.collate_fn_val}
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpus
|
||||
eval_mode: true
|
||||
enable_segmentation: ${scratch.enable_segmentation} # Warning: Enable this if using segmentation.
|
||||
|
||||
meters:
|
||||
val:
|
||||
roboflow100:
|
||||
detection:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "bbox"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/roboflow/${roboflow_train.supercategory}
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.original_box_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 100
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.coco_eval_offline.CocoEvaluatorOfflineWithPredFileEvaluators
|
||||
gt_path: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/test/_annotations.coco.json
|
||||
tide: False
|
||||
iou_type: "bbox"
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
optimizer:
|
||||
_target_: torch.optim.AdamW
|
||||
|
||||
gradient_clip:
|
||||
_target_: sam3.train.optim.optimizer.GradientClipper
|
||||
max_norm: 0.1
|
||||
norm_type: 2
|
||||
|
||||
param_group_modifiers:
|
||||
- _target_: sam3.train.optim.optimizer.layer_decay_param_modifier
|
||||
_partial_: True
|
||||
layer_decay_value: ${scratch.lrd_vision_backbone}
|
||||
apply_to: 'backbone.vision_backbone.trunk'
|
||||
overrides:
|
||||
- pattern: '*pos_embed*'
|
||||
value: 1.0
|
||||
|
||||
options:
|
||||
lr:
|
||||
- scheduler: # transformer and class_embed
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_transformer}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_vision_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.vision_backbone.*'
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_language_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.language_backbone.*'
|
||||
|
||||
weight_decay:
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: ${scratch.wd}
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: 0.0
|
||||
param_names:
|
||||
- '*bias*'
|
||||
module_cls_names: ['torch.nn.LayerNorm']
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/${roboflow_train.supercategory}
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 1
|
||||
gpus_per_node: 2
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
# Uncomment for job array configuration
|
||||
job_array:
|
||||
num_tasks: 100
|
||||
task_index: 0
|
||||
|
||||
# ============================================================================
|
||||
# Available Roboflow Supercategories (for reference)
|
||||
# ============================================================================
|
||||
|
||||
all_roboflow_supercategories:
|
||||
- -grccs
|
||||
- zebrasatasturias
|
||||
- cod-mw-warzone
|
||||
- canalstenosis
|
||||
- label-printing-defect-version-2
|
||||
- new-defects-in-wood
|
||||
- orionproducts
|
||||
- aquarium-combined
|
||||
- varroa-mites-detection--test-set
|
||||
- clashroyalechardetector
|
||||
- stomata-cells
|
||||
- halo-infinite-angel-videogame
|
||||
- pig-detection
|
||||
- urine-analysis1
|
||||
- aerial-sheep
|
||||
- orgharvest
|
||||
- actions
|
||||
- mahjong
|
||||
- liver-disease
|
||||
- needle-base-tip-min-max
|
||||
- wheel-defect-detection
|
||||
- aircraft-turnaround-dataset
|
||||
- xray
|
||||
- wildfire-smoke
|
||||
- spinefrxnormalvindr
|
||||
- ufba-425
|
||||
- speech-bubbles-detection
|
||||
- train
|
||||
- pill
|
||||
- truck-movement
|
||||
- car-logo-detection
|
||||
- inbreast
|
||||
- sea-cucumbers-new-tiles
|
||||
- uavdet-small
|
||||
- penguin-finder-seg
|
||||
- aerial-airport
|
||||
- bibdetection
|
||||
- taco-trash-annotations-in-context
|
||||
- bees
|
||||
- recode-waste
|
||||
- screwdetectclassification
|
||||
- wine-labels
|
||||
- aerial-cows
|
||||
- into-the-vale
|
||||
- gwhd2021
|
||||
- lacrosse-object-detection
|
||||
- defect-detection
|
||||
- dataconvert
|
||||
- x-ray-id
|
||||
- ball
|
||||
- tube
|
||||
- 2024-frc
|
||||
- crystal-clean-brain-tumors-mri-dataset
|
||||
- grapes-5
|
||||
- human-detection-in-floods
|
||||
- buoy-onboarding
|
||||
- apoce-aerial-photographs-for-object-detection-of-construction-equipment
|
||||
- l10ul502
|
||||
- floating-waste
|
||||
- deeppcb
|
||||
- ism-band-packet-detection
|
||||
- weeds4
|
||||
- invoice-processing
|
||||
- thermal-cheetah
|
||||
- tomatoes-2
|
||||
- marine-sharks
|
||||
- peixos-fish
|
||||
- sssod
|
||||
- aerial-pool
|
||||
- countingpills
|
||||
- asphaltdistressdetection
|
||||
- roboflow-trained-dataset
|
||||
- everdaynew
|
||||
- underwater-objects
|
||||
- soda-bottles
|
||||
- dentalai
|
||||
- jellyfish
|
||||
- deepfruits
|
||||
- activity-diagrams
|
||||
- circuit-voltages
|
||||
- all-elements
|
||||
- macro-segmentation
|
||||
- exploratorium-daphnia
|
||||
- signatures
|
||||
- conveyor-t-shirts
|
||||
- fruitjes
|
||||
- grass-weeds
|
||||
- infraredimageofpowerequipment
|
||||
- 13-lkc01
|
||||
- wb-prova
|
||||
- flir-camera-objects
|
||||
- paper-parts
|
||||
- football-player-detection
|
||||
- trail-camera
|
||||
- smd-components
|
||||
- water-meter
|
||||
- nih-xray
|
||||
- the-dreidel-project
|
||||
- electric-pylon-detection-in-rsi
|
||||
- cable-damage
|
||||
@@ -0,0 +1,539 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
roboflow_vl_100_root: <YOUR_DATASET_DIR>
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
|
||||
# Roboflow dataset configuration
|
||||
roboflow_train:
|
||||
num_images: 100 # Note: This is the number of images used for training. If null, all images are used.
|
||||
supercategory: ${all_roboflow_supercategories.${string:${submitit.job_array.task_index}}}
|
||||
|
||||
# Training transforms pipeline
|
||||
train_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterCrowds
|
||||
- _target_: sam3.train.transforms.point_sampling.RandomizeInputBbox
|
||||
box_noise_std: 0.1
|
||||
box_noise_max: 20
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_scales
|
||||
size: ${scratch.resolution}
|
||||
min_size: 480
|
||||
rounded: false
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: ${scratch.consistent_transform}
|
||||
- _target_: sam3.train.transforms.basic_for_api.PadToSizeAPI
|
||||
size: ${scratch.resolution}
|
||||
consistent_transform: ${scratch.consistent_transform}
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterEmptyTargets
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.train_norm_mean}
|
||||
std: ${scratch.train_norm_std}
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterEmptyTargets
|
||||
- _target_: sam3.train.transforms.filter_query_transforms.FlexibleFilterFindGetQueries
|
||||
query_filter:
|
||||
_target_: sam3.train.transforms.filter_query_transforms.FilterFindQueriesWithTooManyOut
|
||||
max_num_objects: ${scratch.max_ann_per_img}
|
||||
|
||||
# Validation transforms pipeline
|
||||
val_transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution}
|
||||
max_size:
|
||||
_target_: sam3.train.transforms.basic.get_random_resize_max_size
|
||||
size: ${scratch.resolution}
|
||||
square: true
|
||||
consistent_transform: False
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.train_norm_mean}
|
||||
std: ${scratch.train_norm_std}
|
||||
|
||||
# loss config (no mask loss)
|
||||
loss:
|
||||
_target_: sam3.train.loss.sam3_loss.Sam3LossWrapper
|
||||
matcher: ${scratch.matcher}
|
||||
o2m_weight: 2.0
|
||||
o2m_matcher:
|
||||
_target_: sam3.train.matcher.BinaryOneToManyMatcher
|
||||
alpha: 0.3
|
||||
threshold: 0.4
|
||||
topk: 4
|
||||
use_o2m_matcher_on_o2m_aux: false # Another option is true
|
||||
loss_fns_find:
|
||||
- _target_: sam3.train.loss.loss_fns.Boxes
|
||||
weight_dict:
|
||||
loss_bbox: 5.0
|
||||
loss_giou: 2.0
|
||||
- _target_: sam3.train.loss.loss_fns.IABCEMdetr
|
||||
weak_loss: False
|
||||
weight_dict:
|
||||
loss_ce: 20.0 # Another option is 100.0
|
||||
presence_loss: 20.0
|
||||
pos_weight: 10.0 # Another option is 5.0
|
||||
alpha: 0.25
|
||||
gamma: 2
|
||||
use_presence: True # Change
|
||||
pos_focal: false
|
||||
pad_n_queries: 200
|
||||
pad_scale_pos: 1.0
|
||||
|
||||
loss_fn_semantic_seg: null
|
||||
scale_by_find_batch_size: ${scratch.scale_by_find_batch_size}
|
||||
|
||||
|
||||
# NOTE: Loss to be used for training in case of segmentation
|
||||
# loss:
|
||||
# _target_: sam3.train.loss.sam3_loss.Sam3LossWrapper
|
||||
# matcher: ${scratch.matcher}
|
||||
# o2m_weight: 2.0
|
||||
# o2m_matcher:
|
||||
# _target_: sam3.train.matcher.BinaryOneToManyMatcher
|
||||
# alpha: 0.3
|
||||
# threshold: 0.4
|
||||
# topk: 4
|
||||
# use_o2m_matcher_on_o2m_aux: false
|
||||
# loss_fns_find:
|
||||
# - _target_: sam3.train.loss.loss_fns.Boxes
|
||||
# weight_dict:
|
||||
# loss_bbox: 5.0
|
||||
# loss_giou: 2.0
|
||||
# - _target_: sam3.train.loss.loss_fns.IABCEMdetr
|
||||
# weak_loss: False
|
||||
# weight_dict:
|
||||
# loss_ce: 20.0 # Another option is 100.0
|
||||
# presence_loss: 20.0
|
||||
# pos_weight: 10.0 # Another option is 5.0
|
||||
# alpha: 0.25
|
||||
# gamma: 2
|
||||
# use_presence: True # Change
|
||||
# pos_focal: false
|
||||
# pad_n_queries: 200
|
||||
# pad_scale_pos: 1.0
|
||||
# - _target_: sam3.train.loss.loss_fns.Masks
|
||||
# focal_alpha: 0.25
|
||||
# focal_gamma: 2.0
|
||||
# weight_dict:
|
||||
# loss_mask: 200.0
|
||||
# loss_dice: 10.0
|
||||
# compute_aux: false
|
||||
# loss_fn_semantic_seg:
|
||||
# _target_: sam3.losses.loss_fns.SemanticSegCriterion
|
||||
# presence_head: True
|
||||
# presence_loss: False # Change
|
||||
# focal: True
|
||||
# focal_alpha: 0.6
|
||||
# focal_gamma: 2.0
|
||||
# downsample: False
|
||||
# weight_dict:
|
||||
# loss_semantic_seg: 20.0
|
||||
# loss_semantic_presence: 1.0
|
||||
# loss_semantic_dice: 30.0
|
||||
# scale_by_find_batch_size: ${scratch.scale_by_find_batch_size}
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
enable_segmentation: False # NOTE: Set this to True to enable segmentation (masks and mask losses)
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
pos_embed:
|
||||
_target_: sam3.model.position_encoding.PositionEmbeddingSine
|
||||
num_pos_feats: ${scratch.d_model}
|
||||
normalize: true
|
||||
scale: null
|
||||
temperature: 10000
|
||||
|
||||
# Box processing
|
||||
use_presence_eval: True
|
||||
original_box_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessImage
|
||||
max_dets_per_img: -1 # infinite detections
|
||||
use_original_ids: true
|
||||
use_original_sizes_box: true
|
||||
use_presence: ${scratch.use_presence_eval}
|
||||
|
||||
# Matcher configuration
|
||||
matcher:
|
||||
_target_: sam3.train.matcher.BinaryHungarianMatcherV2
|
||||
focal: true # with `focal: true` it is equivalent to BinaryFocalHungarianMatcher
|
||||
cost_class: 2.0
|
||||
cost_bbox: 5.0
|
||||
cost_giou: 2.0
|
||||
alpha: 0.25
|
||||
gamma: 2
|
||||
stable: False
|
||||
scale_by_find_batch_size: True
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
consistent_transform: False
|
||||
max_ann_per_img: 200
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
# Training parameters
|
||||
num_train_workers: 10
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
target_epoch_size: 1500
|
||||
hybrid_repeats: 1
|
||||
context_length: 2
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
# Learning rate and scheduler parameters
|
||||
lr_scale: 0.1
|
||||
lr_transformer: ${times:8e-4,${scratch.lr_scale}}
|
||||
lr_vision_backbone: ${times:2.5e-4,${scratch.lr_scale}}
|
||||
lr_language_backbone: ${times:5e-5,${scratch.lr_scale}}
|
||||
lrd_vision_backbone: 0.9
|
||||
wd: 0.1
|
||||
scheduler_timescale: 20
|
||||
scheduler_warmup: 20
|
||||
scheduler_cooldown: 20
|
||||
|
||||
val_batch_size: 1
|
||||
collate_fn_val:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: roboflow100
|
||||
with_seg_masks: ${scratch.enable_segmentation} # Note: Set this to true if using segmentation masks!
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
train_batch_size: 1
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: all
|
||||
with_seg_masks: ${scratch.enable_segmentation} # Note: Set this to true if using segmentation masks!
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: 20
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: train
|
||||
gradient_accumulation_steps: ${scratch.gradient_accumulation_steps}
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all: ${roboflow_train.loss}
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
limit_ids: ${roboflow_train.num_images}
|
||||
transforms: ${roboflow_train.train_transforms}
|
||||
load_segmentation: ${scratch.enable_segmentation}
|
||||
max_ann_per_img: 500000
|
||||
multiplier: 1
|
||||
max_train_queries: 50000
|
||||
max_val_queries: 50000
|
||||
training: true
|
||||
use_caching: False
|
||||
img_folder: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/train/
|
||||
ann_file: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/train/_annotations.coco.json
|
||||
|
||||
shuffle: True
|
||||
batch_size: ${scratch.train_batch_size}
|
||||
num_workers: ${scratch.num_train_workers}
|
||||
pin_memory: True
|
||||
drop_last: True
|
||||
collate_fn: ${scratch.collate_fn}
|
||||
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
load_segmentation: ${scratch.enable_segmentation}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.COCO_FROM_JSON
|
||||
include_negatives: true
|
||||
category_chunk_size: 2 # Note: You can increase this based on the memory of your GPU.
|
||||
_partial_: true
|
||||
img_folder: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/test/
|
||||
ann_file: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/test/_annotations.coco.json
|
||||
transforms: ${roboflow_train.val_transforms}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn: ${scratch.collate_fn_val}
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_image_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
device: cpus
|
||||
eval_mode: false
|
||||
enable_segmentation: ${scratch.enable_segmentation} # Warning: Enable this if using segmentation.
|
||||
|
||||
meters:
|
||||
val:
|
||||
roboflow100:
|
||||
detection:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "bbox"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/roboflow/${roboflow_train.supercategory}
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.original_box_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 100
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.coco_eval_offline.CocoEvaluatorOfflineWithPredFileEvaluators
|
||||
gt_path: ${paths.roboflow_vl_100_root}/${roboflow_train.supercategory}/test/_annotations.coco.json
|
||||
tide: False
|
||||
iou_type: "bbox"
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
optimizer:
|
||||
_target_: torch.optim.AdamW
|
||||
|
||||
gradient_clip:
|
||||
_target_: sam3.train.optim.optimizer.GradientClipper
|
||||
max_norm: 0.1
|
||||
norm_type: 2
|
||||
|
||||
param_group_modifiers:
|
||||
- _target_: sam3.train.optim.optimizer.layer_decay_param_modifier
|
||||
_partial_: True
|
||||
layer_decay_value: ${scratch.lrd_vision_backbone}
|
||||
apply_to: 'backbone.vision_backbone.trunk'
|
||||
overrides:
|
||||
- pattern: '*pos_embed*'
|
||||
value: 1.0
|
||||
|
||||
options:
|
||||
lr:
|
||||
- scheduler: # transformer and class_embed
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_transformer}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_vision_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.vision_backbone.*'
|
||||
- scheduler:
|
||||
_target_: sam3.train.optim.schedulers.InverseSquareRootParamScheduler
|
||||
base_lr: ${scratch.lr_language_backbone}
|
||||
timescale: ${scratch.scheduler_timescale}
|
||||
warmup_steps: ${scratch.scheduler_warmup}
|
||||
cooldown_steps: ${scratch.scheduler_cooldown}
|
||||
param_names:
|
||||
- 'backbone.language_backbone.*'
|
||||
|
||||
weight_decay:
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: ${scratch.wd}
|
||||
- scheduler:
|
||||
_target_: fvcore.common.param_scheduler.ConstantParamScheduler
|
||||
value: 0.0
|
||||
param_names:
|
||||
- '*bias*'
|
||||
module_cls_names: ['torch.nn.LayerNorm']
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/${roboflow_train.supercategory}
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 1
|
||||
gpus_per_node: 2
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
# Uncomment for job array configuration
|
||||
job_array:
|
||||
num_tasks: 100
|
||||
task_index: 0
|
||||
|
||||
# ============================================================================
|
||||
# Available Roboflow Supercategories (for reference)
|
||||
# ============================================================================
|
||||
|
||||
all_roboflow_supercategories:
|
||||
- -grccs
|
||||
- zebrasatasturias
|
||||
- cod-mw-warzone
|
||||
- canalstenosis
|
||||
- label-printing-defect-version-2
|
||||
- new-defects-in-wood
|
||||
- orionproducts
|
||||
- aquarium-combined
|
||||
- varroa-mites-detection--test-set
|
||||
- clashroyalechardetector
|
||||
- stomata-cells
|
||||
- halo-infinite-angel-videogame
|
||||
- pig-detection
|
||||
- urine-analysis1
|
||||
- aerial-sheep
|
||||
- orgharvest
|
||||
- actions
|
||||
- mahjong
|
||||
- liver-disease
|
||||
- needle-base-tip-min-max
|
||||
- wheel-defect-detection
|
||||
- aircraft-turnaround-dataset
|
||||
- xray
|
||||
- wildfire-smoke
|
||||
- spinefrxnormalvindr
|
||||
- ufba-425
|
||||
- speech-bubbles-detection
|
||||
- train
|
||||
- pill
|
||||
- truck-movement
|
||||
- car-logo-detection
|
||||
- inbreast
|
||||
- sea-cucumbers-new-tiles
|
||||
- uavdet-small
|
||||
- penguin-finder-seg
|
||||
- aerial-airport
|
||||
- bibdetection
|
||||
- taco-trash-annotations-in-context
|
||||
- bees
|
||||
- recode-waste
|
||||
- screwdetectclassification
|
||||
- wine-labels
|
||||
- aerial-cows
|
||||
- into-the-vale
|
||||
- gwhd2021
|
||||
- lacrosse-object-detection
|
||||
- defect-detection
|
||||
- dataconvert
|
||||
- x-ray-id
|
||||
- ball
|
||||
- tube
|
||||
- 2024-frc
|
||||
- crystal-clean-brain-tumors-mri-dataset
|
||||
- grapes-5
|
||||
- human-detection-in-floods
|
||||
- buoy-onboarding
|
||||
- apoce-aerial-photographs-for-object-detection-of-construction-equipment
|
||||
- l10ul502
|
||||
- floating-waste
|
||||
- deeppcb
|
||||
- ism-band-packet-detection
|
||||
- weeds4
|
||||
- invoice-processing
|
||||
- thermal-cheetah
|
||||
- tomatoes-2
|
||||
- marine-sharks
|
||||
- peixos-fish
|
||||
- sssod
|
||||
- aerial-pool
|
||||
- countingpills
|
||||
- asphaltdistressdetection
|
||||
- roboflow-trained-dataset
|
||||
- everdaynew
|
||||
- underwater-objects
|
||||
- soda-bottles
|
||||
- dentalai
|
||||
- jellyfish
|
||||
- deepfruits
|
||||
- activity-diagrams
|
||||
- circuit-voltages
|
||||
- all-elements
|
||||
- macro-segmentation
|
||||
- exploratorium-daphnia
|
||||
- signatures
|
||||
- conveyor-t-shirts
|
||||
- fruitjes
|
||||
- grass-weeds
|
||||
- infraredimageofpowerequipment
|
||||
- 13-lkc01
|
||||
- wb-prova
|
||||
- flir-camera-objects
|
||||
- paper-parts
|
||||
- football-player-detection
|
||||
- trail-camera
|
||||
- smd-components
|
||||
- water-meter
|
||||
- nih-xray
|
||||
- the-dreidel-project
|
||||
- electric-pylon-detection-in-rsi
|
||||
- cable-damage
|
||||
174
sam3/train/configs/saco_video_evals/saco_veval_sav_test.yaml
Normal file
174
sam3/train/configs/saco_video_evals/saco_veval_sav_test.yaml
Normal file
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_sav_test
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_test.json
|
||||
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to a square of side ${scratch.resolution}
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: True
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_sav_test
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_test.json
|
||||
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to a square of side ${scratch.resolution}
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: False
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
174
sam3/train/configs/saco_video_evals/saco_veval_sav_val.yaml
Normal file
174
sam3/train/configs/saco_video_evals/saco_veval_sav_val.yaml
Normal file
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_sav_val
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_val.json
|
||||
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to a square of side ${scratch.resolution}
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: True
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_sav_val
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_val.json
|
||||
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to a square of side ${scratch.resolution}
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: False
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_smartglasses_test
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_test.json
|
||||
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to a square of side ${scratch.resolution}
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: True
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Chage this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_smartglasses_test
|
||||
experiment_log_dir: <YOUR EXPERIMENET LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_test.json
|
||||
ytvis_dir : <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to 1024x1024 resolution
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: False
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Chage this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_smartglasses_val
|
||||
experiment_log_dir: <YOUR EXPERIMENET LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_val.json
|
||||
ytvis_dir : <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to 1024x1024 resolution
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: True
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Chage this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_smartglasses_val
|
||||
experiment_log_dir: <YOUR EXPERIMENET LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_val.json
|
||||
ytvis_dir : <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to 1024x1024 resolution
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: False
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
174
sam3/train/configs/saco_video_evals/saco_veval_yt1b_test.yaml
Normal file
174
sam3/train/configs/saco_video_evals/saco_veval_yt1b_test.yaml
Normal file
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Chage this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_yt1b_test
|
||||
experiment_log_dir: <YOUR EXPERIMENET LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_test.json
|
||||
ytvis_dir : <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to 1024x1024 resolution
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: True
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Chage this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_yt1b_test
|
||||
experiment_log_dir: <YOUR EXPERIMENET LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_test.json
|
||||
ytvis_dir : <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to 1024x1024 resolution
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: False
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
174
sam3/train/configs/saco_video_evals/saco_veval_yt1b_val.yaml
Normal file
174
sam3/train/configs/saco_video_evals/saco_veval_yt1b_val.yaml
Normal file
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Chage this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_yt1b_val
|
||||
experiment_log_dir: <YOUR EXPERIMENET LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_val.json
|
||||
ytvis_dir : <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to 1024x1024 resolution
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: True
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 only last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,174 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (Change this to your own paths)
|
||||
# ============================================================================
|
||||
paths:
|
||||
|
||||
dump_file_name: saco_veval_yt1b_val
|
||||
experiment_log_dir: <YOUR EXPERIMENT LOG_DIR>
|
||||
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_val.json
|
||||
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
|
||||
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
|
||||
num_videos: null
|
||||
|
||||
# ============================================================================
|
||||
# Different helper parameters and functions
|
||||
# ============================================================================
|
||||
scratch:
|
||||
vid_mask_postprocessor:
|
||||
_target_: sam3.eval.postprocessors.PostProcessNullOp
|
||||
|
||||
use_presence_eval: True
|
||||
|
||||
video_transforms_val:
|
||||
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
|
||||
transforms:
|
||||
- _target_: sam3.train.transforms.segmentation.DecodeRle
|
||||
# resize the image to a square of side ${scratch.resolution} (1008 here; originally 1024)
|
||||
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
|
||||
sizes: ${scratch.resolution} # originally `resolution: 1024`
|
||||
square: true
|
||||
consistent_transform: true
|
||||
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
|
||||
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
|
||||
mean: ${scratch.val_norm_mean}
|
||||
std: ${scratch.val_norm_std}
|
||||
|
||||
# Model parameters
|
||||
d_model: 256
|
||||
|
||||
# Image processing parameters
|
||||
resolution: 1008
|
||||
|
||||
# Normalization parameters
|
||||
train_norm_mean: [0.5, 0.5, 0.5]
|
||||
train_norm_std: [0.5, 0.5, 0.5]
|
||||
val_norm_mean: [0.5, 0.5, 0.5]
|
||||
val_norm_std: [0.5, 0.5, 0.5]
|
||||
|
||||
val_batch_size: 1
|
||||
num_val_workers: 0
|
||||
max_data_epochs: 20
|
||||
hybrid_repeats: 1
|
||||
gather_pred_via_filesys: false
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
_target_: sam3.train.trainer.Trainer
|
||||
skip_saving_ckpts: true
|
||||
empty_gpu_mem_cache_after_eval: True
|
||||
skip_first_val: True
|
||||
max_epochs: ${scratch.max_data_epochs}
|
||||
accelerator: cuda
|
||||
seed_value: 123
|
||||
val_epoch_freq: 10
|
||||
mode: val
|
||||
|
||||
distributed:
|
||||
backend: nccl
|
||||
find_unused_parameters: True
|
||||
gradient_as_bucket_view: True
|
||||
|
||||
loss:
|
||||
all:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
default:
|
||||
_target_: sam3.train.loss.sam3_loss.DummyLoss
|
||||
|
||||
data:
|
||||
train: null
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
|
||||
limit_ids: ${paths.num_videos}
|
||||
img_folder: ${paths.ytvis_dir}
|
||||
ann_file: ${paths.ytvis_json}
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
|
||||
transforms: ${scratch.video_transforms_val}
|
||||
max_ann_per_img: 100000 # filtered in transforms
|
||||
max_val_queries: 100000
|
||||
multiplier: 1
|
||||
load_segmentation: true
|
||||
training: false
|
||||
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: True
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: ytvis_val
|
||||
with_seg_masks: true
|
||||
|
||||
|
||||
model:
|
||||
_target_: sam3.model_builder.build_sam3_video_model
|
||||
bpe_path: ${paths.bpe_path}
|
||||
has_presence_token: True
|
||||
geo_encoder_use_img_cross_attn: True
|
||||
apply_temporal_disambiguation: False
|
||||
|
||||
meters:
|
||||
val:
|
||||
ytvis_val:
|
||||
pred_file: # key
|
||||
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
|
||||
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
|
||||
postprocessor: ${scratch.vid_mask_postprocessor}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
|
||||
optim:
|
||||
amp:
|
||||
enabled: True
|
||||
amp_dtype: bfloat16
|
||||
|
||||
|
||||
checkpoint:
|
||||
save_dir: ${launcher.experiment_log_dir}/checkpoints
|
||||
save_freq: 0 # 0 means only the last checkpoint is saved.
|
||||
|
||||
|
||||
logging:
|
||||
tensorboard_writer:
|
||||
_target_: sam3.train.utils.logger.make_tensorboard_logger
|
||||
log_dir: ${launcher.experiment_log_dir}/tensorboard
|
||||
flush_secs: 120
|
||||
should_log: True
|
||||
wandb_writer: null
|
||||
log_dir: ${launcher.experiment_log_dir}/logs/
|
||||
log_freq: 10
|
||||
|
||||
# ============================================================================
|
||||
# Launcher and Submitit Configuration
|
||||
# ============================================================================
|
||||
|
||||
launcher:
|
||||
num_nodes: 8
|
||||
gpus_per_node: 8
|
||||
experiment_log_dir: ${paths.experiment_log_dir}
|
||||
multiprocessing_context: forkserver
|
||||
|
||||
submitit:
|
||||
account: null
|
||||
partition: null
|
||||
qos: null
|
||||
timeout_hour: 72
|
||||
use_cluster: True
|
||||
cpus_per_task: 10
|
||||
port_range: [10000, 65000]
|
||||
constraint: null
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_bdd100k/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_bdd100k_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/bdd100k/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_bdd100k
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_bdd100k: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_bdd100k
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_droid/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_droid_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/droid/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_droid
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_droid: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_droid
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_ego4d/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_ego4d_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/ego4d/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_ego4d
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_ego4d: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_ego4d
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_fathomnet/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_fathomnet_test.json
|
||||
img_path: ${paths.silver_img_path}/fathomnet/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_fathomnet
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_fathomnet: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_fathomnet
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_food_rec/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_food_rec_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/food_rec/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_food_rec
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_food_rec: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_food_rec
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_geode/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_geode_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/geode/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_geode
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_geode: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_geode
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_inaturalist/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_inaturalist_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/inaturalist/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_inaturalist
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_inaturalist: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_inaturalist
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_nga_art/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_nga_art_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/nga/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_nga_art
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_nga_art: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_nga_art
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_sav/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_sav_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/sav/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_sav
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_sav: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_sav
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
@@ -0,0 +1,64 @@
|
||||
# @package _global_
|
||||
defaults:
|
||||
- /configs/eval_base.yaml
|
||||
- _self_
|
||||
|
||||
# ============================================================================
|
||||
# Paths Configuration (you can override here, but it shouldn't require further changes if eval_base.yaml is correct)
|
||||
# ============================================================================
|
||||
paths:
|
||||
experiment_log_dir: ${paths.base_experiment_log_dir}/silver_yt1b/
|
||||
coco_gt: ${paths.base_annotation_path_silver}/silver_yt1b_merged_test.json
|
||||
img_path: ${paths.silver_img_path}/yt1b/
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Trainer Configuration
|
||||
# ============================================================================
|
||||
|
||||
trainer:
|
||||
data:
|
||||
val:
|
||||
_target_: sam3.train.data.torch_dataset.TorchDataset
|
||||
dataset:
|
||||
_target_: sam3.train.data.sam3_image_dataset.Sam3ImageDataset
|
||||
coco_json_loader:
|
||||
_target_: sam3.train.data.coco_json_loaders.SAM3_EVAL_API_FROM_JSON_NP
|
||||
_partial_: true
|
||||
img_folder: ${paths.img_path}
|
||||
ann_file: ${paths.coco_gt}
|
||||
transforms: ${scratch.base_val_transform}
|
||||
max_ann_per_img: 100000
|
||||
multiplier: 1
|
||||
training: false
|
||||
|
||||
shuffle: False
|
||||
batch_size: ${scratch.val_batch_size}
|
||||
num_workers: ${scratch.num_val_workers}
|
||||
pin_memory: False
|
||||
drop_last: False
|
||||
collate_fn:
|
||||
_target_: sam3.train.data.collator.collate_fn_api
|
||||
_partial_: true
|
||||
repeats: ${scratch.hybrid_repeats}
|
||||
dict_key: silver_yt1b
|
||||
|
||||
meters:
|
||||
val:
|
||||
silver_yt1b: # this key matches the "dict_key" in the dataloader's collate function
|
||||
cgf1:
|
||||
_target_: sam3.eval.coco_writer.PredictionDumper
|
||||
iou_type: "segm"
|
||||
dump_dir: ${launcher.experiment_log_dir}/dumps/silver_yt1b
|
||||
merge_predictions: True
|
||||
postprocessor: ${scratch.mask_postprocessor_thresholded}
|
||||
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
|
||||
maxdets: 1000000 # no limit
|
||||
pred_file_evaluators:
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "bbox"
|
||||
- _target_: sam3.eval.cgf1_eval.CGF1Evaluator
|
||||
gt_path: ${paths.coco_gt}
|
||||
iou_type: "segm"
|
||||
Reference in New Issue
Block a user