Initial commit

fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
facebook-github-bot
2025-11-18 23:07:42 -08:00
commit a13e358df4
504 changed files with 122758 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_sav_test
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_test.json
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
# resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: True
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_sav_test
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_test.json
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
# resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: False
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_sav_val
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_val.json
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
# resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: True
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_sav_val
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_sav_val.json
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
# resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: False
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_smartglasses_test
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_test.json
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
# resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: True
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_smartglasses_test
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_test.json
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
# resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: False
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_smartglasses_val
experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_val.json
ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
# resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: True
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_smartglasses_val
  experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_smartglasses_val.json
  ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
      # resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: False
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_yt1b_test
  experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_test.json
  ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
      # resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: True
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_yt1b_test
  experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_test.json
  ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
      # resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: False
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_yt1b_val
  experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_val.json
  ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
      # resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: True
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null

View File

@@ -0,0 +1,174 @@
# @package _global_
defaults:
- _self_
# ============================================================================
# Paths Configuration (Change this to your own paths)
# ============================================================================
paths:
dump_file_name: saco_veval_yt1b_val
  experiment_log_dir: <YOUR_EXPERIMENT_LOG_DIR>
ytvis_json: <YOUR_GT_PATH>/saco_veval_yt1b_val.json
  ytvis_dir: <YOUR_VIDEO_JPG_DIR>
bpe_path: <BPE_PATH> # This should be under assets/bpe_simple_vocab_16e6.txt.gz
num_videos: null
# ============================================================================
# Different helper parameters and functions
# ============================================================================
scratch:
vid_mask_postprocessor:
_target_: sam3.eval.postprocessors.PostProcessNullOp
use_presence_eval: True
video_transforms_val:
- _target_: sam3.train.transforms.basic_for_api.ComposeAPI
transforms:
- _target_: sam3.train.transforms.segmentation.DecodeRle
      # resize the image to a square of side ${scratch.resolution} (1008x1008)
- _target_: sam3.train.transforms.basic_for_api.RandomResizeAPI
sizes: ${scratch.resolution} # originally `resolution: 1024`
square: true
consistent_transform: true
- _target_: sam3.train.transforms.basic_for_api.ToTensorAPI
- _target_: sam3.train.transforms.basic_for_api.NormalizeAPI
mean: ${scratch.val_norm_mean}
std: ${scratch.val_norm_std}
# Model parameters
d_model: 256
# Image processing parameters
resolution: 1008
# Normalization parameters
train_norm_mean: [0.5, 0.5, 0.5]
train_norm_std: [0.5, 0.5, 0.5]
val_norm_mean: [0.5, 0.5, 0.5]
val_norm_std: [0.5, 0.5, 0.5]
val_batch_size: 1
num_val_workers: 0
max_data_epochs: 20
hybrid_repeats: 1
gather_pred_via_filesys: false
# ============================================================================
# Trainer Configuration
# ============================================================================
trainer:
_target_: sam3.train.trainer.Trainer
skip_saving_ckpts: true
empty_gpu_mem_cache_after_eval: True
skip_first_val: True
max_epochs: ${scratch.max_data_epochs}
accelerator: cuda
seed_value: 123
val_epoch_freq: 10
mode: val
distributed:
backend: nccl
find_unused_parameters: True
gradient_as_bucket_view: True
loss:
all:
_target_: sam3.train.loss.sam3_loss.DummyLoss
default:
_target_: sam3.train.loss.sam3_loss.DummyLoss
data:
train: null
val:
_target_: sam3.train.data.torch_dataset.TorchDataset
dataset:
_target_: sam3.train.data.sam3_video_dataset.VideoGroundingDataset
limit_ids: ${paths.num_videos}
img_folder: ${paths.ytvis_dir}
ann_file: ${paths.ytvis_json}
coco_json_loader:
_target_: sam3.train.data.coco_json_loaders.SAM3_VEVAL_API_FROM_JSON_NP
_partial_: true
transforms: ${scratch.video_transforms_val}
max_ann_per_img: 100000 # filtered in transforms
max_val_queries: 100000
multiplier: 1
load_segmentation: true
training: false
shuffle: False
batch_size: ${scratch.val_batch_size}
num_workers: ${scratch.num_val_workers}
pin_memory: True
drop_last: False
collate_fn:
_target_: sam3.train.data.collator.collate_fn_api
_partial_: true
repeats: ${scratch.hybrid_repeats}
dict_key: ytvis_val
with_seg_masks: true
model:
_target_: sam3.model_builder.build_sam3_video_model
bpe_path: ${paths.bpe_path}
has_presence_token: True
geo_encoder_use_img_cross_attn: True
apply_temporal_disambiguation: False
meters:
val:
ytvis_val:
pred_file: # key
_target_: sam3.eval.ytvis_eval.YTVISResultsWriter
dump_file: ${launcher.experiment_log_dir}/preds/${paths.dump_file_name}.json
postprocessor: ${scratch.vid_mask_postprocessor}
gather_pred_via_filesys: ${scratch.gather_pred_via_filesys}
optim:
amp:
enabled: True
amp_dtype: bfloat16
checkpoint:
save_dir: ${launcher.experiment_log_dir}/checkpoints
save_freq: 0 # 0 only last checkpoint is saved.
logging:
tensorboard_writer:
_target_: sam3.train.utils.logger.make_tensorboard_logger
log_dir: ${launcher.experiment_log_dir}/tensorboard
flush_secs: 120
should_log: True
wandb_writer: null
log_dir: ${launcher.experiment_log_dir}/logs/
log_freq: 10
# ============================================================================
# Launcher and Submitit Configuration
# ============================================================================
launcher:
num_nodes: 8
gpus_per_node: 8
experiment_log_dir: ${paths.experiment_log_dir}
multiprocessing_context: forkserver
submitit:
account: null
partition: null
qos: null
timeout_hour: 72
use_cluster: True
cpus_per_task: 10
port_range: [10000, 65000]
constraint: null