From 1daff5eb927cb37a3fd93879173001e451ab1795 Mon Sep 17 00:00:00 2001
From: Haitham Khedr <haithamkhedr@meta.com>
Date: Mon, 24 Nov 2025 13:30:21 -0800
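Subject: [PATCH] Update README with arXiv bibtex

Un-comment the "Citing SAM 3" section and its BibTeX link in README.md and
replace the TODO placeholder with the arXiv BibTeX entry.

This patch also changes image normalization constants: the default
img_mean/img_std of load_video_frames, load_video_frames_from_jpg_images and
load_video_frames_from_video_file in sam3/model/utils/sam2_utils.py, and the
MEAN_IMG/STD_IMG constants of show_img_tensor in sam3/visualization_utils.py,
go from (0.485, 0.456, 0.406) / (0.229, 0.224, 0.225) to
(0.5, 0.5, 0.5) / (0.5, 0.5, 0.5).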

Reviewed By: jayleicn

Differential Revision:
D87813153

Privacy Context Container: L1256182

fbshipit-source-id: 9361ff55ebdb1ee78f694cb9c41b8bc83bf600fb
---
 README.md                      | 16 ++++++++++++----
 sam3/model/utils/sam2_utils.py | 12 ++++++------
 sam3/visualization_utils.py    |  4 ++--
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index b9d310b..669242d 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Meng Wang, [Peize Sun](https://peizesun.github.io/),
 [[`Project`](https://ai.meta.com/sam3)]
 [[`Demo`](https://segment-anything.com/)]
 [[`Blog`](https://ai.meta.com/blog/segment-anything-model-3/)]
-<!-- [[`BibTeX`](#citing-sam-3)] -->
+[[`BibTeX`](#citing-sam-3)]
 
 ![SAM 3 architecture](assets/model_diagram.png?raw=true) SAM 3 is a unified foundation model for promptable segmentation in images and videos. It can detect, segment, and track objects using text or visual prompts such as points, boxes, and masks. Compared to its predecessor [SAM 2](https://github.com/facebookresearch/sam2), SAM 3 introduces the ability to exhaustively segment all instances of an open-vocabulary concept specified by a short text phrase or exemplars. Unlike prior work, SAM 3 can handle a vastly larger set of open-vocabulary prompts. It achieves 75-80% of human performance on our new [SA-CO benchmark](https://github.com/facebookresearch/sam3?tab=readme-ov-file#sa-co-dataset) which contains 270K unique concepts, over 50 times more than existing benchmarks.
 
@@ -378,10 +378,18 @@ Nisha Deo, Peter Park, Phillip Thomas, Raghu Nayani, Rene Martinez Doehner, Robb
 Mitts, Shashank Jain, Spencer Whitehead, Ty Toledano, Valentin Gabeur, Vincent Cho, Vivian Lee, William Ngan,
 Xuehai He, Yael Yungster, Ziqi Pang, Ziyi Dou, Zoe Quake.
 
-<!-- ## Citing SAM 3
+## Citing SAM 3
 
 If you use SAM 3 or the SA-Co dataset in your research, please use the following BibTeX entry.
 
 ```bibtex
-TODO
-``` -->
+@misc{carion2025sam3segmentconcepts,
+      title={SAM 3: Segment Anything with Concepts},
+      author={Nicolas Carion and Laura Gustafson and Yuan-Ting Hu and Shoubhik Debnath and Ronghang Hu and Didac Suris and Chaitanya Ryali and Kalyan Vasudev Alwala and Haitham Khedr and Andrew Huang and Jie Lei and Tengyu Ma and Baishan Guo and Arpit Kalla and Markus Marks and Joseph Greer and Meng Wang and Peize Sun and Roman Rädle and Triantafyllos Afouras and Effrosyni Mavroudi and Katherine Xu and Tsung-Han Wu and Yu Zhou and Liliane Momeni and Rishi Hazra and Shuangrui Ding and Sagar Vaze and Francois Porcher and Feng Li and Siyuan Li and Aishwarya Kamath and Ho Kei Cheng and Piotr Dollár and Nikhila Ravi and Kate Saenko and Pengchuan Zhang and Christoph Feichtenhofer},
+      year={2025},
+      eprint={2511.16719},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2511.16719},
+}
+```
diff --git a/sam3/model/utils/sam2_utils.py b/sam3/model/utils/sam2_utils.py
index cc3d12e..d91ba0f 100644
--- a/sam3/model/utils/sam2_utils.py
+++ b/sam3/model/utils/sam2_utils.py
@@ -97,8 +97,8 @@ def load_video_frames(
     video_path,
     image_size,
     offload_video_to_cpu,
-    img_mean=(0.485, 0.456, 0.406),
-    img_std=(0.229, 0.224, 0.225),
+    img_mean=(0.5, 0.5, 0.5),
+    img_std=(0.5, 0.5, 0.5),
     async_loading_frames=False,
     compute_device=torch.device("cuda"),
 ):
@@ -138,8 +138,8 @@ def load_video_frames_from_jpg_images(
     video_path,
     image_size,
     offload_video_to_cpu,
-    img_mean=(0.485, 0.456, 0.406),
-    img_std=(0.229, 0.224, 0.225),
+    img_mean=(0.5, 0.5, 0.5),
+    img_std=(0.5, 0.5, 0.5),
     async_loading_frames=False,
     compute_device=torch.device("cuda"),
 ):
@@ -205,8 +205,8 @@ def load_video_frames_from_video_file(
     video_path,
     image_size,
     offload_video_to_cpu,
-    img_mean=(0.485, 0.456, 0.406),
-    img_std=(0.229, 0.224, 0.225),
+    img_mean=(0.5, 0.5, 0.5),
+    img_std=(0.5, 0.5, 0.5),
     compute_device=torch.device("cuda"),
 ):
     """Load the video frames from a video file."""
diff --git a/sam3/visualization_utils.py b/sam3/visualization_utils.py
index 090f086..73398f2 100644
--- a/sam3/visualization_utils.py
+++ b/sam3/visualization_utils.py
@@ -43,8 +43,8 @@ COLORS = generate_colors(n_colors=128, n_samples=5000)
 
 
 def show_img_tensor(img_batch, vis_img_idx=0):
-    MEAN_IMG = np.array([0.485, 0.456, 0.406])
-    STD_IMG = np.array([0.229, 0.224, 0.225])
+    MEAN_IMG = np.array([0.5, 0.5, 0.5])
+    STD_IMG = np.array([0.5, 0.5, 0.5])
     im_tensor = img_batch[vis_img_idx].detach().cpu()
     assert im_tensor.dim() == 3
     im_tensor = im_tensor.numpy().transpose((1, 2, 0))