Initial commit

fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
2025-11-18 23:07:42 -08:00
commit a13e358df4
504 changed files with 122758 additions and 0 deletions
--- a/examples/sam3_for_sam2_video_task_example.ipynb
+++ b/examples/sam3_for_sam2_video_task_example.ipynb
@@ -0,0 +1,979 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c3b1c46-9f5c-41c1-9101-85db8709ec0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright (c) Meta Platforms, Inc. and affiliates."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6e7a0db5-7f04-4845-8b11-684fe6e9f7f2",
+   "metadata": {},
+   "source": [
+    "# Video object segmentation with SAM 3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "162d0b3c-4207-442d-969c-aa1cbb8fd4ad",
+   "metadata": {},
+   "source": [
+    "This notebook shows how to use SAM 3 for video object segmentation, illustrating the use of the `Sam3TrackerPredictor` class.\n",
+    "\n",
+    "\n",
+    "This notebook follows the SAM 2 API for interactive video segmentation.\n",
+    "\n",
+    "<a target=\"_blank\" href=\"https://colab.research.google.com/github/facebookresearch/sam3/blob/main/examples/sam3_for_sam2_video_task_example.ipynb\">\n",
+    "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+    "</a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "26616201-06df-435b-98fd-ad17c373bb4a",
+   "metadata": {},
+   "source": [
+    "## Environment Set-up"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8491a127-4c01-48f5-9dc5-f148a9417fdf",
+   "metadata": {},
+   "source": [
+    "First install `sam3` in your environment using the [installation instructions](https://github.com/facebookresearch/sam3?tab=readme-ov-file#installation) in the repository."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f74c53be-aab1-46b9-8c0b-068b52ef5948",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "using_colab = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d824a4b2-71f3-4da3-bfc7-3249625e6730",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if using_colab:\n",
+    "    import torch\n",
+    "    import torchvision\n",
+    "    print(\"PyTorch version:\", torch.__version__)\n",
+    "    print(\"Torchvision version:\", torchvision.__version__)\n",
+    "    print(\"CUDA is available:\", torch.cuda.is_available())\n",
+    "    import sys\n",
+    "    !{sys.executable} -m pip install opencv-python matplotlib scikit-learn\n",
+    "    !{sys.executable} -m pip install 'git+https://github.com/facebookresearch/sam3.git'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "22e6aa9d-487f-4207-b657-8cff0902343e",
+   "metadata": {},
+   "source": [
+    "## Set-up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3cae821",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "# select the device for computation\n",
+    "if torch.cuda.is_available():\n",
+    "    device = torch.device(\"cuda\")\n",
+    "elif torch.backends.mps.is_available():\n",
+    "    device = torch.device(\"mps\")\n",
+    "else:\n",
+    "    device = torch.device(\"cpu\")\n",
+    "print(f\"using device: {device}\")\n",
+    "\n",
+    "if device.type == \"cuda\":\n",
+    "    # use bfloat16 for the entire notebook\n",
+    "    torch.autocast(\"cuda\", dtype=torch.bfloat16).__enter__()\n",
+    "    # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)\n",
+    "    if torch.cuda.get_device_properties(0).major >= 8:\n",
+    "        torch.backends.cuda.matmul.allow_tf32 = True\n",
+    "        torch.backends.cudnn.allow_tf32 = True\n",
+    "\n",
+    "elif device.type == \"mps\":\n",
+    "    print(\n",
+    "        \"\\nSupport for MPS devices is preliminary. SAM 3 is trained with CUDA and might \"\n",
+    "        \"give numerically different outputs and sometimes degraded performance on MPS. \"\n",
+    "        \"See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion.\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5318a85-5bf7-4880-b2b3-15e4db24d796",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "import os\n",
+    "\n",
+    "import cv2\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "import sam3\n",
+    "import torch\n",
+    "from PIL import Image\n",
+    "from sam3.visualization_utils import show_box, show_mask, show_points\n",
+    "\n",
+    "# font size for axes titles\n",
+    "plt.rcParams[\"axes.titlesize\"] = 12\n",
+    "plt.rcParams[\"figure.titlesize\"] = 12\n",
+    "\n",
+    "sam3_root = os.path.join(os.path.dirname(sam3.__file__), \"..\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae8e0779-751f-4224-9b04-ed0f0b406500",
+   "metadata": {},
+   "source": [
+    "### Loading the SAM 3 tracking predictor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5f3245e-b4d6-418b-a42a-a67e0b3b5aec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sam3.model_builder import build_sam3_video_model\n",
+    "\n",
+    "sam3_model = build_sam3_video_model()\n",
+    "predictor = sam3_model.tracker\n",
+    "predictor.backbone = sam3_model.detector.backbone"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dff46b10-c17a-4a26-8004-8c6d80806b0a",
+   "metadata": {},
+   "source": [
+    "#### Initialize the inference state"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f594ac71-a6b9-461d-af27-500fa1d1a420",
+   "metadata": {},
+   "source": [
+    "Just like SAM 2, SAM 3 requires stateful inference for interactive video segmentation, so we need to initialize an **inference state** on this video.\n",
+    "\n",
+    "During initialization, it loads all the frames of the video at `video_path` and stores their pixels in `inference_state` (as shown in the progress bar below)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9baa05c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "video_path = f\"{sam3_root}/assets/videos/bedroom.mp4\"\n",
+    "inference_state = predictor.init_state(video_path=video_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "edb1f3f6-d74d-4016-934c-8d2a14d1a543",
+   "metadata": {},
+   "source": [
+    "### Example 1: Segment & track one object"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa2d3127-67b2-45d2-9f32-8fe3e10dc5eb",
+   "metadata": {},
+   "source": [
+    "Note: if you have run any previous tracking using this `inference_state`, please reset it first via `clear_all_points_in_video`.\n",
+    "\n",
+    "(The cell below is just for illustration; it's not needed to call `clear_all_points_in_video` here as this `inference_state` is just freshly initialized above.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2646a1d-3401-438c-a653-55e0e56b7d9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictor.clear_all_points_in_video(inference_state)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "26aeb04d-8cba-4f57-95da-6e5a1796003e",
+   "metadata": {},
+   "source": [
+    "#### Step 1: Add a first click on a frame"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "695c7749-b523-4691-aad0-7558c5d1d68c",
+   "metadata": {},
+   "source": [
+    "To get started, let's try to segment the child on the left.\n",
+    "\n",
+    "Here we make a **positive click** at (x, y) = (210, 350) with label `1`, by sending their coordinates and labels into the `add_new_points` API.\n",
+    "\n",
+    "Note: label `1` indicates a *positive click (to add a region)* while label `0` indicates a *negative click (to remove a region)*."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd6778a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load the frames for visualization\n",
+    "cap = cv2.VideoCapture(video_path)\n",
+    "if not cap.isOpened():\n",
+    "    raise RuntimeError(f\"could not open video: {video_path}\")\n",
+    "video_frames_for_vis = []\n",
+    "try:\n",
+    "    while True:\n",
+    "        ret, frame = cap.read()\n",
+    "        if not ret:\n",
+    "            break\n",
+    "        # frames are converted from BGR to RGB before being shown with matplotlib\n",
+    "        video_frames_for_vis.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
+    "finally:\n",
+    "    # always release the capture handle, even if decoding fails midway\n",
+    "    cap.release()\n",
+    "if not video_frames_for_vis:\n",
+    "    raise RuntimeError(f\"no frames could be decoded from: {video_path}\")\n",
+    "frame0 = video_frames_for_vis[0]\n",
+    "\n",
+    "# frame size in pixels; later cells divide click/box coordinates by these values\n",
+    "width, height = frame0.shape[1], frame0.shape[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e749bab-0f36-4173-bf8d-0c20cd5214b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ann_frame_idx = 0  # the frame index we interact with\n",
+    "ann_obj_id = 1  # give a unique id to each object we interact with (it can be any integers)\n",
+    "\n",
+    "# Let's add a positive click at (x, y) = (210, 350) to get started\n",
+    "points = np.array([[210, 350]], dtype=np.float32)\n",
+    "# for labels, `1` means positive click and `0` means negative click\n",
+    "labels = np.array([1], np.int32)\n",
+    "\n",
+    "rel_points = [[x / width, y / height] for x, y in points]\n",
+    "\n",
+    "points_tensor = torch.tensor(rel_points, dtype=torch.float32)\n",
+    "points_labels_tensor = torch.tensor(labels, dtype=torch.int32)\n",
+    "\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks = predictor.add_new_points(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    points=points_tensor,\n",
+    "    labels=points_labels_tensor,\n",
+    "    clear_old_points=False,\n",
+    ")\n",
+    "\n",
+    "# show the results on the current (interacted) frame\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx}\")\n",
+    "plt.imshow(frame0)\n",
+    "show_points(points, labels, plt.gca())\n",
+    "show_mask((video_res_masks[0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_ids[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "89457875-93fa-40ed-b6dc-4e1c971a27f9",
+   "metadata": {},
+   "source": [
+    "#### Step 2: Add a second click to refine the prediction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a75eb21b-1413-452c-827b-a04093c30c78",
+   "metadata": {},
+   "source": [
+    "Hmm, it seems that although we wanted to segment the child on the left, the model predicts the mask for only the shorts -- this can happen since there is ambiguity from a single click about what the target object should be. We can refine the mask on this frame via another positive click on the child's shirt.\n",
+    "\n",
+    "Here we make a **second positive click** at (x, y) = (250, 220) with label `1` to expand the mask.\n",
+    "\n",
+    "Note: we need to send **all the clicks and their labels** (i.e. not just the last click) when calling `add_new_points`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1ab3ec7-2537-4158-bf98-3d0977d8908d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ann_frame_idx = 0  # the frame index we interact with\n",
+    "ann_obj_id = 1  # give a unique id to each object we interact with (it can be any integer)\n",
+    "\n",
+    "# Let's add a 2nd positive click at (x, y) = (250, 220) to refine the mask\n",
+    "# sending all clicks (and their labels) to `add_new_points`\n",
+    "points = np.array([[210, 350], [250, 220]], dtype=np.float32)\n",
+    "# for labels, `1` means positive click and `0` means negative click\n",
+    "labels = np.array([1, 1], np.int32)\n",
+    "\n",
+    "# convert pixel coordinates to coordinates relative to the frame size\n",
+    "rel_points = [[x / width, y / height] for x, y in points]\n",
+    "\n",
+    "points_tensor = torch.tensor(rel_points, dtype=torch.float32)\n",
+    "points_labels_tensor = torch.tensor(labels, dtype=torch.int32)\n",
+    "\n",
+    "# NOTE(review): the first click is re-sent here with `clear_old_points=False`;\n",
+    "# confirm that this does not register the (210, 350) click twice.\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks = predictor.add_new_points(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    points=points_tensor,\n",
+    "    labels=points_labels_tensor,\n",
+    "    clear_old_points=False,\n",
+    ")\n",
+    "\n",
+    "# show the results on the current (interacted) frame\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx}\")\n",
+    "plt.imshow(frame0)\n",
+    "show_points(points, labels, plt.gca())\n",
+    "show_mask((video_res_masks[0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_ids[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df4ab457-d91d-4ac8-b350-fbcd549fd3fd",
+   "metadata": {},
+   "source": [
+    "With this 2nd refinement click, now we get a segmentation mask of the entire child on frame 0."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f52015ac-1b7b-4c59-bca3-c2b28484cf46",
+   "metadata": {},
+   "source": [
+    "#### Step 3: Propagate the prompts to get the masklet across the video"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "30b025bd-cd58-4bfb-9572-c8d2fd0a02ef",
+   "metadata": {},
+   "source": [
+    "To get the masklet throughout the entire video, we propagate the prompts using the `propagate_in_video` API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab45e932-b0d5-4983-9718-6ee77d1ac31b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run propagation throughout the video and collect the results in a dict\n",
+    "video_segments = {}  # maps frame index -> {object id: boolean mask array}\n",
+    "for frame_idx, obj_ids, low_res_masks, video_res_masks, obj_scores in predictor.propagate_in_video(\n",
+    "    inference_state,\n",
+    "    start_frame_idx=0,\n",
+    "    max_frame_num_to_track=240,\n",
+    "    reverse=False,\n",
+    "    propagate_preflight=True,\n",
+    "):\n",
+    "    # use the object ids yielded for this frame, not `out_obj_ids` left over from an earlier cell\n",
+    "    video_segments[frame_idx] = {\n",
+    "        out_obj_id: (video_res_masks[i] > 0.0).cpu().numpy()\n",
+    "        for i, out_obj_id in enumerate(obj_ids)\n",
+    "    }\n",
+    "\n",
+    "# render the segmentation results every few frames\n",
+    "vis_frame_stride = 30\n",
+    "plt.close(\"all\")\n",
+    "for out_frame_idx in range(0, len(video_frames_for_vis), vis_frame_stride):\n",
+    "    plt.figure(figsize=(6, 4))\n",
+    "    plt.title(f\"frame {out_frame_idx}\")\n",
+    "    plt.imshow(video_frames_for_vis[out_frame_idx])\n",
+    "    for out_obj_id, out_mask in video_segments[out_frame_idx].items():\n",
+    "        show_mask(out_mask, plt.gca(), obj_id=out_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e801b70-72df-4a72-b3fe-84f145e5e3f6",
+   "metadata": {},
+   "source": [
+    "#### Step 4: Add new prompts to further refine the masklet"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "478958ab-29b4-4a75-bba4-adb1b03d0a2b",
+   "metadata": {},
+   "source": [
+    "It appears that in the output masklet above, there are some small imperfections in boundary details on frame 150.\n",
+    "\n",
+    "With SAM 3 we can fix the model predictions interactively. We can add a **negative click** at (x, y) = (82, 410) on this frame with label `0` to refine the masklet. Here we call the `add_new_points` API with a different `frame_idx` argument to indicate the frame index we want to refine."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a572ea9-5b7e-479c-b30c-93c38b121131",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ann_frame_idx = 150  # further refine some details on this frame\n",
+    "ann_obj_id = 1  # give a unique id to the object we interact with (it can be any integer)\n",
+    "\n",
+    "# show the segment before further refinement\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx} -- before refinement\")\n",
+    "plt.imshow(video_frames_for_vis[ann_frame_idx])\n",
+    "show_mask(video_segments[ann_frame_idx][ann_obj_id], plt.gca(), obj_id=ann_obj_id)\n",
+    "\n",
+    "# Let's add a negative click on this frame at (x, y) = (82, 410) to refine the segment\n",
+    "points = np.array([[82, 410]], dtype=np.float32)\n",
+    "# for labels, `1` means positive click and `0` means negative click\n",
+    "labels = np.array([0], np.int32)\n",
+    "\n",
+    "# convert pixel coordinates to coordinates relative to the frame size\n",
+    "rel_points = [[x / width, y / height] for x, y in points]\n",
+    "\n",
+    "points_tensor = torch.tensor(rel_points, dtype=torch.float32)\n",
+    "points_labels_tensor = torch.tensor(labels, dtype=torch.int32)\n",
+    "\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks = predictor.add_new_points(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    points=points_tensor,\n",
+    "    labels=points_labels_tensor,\n",
+    "    clear_old_points=False,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# show the segment after the further refinement\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx} -- after refinement\")\n",
+    "plt.imshow(video_frames_for_vis[ann_frame_idx])\n",
+    "show_points(points, labels, plt.gca())\n",
+    "# index the first returned object, as the other single-object cells do\n",
+    "show_mask((video_res_masks[0] > 0.0).cpu().numpy(), plt.gca(), obj_id=ann_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "50a3950a-acf1-435c-bd64-94297267b5e9",
+   "metadata": {},
+   "source": [
+    "#### Step 5: Propagate the prompts (again) to get the masklet across the video"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b1954ecf-c2ec-4f9c-8d10-c4f527a10cd2",
+   "metadata": {},
+   "source": [
+    "Let's get an updated masklet for the entire video. Here we call `propagate_in_video` again to propagate all the prompts after adding the new refinement click above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "baa96690-4a38-4a24-aa17-fd2f4db0e232",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run propagation throughout the video and collect the results in a dict\n",
+    "video_segments = {}  # maps frame index -> {object id: boolean mask array}\n",
+    "for frame_idx, obj_ids, low_res_masks, video_res_masks, obj_scores in predictor.propagate_in_video(\n",
+    "    inference_state,\n",
+    "    start_frame_idx=0,\n",
+    "    max_frame_num_to_track=300,\n",
+    "    reverse=False,\n",
+    "    propagate_preflight=True,\n",
+    "):\n",
+    "    # use the object ids yielded for this frame, not `out_obj_ids` left over from an earlier cell\n",
+    "    video_segments[frame_idx] = {\n",
+    "        out_obj_id: (video_res_masks[i] > 0.0).cpu().numpy()\n",
+    "        for i, out_obj_id in enumerate(obj_ids)\n",
+    "    }\n",
+    "\n",
+    "# render the segmentation results every few frames\n",
+    "vis_frame_stride = 30\n",
+    "plt.close(\"all\")\n",
+    "for out_frame_idx in range(0, len(video_frames_for_vis), vis_frame_stride):\n",
+    "    plt.figure(figsize=(6, 4))\n",
+    "    plt.title(f\"frame {out_frame_idx}\")\n",
+    "    plt.imshow(video_frames_for_vis[out_frame_idx])\n",
+    "    for out_obj_id, out_mask in video_segments[out_frame_idx].items():\n",
+    "        show_mask(out_mask, plt.gca(), obj_id=out_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "607507e3-6a2b-4fd7-944c-2371bdab9d01",
+   "metadata": {},
+   "source": [
+    "The segments now look good on all frames."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2502bb5a-3e1f-43d0-9f58-33f8676fff0d",
+   "metadata": {},
+   "source": [
+    "### Example 2: Segment an object using box prompt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e2d26c8-0432-48c6-997e-4a3b77bb5f6d",
+   "metadata": {},
+   "source": [
+    "Note: if you have run any previous tracking using this `inference_state`, please reset it first via `clear_all_points_in_video`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6dbe9183-abbb-4283-b0cb-d24f3d7beb34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictor.clear_all_points_in_video(inference_state)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ceb6eae9-0f4c-434f-8089-a46c9ca59da5",
+   "metadata": {},
+   "source": [
+    "In addition to using clicks as inputs, SAM 3 also supports segmenting and tracking objects in a video via **bounding boxes**.\n",
+    "\n",
+    "In the example below, we segment the child on the right using a **box prompt** of (x_min, y_min, x_max, y_max) = (300, 0, 500, 400) on frame 0 as input into the `add_new_points_or_box` API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cbfb273-4e14-495b-bd89-87a8baf52ae7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ann_frame_idx = 0  # the frame index we interact with\n",
+    "ann_obj_id = 4  # give a unique id to each object we interact with (it can be any integers)\n",
+    "\n",
+    "# Let's add a box at (x_min, y_min, x_max, y_max) = (300, 0, 500, 400) to get started\n",
+    "box = np.array([[300, 0, 500, 400]], dtype=np.float32)\n",
+    "\n",
+    "rel_box = [[xmin / width, ymin / height, xmax / width, ymax / height] for xmin, ymin, xmax, ymax in box]\n",
+    "rel_box = np.array(rel_box, dtype=np.float32)\n",
+    "\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks  = predictor.add_new_points_or_box(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    box=rel_box,\n",
+    ")\n",
+    "\n",
+    "# show the results on the current (interacted) frame\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx}\")\n",
+    "plt.imshow(video_frames_for_vis[ann_frame_idx])\n",
+    "show_box(box[0], plt.gca())\n",
+    "show_mask((video_res_masks[0] > 0.0).cpu().numpy(), plt.gca(), obj_id=ann_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bd3f9ba7-bf4d-47e5-9b02-8a424cab42cc",
+   "metadata": {},
+   "source": [
+    "Here, SAM 3 gets a pretty good segmentation mask of the entire child, even though the input bounding box is not perfectly tight around the object.\n",
+    "\n",
+    "Similar to the previous example, if the returned mask is not perfect when using a box prompt, we can also further **refine** the output using positive or negative clicks. To illustrate this, here we make a **positive click** at (x, y) = (460, 60) with label `1` to expand the segment around the child's hair.\n",
+    "\n",
+    "Note: to refine the segmentation mask from a box prompt, we need to send **both the original box input and all subsequent refinement clicks and their labels** when calling `add_new_points_or_box`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54906315-ab4c-4088-b866-4c22134d5b66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ann_frame_idx = 0  # the frame index we interact with\n",
+    "ann_obj_id = 4  # give a unique id to each object we interact with (it can be any integers)\n",
+    "\n",
+    "# Let's add a positive click at (x, y) = (460, 60) to refine the mask\n",
+    "points = np.array([[460, 60]], dtype=np.float32)\n",
+    "# for labels, `1` means positive click and `0` means negative click\n",
+    "labels = np.array([1], np.int32)\n",
+    "# note that we also need to send the original box input along with\n",
+    "# the new refinement click together into `add_new_points_or_box`\n",
+    "box = np.array([[300, 0, 500, 400]], dtype=np.float32)\n",
+    "\n",
+    "rel_box = [[xmin / width, ymin / height, xmax / width, ymax / height] for xmin, ymin, xmax, ymax in box]\n",
+    "rel_box = np.array(rel_box, dtype=np.float32)\n",
+    "\n",
+    "rel_points = [[x / width, y / height] for x, y in points]\n",
+    "\n",
+    "points_tensor = torch.tensor(rel_points, dtype=torch.float32)\n",
+    "points_labels_tensor = torch.tensor(labels, dtype=torch.int32)\n",
+    "\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks  = predictor.add_new_points_or_box(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    points=points_tensor,\n",
+    "    labels=points_labels_tensor,\n",
+    "    box=rel_box,\n",
+    ")\n",
+    "\n",
+    "# show the results on the current (interacted) frame\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx}\")\n",
+    "plt.imshow(video_frames_for_vis[ann_frame_idx])\n",
+    "show_box(box[0], plt.gca())\n",
+    "show_points(points, labels, plt.gca())\n",
+    "show_mask((video_res_masks[0][0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_ids[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "73128cd6-dbfa-49f7-8d79-1a8e19835f7f",
+   "metadata": {},
+   "source": [
+    "Then, to get the masklet throughout the entire video, we propagate the prompts using the `propagate_in_video` API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9cd90557-a0dc-442e-b091-9c74c831bef8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run propagation throughout the video and collect the results in a dict\n",
+    "video_segments = {}  # maps frame index -> {object id: boolean mask array}\n",
+    "for frame_idx, obj_ids, low_res_masks, video_res_masks, obj_scores in predictor.propagate_in_video(\n",
+    "    inference_state,\n",
+    "    start_frame_idx=0,\n",
+    "    max_frame_num_to_track=300,\n",
+    "    reverse=False,\n",
+    "    propagate_preflight=True,\n",
+    "):\n",
+    "    # use the object ids yielded for this frame, not `out_obj_ids` left over from an earlier cell\n",
+    "    video_segments[frame_idx] = {\n",
+    "        out_obj_id: (video_res_masks[i] > 0.0).cpu().numpy()\n",
+    "        for i, out_obj_id in enumerate(obj_ids)\n",
+    "    }\n",
+    "\n",
+    "# render the segmentation results every few frames\n",
+    "vis_frame_stride = 30\n",
+    "plt.close(\"all\")\n",
+    "for out_frame_idx in range(0, len(video_frames_for_vis), vis_frame_stride):\n",
+    "    plt.figure(figsize=(6, 4))\n",
+    "    plt.title(f\"frame {out_frame_idx}\")\n",
+    "    plt.imshow(video_frames_for_vis[out_frame_idx])\n",
+    "    for out_obj_id, out_mask in video_segments[out_frame_idx].items():\n",
+    "        show_mask(out_mask, plt.gca(), obj_id=out_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e023f91f-0cc5-4980-ae8e-a13c5749112b",
+   "metadata": {},
+   "source": [
+    "Note that in addition to clicks or boxes, SAM 3 also supports directly using a **mask prompt** as input via the `add_new_mask` method in the `Sam3TrackerPredictor` class. This can be helpful in e.g. semi-supervised VOS evaluations (see [tools/vos_inference.py](https://github.com/facebookresearch/sam2/blob/main/tools/vos_inference.py) for an example)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da018be8-a4ae-4943-b1ff-702c2b89cb68",
+   "metadata": {},
+   "source": [
+    "### Example 3: Segment multiple objects simultaneously"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dea6c04c-3072-4876-b394-879321a48c4a",
+   "metadata": {},
+   "source": [
+    "Note: if you have run any previous tracking using this `inference_state`, please reset it first via `clear_all_points_in_video`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29b874c8-9f39-42d3-a667-54a0bd696410",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictor.clear_all_points_in_video(inference_state)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "48f3f7e6-4821-468c-84e4-f3a0435c9149",
+   "metadata": {},
+   "source": [
+    "#### Step 1: Add two objects on a frame"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95158714-86d7-48a9-8365-b213f97cc9ca",
+   "metadata": {},
+   "source": [
+    "SAM 3 can also segment and track two or more objects at the same time. One way, of course, is to do them one by one. However, it would be more efficient to batch them together (e.g. so that we can share the image features between objects to reduce computation costs).\n",
+    "\n",
+    "This time, let's focus on object parts and segment **the shirts of both children** in this video. Here we add prompts for these two objects and assign each of them a unique object id."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e22d896d-3cd5-4fa0-9230-f33e217035dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = {}  # hold all the clicks we add for visualization"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "59d9ac57-b14a-4237-828d-927e422c518b",
+   "metadata": {},
+   "source": [
+    "Add the first object (the left child's shirt) with a **positive click** at (x, y) = (200, 300) on frame 0.\n",
+    "\n",
+    "We assign it to object id `2` (it can be arbitrary integers, and only needs to be unique for each object to track), which is passed to the `add_new_points_or_box` API to distinguish the object we are clicking upon."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d13432fc-f467-44d8-adfe-3e0c488046b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ann_frame_idx = 0  # the frame index we interact with\n",
+    "ann_obj_id = 2  # give a unique id to each object we interact with (it can be any integer)\n",
+    "\n",
+    "# Let's add a positive click at (x, y) = (200, 300) to get started on the first object\n",
+    "points = np.array([[200, 300]], dtype=np.float32)\n",
+    "# for labels, `1` means positive click and `0` means negative click\n",
+    "labels = np.array([1], np.int32)\n",
+    "# remember this object's clicks (in pixel coordinates) for visualization\n",
+    "prompts[ann_obj_id] = points, labels\n",
+    "\n",
+    "# convert pixel coordinates to coordinates relative to the frame size\n",
+    "rel_points = [[x / width, y / height] for x, y in points]\n",
+    "points_tensor = torch.tensor(rel_points, dtype=torch.float32)\n",
+    "points_labels_tensor = torch.tensor(labels, dtype=torch.int32)\n",
+    "\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks = predictor.add_new_points_or_box(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    points=points_tensor,\n",
+    "    labels=points_labels_tensor,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# show the results on the current (interacted) frame\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx}\")\n",
+    "plt.imshow(video_frames_for_vis[ann_frame_idx])\n",
+    "for i, out_obj_id in enumerate(out_obj_ids):\n",
+    "    # draw each returned object's stored clicks once\n",
+    "    show_points(*prompts[out_obj_id], plt.gca())\n",
+    "    show_mask((video_res_masks[i][0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1bbbd51b-e1e2-4c36-99ec-1d9a1b49b0cd",
+   "metadata": {},
+   "source": [
+    "Hmm, this time we just want to select the child's shirt, but the model predicts the mask for the entire child. Let's refine the prediction with a **negative click** at (x, y) = (275, 175)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95ecf61d-662b-4f98-ae62-46557b219842",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add the first object\n",
+    "ann_frame_idx = 0  # the frame index we interact with\n",
+    "ann_obj_id = 2  # give a unique id to each object we interact with (it can be any integer)\n",
+    "\n",
+    "# Let's add a 2nd negative click at (x, y) = (275, 175) to refine the first object\n",
+    "# sending all clicks (and their labels) to `add_new_points_or_box`\n",
+    "points = np.array([[200, 300], [275, 175]], dtype=np.float32)\n",
+    "# for labels, `1` means positive click and `0` means negative click\n",
+    "labels = np.array([1, 0], np.int32)\n",
+    "# remember this object's clicks (in pixel coordinates) for visualization\n",
+    "prompts[ann_obj_id] = points, labels\n",
+    "\n",
+    "# convert pixel coordinates to coordinates relative to the frame size\n",
+    "rel_points = [[x / width, y / height] for x, y in points]\n",
+    "points_tensor = torch.tensor(rel_points, dtype=torch.float32)\n",
+    "points_labels_tensor = torch.tensor(labels, dtype=torch.int32)\n",
+    "\n",
+    "\n",
+    "# pass the float32 tensor (not the intermediate Python list), as the other cells do\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks = predictor.add_new_points_or_box(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    points=points_tensor,\n",
+    "    labels=points_labels_tensor,\n",
+    ")\n",
+    "\n",
+    "# show the results on the current (interacted) frame\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx}\")\n",
+    "plt.imshow(video_frames_for_vis[ann_frame_idx])\n",
+    "for i, out_obj_id in enumerate(out_obj_ids):\n",
+    "    # draw each returned object's stored clicks once\n",
+    "    show_points(*prompts[out_obj_id], plt.gca())\n",
+    "    show_mask((video_res_masks[i][0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "194718c1-734d-446c-a3ef-361057de2f31",
+   "metadata": {},
+   "source": [
+    "After the 2nd click (a negative one), we now get the left child's shirt as our first object.\n",
+    "\n",
+    "Let's move on to the second object (the right child's shirt) with a positive click at (x, y) = (400, 150) on frame 0. Here we assign object id `3` to this second object (it can be any integer; it only needs to be unique for each object to track).\n",
+    "\n",
+    "Note: when there are multiple objects, the `add_new_points_or_box` API returns the masks for all objects added so far on the interacted frame, one mask per object."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86ca1bde-62a4-40e6-98e4-15606441e52f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add the second object (object id 3) on the same frame\n",
+    "ann_frame_idx = 0  # the frame index we interact with\n",
+    "ann_obj_id = 3  # give a unique id to each object we interact with (it can be any integer)\n",
+    "\n",
+    "# Let's now move on to the second object we want to track (giving it object id `3`)\n",
+    "# with a positive click at (x, y) = (400, 150)\n",
+    "points = np.array([[400, 150]], dtype=np.float32)\n",
+    "# for labels, `1` means positive click and `0` means negative click\n",
+    "labels = np.array([1], np.int32)\n",
+    "prompts[ann_obj_id] = points, labels\n",
+    "\n",
+    "# divide the pixel coordinates by `width` / `height` (defined earlier) to get relative coordinates\n",
+    "rel_points = [[x / width, y / height] for x, y in points]\n",
+    "points_tensor = torch.tensor(rel_points, dtype=torch.float32)\n",
+    "points_labels_tensor = torch.tensor(labels, dtype=torch.int32)\n",
+    "\n",
+    "\n",
+    "# `add_new_points_or_box` returns masks for all objects added so far on this interacted frame\n",
+    "_, out_obj_ids, low_res_masks, video_res_masks = predictor.add_new_points_or_box(\n",
+    "    inference_state=inference_state,\n",
+    "    frame_idx=ann_frame_idx,\n",
+    "    obj_id=ann_obj_id,\n",
+    "    points=points_tensor,\n",
+    "    labels=points_labels_tensor,\n",
+    ")\n",
+    "\n",
+    "# show the results on the current (interacted) frame on all objects\n",
+    "plt.figure(figsize=(9, 6))\n",
+    "plt.title(f\"frame {ann_frame_idx}\")\n",
+    "plt.imshow(video_frames_for_vis[ann_frame_idx])\n",
+    "# draw the clicks of this interaction once, then the stored clicks and the mask of every returned object\n",
+    "show_points(points, labels, plt.gca())\n",
+    "for i, out_obj_id in enumerate(out_obj_ids):\n",
+    "    show_points(*prompts[out_obj_id], plt.gca())\n",
+    "    show_mask((video_res_masks[i][0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1f7add8-d577-4597-ae2f-654b8c7b05e0",
+   "metadata": {},
+   "source": [
+    "This time the model predicts the mask of the shirt we want to track in just one click. Nice!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "448733b8-ea8b-4078-995f-b676c3b558ba",
+   "metadata": {},
+   "source": [
+    "#### Step 2: Propagate the prompts to get masklets across the video"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "60bd73de-d669-41c8-b6ba-943883f0caa2",
+   "metadata": {},
+   "source": [
+    "Now, we propagate the prompts for both objects to get their masklets throughout the video.\n",
+    "\n",
+    "Note: when there are multiple objects, the `propagate_in_video` API returns, for each frame, the masks for all objects (one mask per object)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17737191-d62b-4611-b2c6-6d0418a9ab74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run propagation throughout the video and collect the results in a dict\n",
+    "video_segments = {}  # video_segments contains the per-frame segmentation results\n",
+    "for frame_idx, obj_ids, low_res_masks, video_res_masks, obj_scores in predictor.propagate_in_video(\n",
+    "    inference_state,\n",
+    "    start_frame_idx=0,\n",
+    "    max_frame_num_to_track=300,\n",
+    "    reverse=False,\n",
+    "    propagate_preflight=True,\n",
+    "):\n",
+    "    # key the masks by the object ids yielded for this frame, not by a variable left over\n",
+    "    # from an earlier cell, so this cell does not depend on hidden kernel state\n",
+    "    video_segments[frame_idx] = {\n",
+    "        obj_id: (video_res_masks[i] > 0.0).cpu().numpy()\n",
+    "        for i, obj_id in enumerate(obj_ids)\n",
+    "    }\n",
+    "\n",
+    "# render the segmentation results every few frames\n",
+    "vis_frame_stride = 30\n",
+    "plt.close(\"all\")\n",
+    "for out_frame_idx in range(0, len(video_frames_for_vis), vis_frame_stride):\n",
+    "    # propagation is capped by `max_frame_num_to_track`, so skip frames that have no results\n",
+    "    if out_frame_idx not in video_segments:\n",
+    "        continue\n",
+    "    plt.figure(figsize=(6, 4))\n",
+    "    plt.title(f\"frame {out_frame_idx}\")\n",
+    "    plt.imshow(video_frames_for_vis[out_frame_idx])\n",
+    "    for out_obj_id, out_mask in video_segments[out_frame_idx].items():\n",
+    "        show_mask(out_mask, plt.gca(), obj_id=out_obj_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "18a0b9d7-c78f-432b-afb0-11f2ea5b652a",
+   "metadata": {},
+   "source": [
+    "Looks like both children's shirts are well segmented in this video.\n",
+    "\n",
+    "Now you can try SAM 3 on your own videos and use cases! "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}