sam3/examples/saco_veval_vis_example.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37048f21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright (c) Meta Platforms, Inc. and affiliates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "154d8663",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toggle for the setup cell below: when True, that cell prints the PyTorch/Torchvision\n",
    "# versions and pip-installs the packages this notebook needs.\n",
    "using_colab = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b85d99d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Environment setup; everything below runs only when `using_colab` is True.\n",
    "if using_colab:\n",
    "    import torch\n",
    "    import torchvision\n",
    "    # Report the installed library versions and whether CUDA is available\n",
    "    print(\"PyTorch version:\", torch.__version__)\n",
    "    print(\"Torchvision version:\", torchvision.__version__)\n",
    "    print(\"CUDA is available:\", torch.cuda.is_available())\n",
    "    import sys\n",
    "    # Invoke pip through `sys.executable` so packages are installed for the interpreter running this kernel\n",
    "    !{sys.executable} -m pip install opencv-python matplotlib scikit-learn\n",
    "    !{sys.executable} -m pip install 'git+https://github.com/facebookresearch/sam3.git'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da21a3bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from glob import glob\n",
    "\n",
    "import numpy as np\n",
    "import utils\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# Palette passed as `colors=` when drawing masks in the visualization cell.\n",
    "# The [1:] slice drops the first entry of the Pascal color map\n",
    "# (presumably the background color - TODO confirm against utils.pascal_color_map).\n",
    "COLORS = utils.pascal_color_map()[1:]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "57e85e7e",
   "metadata": {},
   "source": [
    "1. Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a796734e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare the data path\n",
    "DATA_DIR = \"./sam3_saco_veval_data\" # PUT YOUR DATA PATH HERE\n",
    "ANNOT_DIR = os.path.join(DATA_DIR, \"annotation\")\n",
    "\n",
    "# Collect the SACO/Veval annotation files; glob order is arbitrary, so sort for a deterministic load order\n",
    "annot_file_list = sorted(glob(os.path.join(ANNOT_DIR, \"*veval*.json\")))\n",
    "if not annot_file_list:\n",
    "    # Fail here with an actionable message rather than later, when a dataset is looked up in `annot_dfs`\n",
    "    raise FileNotFoundError(\n",
    "        f\"No '*veval*.json' annotation files found in {ANNOT_DIR!r}; check DATA_DIR.\"\n",
    "    )\n",
    "\n",
    "# Load the SACO/Veval annotation files\n",
    "annot_dfs = utils.get_annot_dfs(file_list=annot_file_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74bf92b1",
   "metadata": {},
   "source": [
    "Show the annotation files being loaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a95620ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keys of `annot_dfs`, i.e. the names under which the annotation files were loaded\n",
    "annot_dfs.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ce211d3",
   "metadata": {},
   "source": [
    "2. Examples of the data format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ba749db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entries available for one annotation set; the following cells preview\n",
    "# `info`, `videos`, `annotations`, `categories` and `video_np_pairs`\n",
    "annot_dfs[\"saco_veval_yt1b_val\"].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b6dc186",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The `info` entry of the annotation set\n",
    "annot_dfs[\"saco_veval_yt1b_val\"][\"info\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c41091b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first 3 entries of `videos`\n",
    "annot_dfs[\"saco_veval_yt1b_val\"][\"videos\"].head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7df5771",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first 3 entries of `annotations`\n",
    "annot_dfs[\"saco_veval_yt1b_val\"][\"annotations\"].head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24d2861c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first 3 entries of `categories`\n",
    "annot_dfs[\"saco_veval_yt1b_val\"][\"categories\"].head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9f98f27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first 3 rows of `video_np_pairs` (the table the visualization cell samples from)\n",
    "annot_dfs[\"saco_veval_yt1b_val\"][\"video_np_pairs\"].head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5673a63f",
   "metadata": {},
   "source": [
    "3. Visualize the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da827d09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select a target dataset\n",
    "target_dataset_name = \"saco_veval_yt1b_val\"\n",
    "\n",
    "# Visualize a random positive video-np pair (a pair with at least one masklet)\n",
    "df_pairs = annot_dfs[target_dataset_name][\"video_np_pairs\"]\n",
    "df_positive_pairs = df_pairs[df_pairs.num_masklets > 0]\n",
    "rand_idx = np.random.randint(len(df_positive_pairs))\n",
    "pair_row = df_positive_pairs.iloc[rand_idx]\n",
    "video_id = pair_row.video_id\n",
    "noun_phrase = pair_row.noun_phrase\n",
    "print(f\"Randomly selected video-np pair: video_id={video_id}, noun_phrase={noun_phrase}\")\n",
    "\n",
    "\n",
    "def display_image_in_subplot(img, axes, row, col, title=\"\"):\n",
    "    \"\"\"Show `img` in the subplot at (row, col) of a 2-D `axes` grid, with `title` and the axis hidden.\"\"\"\n",
    "    axes[row, col].imshow(img)\n",
    "    axes[row, col].set_title(title)\n",
    "    axes[row, col].axis('off')\n",
    "\n",
    "\n",
    "num_frames_to_show = 5  # Number of frames to show per dataset\n",
    "every_n_frames = 4  # Interval between frames to show\n",
    "\n",
    "# One row per sampled frame; columns: frame, masks on white, masks overlaid.\n",
    "# squeeze=False keeps `axes` 2-D even when num_frames_to_show is 1, so axes[row, col] always works.\n",
    "fig, axes = plt.subplots(\n",
    "    num_frames_to_show, 3, figsize=(15, 5 * num_frames_to_show), squeeze=False\n",
    ")\n",
    "\n",
    "for idx in range(num_frames_to_show):\n",
    "    sampled_frame_idx = idx * every_n_frames\n",
    "    print(f\"Reading annotations for frame {sampled_frame_idx}\")\n",
    "    # Get the frame and the corresponding masks and noun phrases\n",
    "    frame, annot_masks, annot_noun_phrases = utils.get_all_annotations_for_frame(\n",
    "        annot_dfs[target_dataset_name],\n",
    "        video_id=video_id,\n",
    "        frame_idx=sampled_frame_idx,\n",
    "        data_dir=DATA_DIR,\n",
    "        dataset=target_dataset_name,\n",
    "    )\n",
    "\n",
    "    # Show the frame\n",
    "    display_image_in_subplot(frame, axes, idx, 0, f\"{target_dataset_name} - {noun_phrase} - Frame {sampled_frame_idx}\")\n",
    "\n",
    "    # Check for missing masks BEFORE filtering: iterating over None would raise a TypeError,\n",
    "    # and after the filter below `annot_masks` is always a list.\n",
    "    if annot_masks is None:\n",
    "        print(f\"No masks found for video_id {video_id} at frame {sampled_frame_idx}\")\n",
    "        # Hide the two unused panels of this row instead of leaving empty axes\n",
    "        axes[idx, 1].axis('off')\n",
    "        axes[idx, 2].axis('off')\n",
    "        continue\n",
    "\n",
    "    # Filter masks by the selected noun phrase (loop variable is not named `np`, to avoid shadowing numpy)\n",
    "    annot_masks = [\n",
    "        mask for mask, phrase in zip(annot_masks, annot_noun_phrases) if phrase == noun_phrase\n",
    "    ]\n",
    "    mask_colors = COLORS[: len(annot_masks)]\n",
    "\n",
    "    # Show all masks over a white background\n",
    "    all_masks = utils.draw_masks_to_frame(\n",
    "        frame=np.ones_like(frame)*255, masks=annot_masks, colors=mask_colors\n",
    "    )\n",
    "    display_image_in_subplot(all_masks, axes, idx, 1, f\"{target_dataset_name} - {noun_phrase} - Frame {sampled_frame_idx} - Masks\")\n",
    "\n",
    "    # Show masks overlaid on the frame\n",
    "    masked_frame = utils.draw_masks_to_frame(\n",
    "        frame=frame, masks=annot_masks, colors=mask_colors\n",
    "    )\n",
    "    display_image_in_subplot(masked_frame, axes, idx, 2, f\"Dataset: {target_dataset_name} - {noun_phrase} - Frame {sampled_frame_idx} - Masks overlaid\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2a23152",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}