first commit
This commit is contained in:
242
examples/sam3_agent.ipynb
Normal file
242
examples/sam3_agent.ipynb
Normal file
@@ -0,0 +1,242 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Copyright (c) Meta Platforms, Inc. and affiliates."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# SAM 3 Agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This notebook shows an example of how an MLLM can use SAM 3 as a tool, i.e., \"SAM 3 Agent\", to segment more complex text queries such as \"the leftmost child wearing blue vest\"."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Env Setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First install `sam3` in your environment using the [installation instructions](https://github.com/facebookresearch/sam3?tab=readme-ov-file#installation) in the repository."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"# turn on tfloat32 for Ampere GPUs\n",
|
||||
"# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices\n",
|
||||
"torch.backends.cuda.matmul.allow_tf32 = True\n",
|
||||
"torch.backends.cudnn.allow_tf32 = True\n",
|
||||
"\n",
|
||||
"# use bfloat16 for the entire notebook. If your card doesn't support it, try float16 instead\n",
|
||||
"torch.autocast(\"cuda\", dtype=torch.bfloat16).__enter__()\n",
|
||||
"\n",
|
||||
"# inference mode for the whole notebook. Disable if you need gradients\n",
|
||||
"torch.inference_mode().__enter__()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"SAM3_ROOT = os.path.dirname(os.getcwd())\n",
|
||||
"os.chdir(SAM3_ROOT)\n",
|
||||
"\n",
|
||||
"# setup GPU to use - a single GPU is sufficient for the purposes of this demo\n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"_ = os.system(\"nvidia-smi\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Build SAM3 Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sam3\n",
|
||||
"from sam3 import build_sam3_image_model\n",
|
||||
"from sam3.model.sam3_image_processor import Sam3Processor\n",
|
||||
"\n",
|
||||
"sam3_root = os.path.dirname(sam3.__file__)\n",
|
||||
"bpe_path = f\"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz\"\n",
|
||||
"model = build_sam3_image_model(bpe_path=bpe_path)\n",
|
||||
"processor = Sam3Processor(model, confidence_threshold=0.5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## LLM Setup\n",
|
||||
"\n",
|
||||
"Configure which MLLM to use; it can either be a model served by vLLM that you launch from your own machine, or a model served via an external API. If you want to use a vLLM model, we have also provided instructions below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"LLM_CONFIGS = {\n",
|
||||
" # vLLM-served models\n",
|
||||
" \"qwen3_vl_8b_thinking\": {\n",
|
||||
" \"provider\": \"vllm\",\n",
|
||||
" \"model\": \"Qwen/Qwen3-VL-8B-Thinking\",\n",
|
||||
" },\n",
|
||||
" # models served via external APIs\n",
|
||||
" # add your own\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"model = \"qwen3_vl_8b_thinking\"\n",
|
||||
"LLM_API_KEY = \"DUMMY_API_KEY\"\n",
|
||||
"\n",
|
||||
"llm_config = LLM_CONFIGS[model]\n",
|
||||
"llm_config[\"api_key\"] = LLM_API_KEY\n",
|
||||
"llm_config[\"name\"] = model\n",
|
||||
"\n",
|
||||
"# setup API endpoint\n",
|
||||
"if llm_config[\"provider\"] == \"vllm\":\n",
|
||||
" LLM_SERVER_URL = \"http://0.0.0.0:8001/v1\" # replace this with your vLLM server address as needed\n",
|
||||
"else:\n",
|
||||
" LLM_SERVER_URL = llm_config[\"base_url\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set up the vLLM server\n",
|
||||
"This step is only required if you are using a model served by vLLM; skip it if you are calling an LLM via an external API such as Gemini or GPT.\n",
|
||||
"\n",
|
||||
"* Install vLLM (in a separate conda env from SAM 3 to avoid dependency conflicts).\n",
|
||||
" ```bash\n",
|
||||
" conda create -n vllm python=3.12\n",
|
||||
" pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128\n",
|
||||
" ```\n",
|
||||
"* Start the vLLM server on the same machine as this notebook\n",
|
||||
" ```bash\n",
|
||||
" # qwen 3 VL 8B thinking\n",
|
||||
" vllm serve Qwen/Qwen3-VL-8B-Thinking --tensor-parallel-size 4 --allowed-local-media-path / --enforce-eager --port 8001\n",
|
||||
" ```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Run SAM3 Agent Inference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from functools import partial\n",
|
||||
"from IPython.display import display, Image\n",
|
||||
"from sam3.agent.client_llm import send_generate_request as send_generate_request_orig\n",
|
||||
"from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig\n",
|
||||
"from sam3.agent.inference import run_single_image_inference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"output": {
|
||||
"id": 689664053567678,
|
||||
"loadingStatus": "loaded"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# prepare input args and run single image inference\n",
|
||||
"image = \"assets/images/test_image.jpg\"\n",
|
||||
"prompt = \"the leftmost child wearing blue vest\"\n",
|
||||
"image = os.path.abspath(image)\n",
|
||||
"send_generate_request = partial(send_generate_request_orig, server_url=LLM_SERVER_URL, model=llm_config[\"model\"], api_key=llm_config[\"api_key\"])\n",
|
||||
"call_sam_service = partial(call_sam_service_orig, sam3_processor=processor)\n",
|
||||
"output_image_path = run_single_image_inference(\n",
|
||||
" image, prompt, llm_config, send_generate_request, call_sam_service,\n",
|
||||
" debug=True, output_dir=\"agent_output\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# display output\n",
|
||||
"if output_image_path is not None:\n",
|
||||
" display(Image(filename=output_image_path))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"fileHeader": "",
|
||||
"fileUid": "be59e249-6c09-4634-a9e7-1f06fd233c42",
|
||||
"isAdHoc": false,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Reference in New Issue
Block a user