Files
sam3_local/scripts/eval/veval/saco_yt1b_frame_prep_util.py
facebook-github-bot a13e358df4 Initial commit
fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
2025-11-18 23:07:54 -08:00

266 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import argparse
import logging
import os
import subprocess
import pandas as pd
import yt_dlp
logger = logging.getLogger(__name__)
class YtVideoPrep:
def __init__(
self,
saco_yt1b_id: str,
data_dir: str,
cookies_file: str,
yt1b_start_end_time_file: str,
ffmpeg_timeout: int,
sleep_interval: int = 10,
max_sleep_interval: int = 30,
):
self.saco_yt1b_id = saco_yt1b_id # saco_yt1b_id is like saco_yt1b_000000
self.data_dir = data_dir
self.cookies_file = cookies_file
self.ffmpeg_timeout = ffmpeg_timeout
self.sleep_interval = sleep_interval
self.max_sleep_interval = max_sleep_interval
self.yt1b_start_end_time_df = pd.read_json(yt1b_start_end_time_file)
(
self.yt_video_id,
self.yt_video_id_w_timestamps,
self.start_time,
self.end_time,
self.expected_num_frames,
) = self._get_yt_video_id_map_info()
self.raw_video_dir = os.path.join(self.data_dir, "raw_videos")
self.raw_video_path = os.path.join(
self.raw_video_dir, f"{self.yt_video_id}.mp4"
)
self.JPEGImages_6fps_dir = os.path.join(
self.data_dir, "JPEGImages_6fps", self.saco_yt1b_id
)
self.JPEGImages_6fps_pattern = os.path.join(
self.JPEGImages_6fps_dir, "%05d.jpg"
)
os.makedirs(self.raw_video_dir, exist_ok=True)
os.makedirs(self.JPEGImages_6fps_dir, exist_ok=True)
def _get_yt_video_id_map_info(self):
df = self.yt1b_start_end_time_df[
self.yt1b_start_end_time_df.saco_yt1b_id == self.saco_yt1b_id
]
assert (
len(df) == 1
), f"Expected exactly 1 row for saco_yt1b_id: {self.saco_yt1b_id}, found {len(df)}"
id_and_frame_map_row = df.iloc[0]
yt_video_id = (
id_and_frame_map_row.yt_video_id
) # yt_video_id is like -06NgWyZxC0
yt_video_id_w_timestamps = id_and_frame_map_row.yt_video_id_w_timestamps
start_time = id_and_frame_map_row.start_time
end_time = id_and_frame_map_row.end_time
expected_num_frames = id_and_frame_map_row.length
return (
yt_video_id,
yt_video_id_w_timestamps,
start_time,
end_time,
expected_num_frames,
)
def download_youtube_video(self):
video_url = f"https://youtube.com/watch?v={self.yt_video_id}"
assert os.path.exists(
self.cookies_file
), f"Cookies file '{self.cookies_file}' not found. Must have it to download videos."
outtmpl = self.raw_video_path
# Check if the output file already exists
if os.path.exists(outtmpl) and os.path.isfile(outtmpl):
return "already exists"
ydl_opts = {
"format": "best[height<=720]/best", # 720p or lower
"outtmpl": outtmpl,
"merge_output_format": "mp4",
"noplaylist": True,
"quiet": True,
"cookiefile": self.cookies_file,
"sleep_interval": self.sleep_interval, # Sleep before each download to avoid rate limiting
"max_sleep_interval": self.max_sleep_interval, # Random sleep for more human-like behavior
}
if self.yt_video_id in ["euohdDLEMRg", "nzfAn7n4d-0"]:
# For "euohdDLEMRg", we have to specify the https protocol or the video sometimes can't be downloaded completely
# For "nzfAn7n4d-0", without the https protocol, the video will be downloaded as 654×480, however we need 490×360 to match the frame matching after the 1080 width resizing
ydl_opts["format"] = (
"best[height<=720][ext=mp4][protocol^=https]/best[ext=mp4][protocol^=https]/best[height<=720]/best"
)
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([video_url])
return "success"
except Exception as e:
logger.warning(
f"[video download][{self.saco_yt1b_id}] Error downloading video {self.yt_video_id}: {e}"
)
return f"error {e}"
def extract_frames_in_6fps_and_width_1080(self):
"""
Extract target frames in 6fps and width 1080.
"""
if not os.path.exists(self.raw_video_path):
logger.warning(
f"[frame extracting][{self.saco_yt1b_id}] Raw video file not found at {self.raw_video_path}"
)
os.rmdir(self.JPEGImages_6fps_dir)
return False
if (
os.path.exists(self.JPEGImages_6fps_dir)
and len(os.listdir(self.JPEGImages_6fps_dir)) == self.expected_num_frames
):
logger.info(
f"[frame extracting][{self.saco_yt1b_id}] JPEGImages_6fps directory already exists at {self.JPEGImages_6fps_dir} and expected number of frames {self.expected_num_frames} matches"
)
return True
# Clear the directory before extracting new frames
for file in os.listdir(self.JPEGImages_6fps_dir):
os.remove(os.path.join(self.JPEGImages_6fps_dir, file))
args = [
"-nostdin",
"-y",
# select video segment
"-ss",
str(self.start_time),
"-to",
str(self.end_time),
"-i",
self.raw_video_path,
# set output video resolution to be 6fps and at most 1080p
"-vf",
"fps=6,scale=1080:-2",
"-vsync",
"0", # passthrough mode - no frame duplication/dropping
"-q:v",
"2", # high quality JPEG output
"-start_number",
"0", # start frame numbering from 0
self.JPEGImages_6fps_pattern,
]
result = subprocess.run(
["ffmpeg"] + args,
timeout=self.ffmpeg_timeout,
capture_output=True,
text=True,
)
if result.returncode != 0:
logger.warning(
f"[frame extracting][{self.saco_yt1b_id}] Failed to extract raw frames: {result.stderr}"
)
os.rmdir(self.JPEGImages_6fps_dir)
return False
if len(os.listdir(self.JPEGImages_6fps_dir)) != self.expected_num_frames:
logger.warning(
f"[frame extracting][{self.saco_yt1b_id}] Expected {self.expected_num_frames} frames but extracted {len(os.listdir(self.JPEGImages_6fps_dir))}"
)
# Clear the directory after failed extraction
for file in os.listdir(self.JPEGImages_6fps_dir):
os.remove(os.path.join(self.JPEGImages_6fps_dir, file))
os.rmdir(self.JPEGImages_6fps_dir)
return False
logger.info(
f"[frame extracting][{self.saco_yt1b_id}] Successfully extracted {self.expected_num_frames} frames to {self.JPEGImages_6fps_dir}"
)
return True
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--saco_yt1b_id", type=str, required=True)
parser.add_argument(
"--data_dir",
type=str,
required=True,
)
parser.add_argument(
"--cookies_file",
type=str,
required=True,
)
parser.add_argument(
"--yt1b_start_end_time_file",
type=str,
required=True,
)
parser.add_argument(
"--yt1b_frame_prep_log_file",
type=str,
required=True,
)
parser.add_argument(
"--ffmpeg_timeout",
type=str,
default=7200, # Use longer timeout in case of large videos processing timeout
)
parser.add_argument(
"--sleep_interval",
type=int,
default=10,
)
parser.add_argument(
"--max_sleep_interval",
type=int,
default=30,
)
args = parser.parse_args()
logging.basicConfig(
filename=args.yt1b_frame_prep_log_file,
format="%(asctime)s [%(threadName)s] %(levelname)s: %(message)s",
level=logging.INFO,
filemode="w",
)
video_prep = YtVideoPrep(
saco_yt1b_id=args.saco_yt1b_id,
data_dir=args.data_dir,
cookies_file=args.cookies_file,
yt1b_start_end_time_file=args.yt1b_start_end_time_file,
ffmpeg_timeout=args.ffmpeg_timeout,
sleep_interval=args.sleep_interval,
max_sleep_interval=args.max_sleep_interval,
)
status = video_prep.download_youtube_video()
logger.info(f"[video download][{args.saco_yt1b_id}] download status {status}")
status = video_prep.extract_frames_in_6fps_and_width_1080()
logger.info(
f"[frame extracting][{args.saco_yt1b_id}] frame extracting status {status}"
)
if __name__ == "__main__":
main()