Initial commit
fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
This commit is contained in:
260
scripts/eval/silver/download_videos.py
Normal file
260
scripts/eval/silver/download_videos.py
Normal file
@@ -0,0 +1,260 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
import ast
|
||||
import concurrent.futures
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from concurrent.futures import as_completed, ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
import yt_dlp
|
||||
|
||||
from utils import (
|
||||
annotation_files,
|
||||
config,
|
||||
load_json,
|
||||
run_command,
|
||||
save_json,
|
||||
update_annotations,
|
||||
)
|
||||
|
||||
|
||||
def construct_gcs_path(original_video):
|
||||
"""
|
||||
Convert original_video string to GCS path.
|
||||
Example:
|
||||
'AUTOLab_failure_2023-07-07_Fri_Jul__7_18:50:36_2023_recordings_MP4_22008760.mp4'
|
||||
->
|
||||
'gs://gresearch/robotics/droid_raw/1.0.1/AUTOLab/failure/2023-07-07/Fri_Jul__7_18:50:36_2023/recordings/MP4/22008760.mp4'
|
||||
"""
|
||||
parts = original_video.split("_")
|
||||
lab = parts[0]
|
||||
failure = parts[1]
|
||||
date = parts[2]
|
||||
time = "_".join(parts[3:-3])
|
||||
recordings = parts[-3]
|
||||
mp4 = parts[-2]
|
||||
file_id = parts[-1].split(".")[0]
|
||||
gcs_path = (
|
||||
f"gs://gresearch/robotics/droid_raw/1.0.1/"
|
||||
f"{lab}/{failure}/{date}/{time}/{recordings}/{mp4}/{file_id}.mp4"
|
||||
)
|
||||
return gcs_path
|
||||
|
||||
|
||||
def download_video(args):
|
||||
gcs_path, dst_dir, json_file = args
|
||||
# Ensure subdirectory exists
|
||||
subdir = Path(dst_dir)
|
||||
os.makedirs(subdir, exist_ok=True)
|
||||
# Save file with its original name inside the subdir
|
||||
print(json_file)
|
||||
local_path = subdir / json_file
|
||||
cmd = f'gsutil cp "{gcs_path}" "{local_path}"'
|
||||
print(f"Running: {cmd}")
|
||||
try:
|
||||
run_command(cmd)
|
||||
return (gcs_path, True, None)
|
||||
except Exception as e:
|
||||
return (gcs_path, False, str(e))
|
||||
|
||||
|
||||
def download_youtube_video(youtube_id, output_path=None):
|
||||
try:
|
||||
if output_path is None:
|
||||
output_path = os.path.join(
|
||||
config["yt1b_path"], "downloaded_videos", f"video_{youtube_id}.mp4"
|
||||
)
|
||||
url = f"https://www.youtube.com/watch?v={youtube_id}"
|
||||
if os.path.exists(output_path):
|
||||
return youtube_id, None
|
||||
format = "best[height<=720][fps<=30]/best[height<=720]/best" # 720p or lower, max 30fps
|
||||
ydl_opts = {
|
||||
"format": format,
|
||||
"outtmpl": output_path,
|
||||
"merge_output_format": "mp4",
|
||||
"quiet": True,
|
||||
"cookiefile": config["cookies_path"],
|
||||
"socket_timeout": 60, # Increase timeout to 60 seconds (default is 10)
|
||||
}
|
||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
||||
ydl.download([url])
|
||||
return youtube_id, None
|
||||
except Exception as e:
|
||||
return youtube_id, str(e)
|
||||
|
||||
|
||||
def download_youtube():
|
||||
all_videos_to_download = set()
|
||||
for annotation_file in annotation_files["yt1b"]:
|
||||
ann = load_json(os.path.join(config["path_annotations"], annotation_file))
|
||||
for video_info in ann["images"]:
|
||||
youtube_id = video_info["original_video"]
|
||||
all_videos_to_download.add(youtube_id)
|
||||
|
||||
videos_to_download_still = all_videos_to_download
|
||||
videos_downloaded = set()
|
||||
videos_unavailable = set()
|
||||
num_download_retries = 3
|
||||
for _ in range(num_download_retries):
|
||||
if len(videos_to_download_still) == 0:
|
||||
break
|
||||
videos_error = set()
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futures = [
|
||||
executor.submit(download_youtube_video, youtube_id)
|
||||
for youtube_id in videos_to_download_still
|
||||
]
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
youtube_id, exception = future.result()
|
||||
if exception is None:
|
||||
videos_downloaded.add(youtube_id)
|
||||
elif "unavailable" in exception or "members-only" in exception:
|
||||
videos_unavailable.add(youtube_id)
|
||||
else:
|
||||
videos_error.add(youtube_id)
|
||||
videos_to_download_still = (
|
||||
all_videos_to_download - videos_downloaded - videos_unavailable
|
||||
)
|
||||
assert videos_to_download_still == videos_error
|
||||
|
||||
if len(videos_unavailable) + len(videos_to_download_still) > 0:
|
||||
message = "Some videos are either no longer available on YouTube, or are set to private, or resulted in some other error. "
|
||||
if config["update_annotation_yt1b"]:
|
||||
message += "The unavailable videos will be ***REMOVED*** from the annotation file. This will make the test results NOT DIRECTLY COMPARABLE to other reported results."
|
||||
print(message)
|
||||
update_annotations("yt1b", videos_downloaded)
|
||||
else:
|
||||
message += "You may want to either re-try the download, or remove these videos from the evaluation json"
|
||||
print(message)
|
||||
|
||||
|
||||
def download_droid():
|
||||
ann_dir = Path(config["path_annotations"])
|
||||
dst_dir = Path(config["droid_path"]) / "downloaded_videos"
|
||||
json_files = annotation_files["droid"]
|
||||
|
||||
download_tasks = []
|
||||
original_videos = set()
|
||||
for json_file in json_files:
|
||||
json_path = ann_dir / json_file
|
||||
data = load_json(json_path)
|
||||
for img in data["images"]:
|
||||
original_video = img["original_video"]
|
||||
original_videos.add(original_video)
|
||||
|
||||
print(len(original_videos))
|
||||
for original_video in original_videos:
|
||||
gcs_path = construct_gcs_path(original_video)
|
||||
download_tasks.append((gcs_path, dst_dir, original_video))
|
||||
|
||||
max_workers = min(16, len(download_tasks))
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
future_to_task = {
|
||||
executor.submit(download_video, task): task for task in download_tasks
|
||||
}
|
||||
for future in as_completed(future_to_task):
|
||||
gcs_path, success, error = future.result()
|
||||
if not success:
|
||||
print(f"Failed to download {gcs_path}: {error}")
|
||||
|
||||
|
||||
def download_ego4d():
|
||||
output_dir = os.path.join(config["ego4d_path"], "downloaded_videos")
|
||||
|
||||
ann_dir = Path(config["path_annotations"])
|
||||
json_files = annotation_files["ego4d"]
|
||||
original_videos = set()
|
||||
for json_file in json_files:
|
||||
json_path = ann_dir / json_file
|
||||
data = load_json(json_path)
|
||||
for img in data["images"]:
|
||||
original_video = img["original_video"]
|
||||
original_videos.add(original_video)
|
||||
|
||||
original_video_uids = [
|
||||
video_uid.replace(".mp4", "") for video_uid in original_videos
|
||||
]
|
||||
video_ids_download = original_video_uids
|
||||
num_download_retries = 2
|
||||
download_correct = False
|
||||
message = ""
|
||||
for _ in range(num_download_retries):
|
||||
cmd = (
|
||||
[
|
||||
# "python", "-m", "ego4d.cli.cli",
|
||||
"ego4d",
|
||||
"--output_directory",
|
||||
output_dir,
|
||||
"--datasets",
|
||||
"clips",
|
||||
"--version",
|
||||
"v1",
|
||||
"--video_uids",
|
||||
]
|
||||
+ video_ids_download
|
||||
+ ["--yes"]
|
||||
)
|
||||
|
||||
# Run the command
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
message = result.stderr
|
||||
if (
|
||||
"RuntimeError: The following requested video UIDs could not be found in the manifest for version:"
|
||||
in result.stderr
|
||||
):
|
||||
not_findable_videos = ast.literal_eval(result.stderr.split("\n")[-2])
|
||||
video_ids_download = [
|
||||
video_uid
|
||||
for video_uid in video_ids_download
|
||||
if video_uid not in not_findable_videos
|
||||
]
|
||||
else:
|
||||
download_correct = True
|
||||
break
|
||||
|
||||
if not download_correct:
|
||||
print(f"There was an error downloading the Ego4D data: {message}")
|
||||
|
||||
if len(video_ids_download) != len(original_video_uids):
|
||||
message = "Some videos are no longer available. "
|
||||
if config["update_annotation_ego4d"]:
|
||||
message += "The unavailable videos will be ***REMOVED*** from the annotation file. This will make the test results NOT DIRECTLY COMPARABLE to other reported results."
|
||||
print(message)
|
||||
update_annotations("ego4d", video_ids_download)
|
||||
else:
|
||||
message += "You may want to either re-try the download, or remove these videos from the evaluation json"
|
||||
print(message)
|
||||
|
||||
|
||||
def download_sav():
|
||||
tar_url = config["sav_videos_fps_6_download_path"]
|
||||
tar_file = "videos_fps_6.tar"
|
||||
sav_data_dir = os.path.join(config["sav_path"], "downloaded_videos")
|
||||
os.makedirs(sav_data_dir, exist_ok=True)
|
||||
|
||||
subprocess.run(["wget", tar_url, "-O", tar_file], cwd=sav_data_dir, check=True)
|
||||
subprocess.run(["tar", "-xvf", tar_file], cwd=sav_data_dir, check=True)
|
||||
subprocess.run(["rm", tar_file], cwd=sav_data_dir, check=True)
|
||||
|
||||
|
||||
def main():
|
||||
assert len(sys.argv) > 1, "You have to provide the name of the dataset"
|
||||
dataset_name = sys.argv[1]
|
||||
assert (
|
||||
dataset_name in annotation_files
|
||||
), f"The dataset can be one of {list(annotation_files.keys())}"
|
||||
|
||||
if dataset_name == "yt1b":
|
||||
download_youtube()
|
||||
elif dataset_name == "droid":
|
||||
download_droid()
|
||||
elif dataset_name == "ego4d":
|
||||
download_ego4d()
|
||||
elif dataset_name == "sav":
|
||||
download_sav()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user