Initial commit

fbshipit-source-id: da6be2f26e3a1202f4bffde8cb980e2dcb851294
2025-11-18 23:07:42 -08:00
commit a13e358df4
504 changed files with 122758 additions and 0 deletions
--- a/scripts/eval/silver/download_videos.py
+++ b/scripts/eval/silver/download_videos.py
@@ -0,0 +1,260 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+import ast
+import concurrent.futures
+import os
+import shutil
+import subprocess
+import sys
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from pathlib import Path
+
+import yt_dlp
+
+from utils import (
+    annotation_files,
+    config,
+    load_json,
+    run_command,
+    save_json,
+    update_annotations,
+)
+
+
+def construct_gcs_path(original_video):
+    """
+    Convert original_video string to GCS path.
+    Example:
+    'AUTOLab_failure_2023-07-07_Fri_Jul__7_18:50:36_2023_recordings_MP4_22008760.mp4'
+    ->
+    'gs://gresearch/robotics/droid_raw/1.0.1/AUTOLab/failure/2023-07-07/Fri_Jul__7_18:50:36_2023/recordings/MP4/22008760.mp4'
+    """
+    parts = original_video.split("_")
+    lab = parts[0]
+    failure = parts[1]
+    date = parts[2]
+    time = "_".join(parts[3:-3])
+    recordings = parts[-3]
+    mp4 = parts[-2]
+    file_id = parts[-1].split(".")[0]
+    gcs_path = (
+        f"gs://gresearch/robotics/droid_raw/1.0.1/"
+        f"{lab}/{failure}/{date}/{time}/{recordings}/{mp4}/{file_id}.mp4"
+    )
+    return gcs_path
+
+
+def download_video(args):
+    gcs_path, dst_dir, json_file = args
+    # Ensure subdirectory exists
+    subdir = Path(dst_dir)
+    os.makedirs(subdir, exist_ok=True)
+    # Save file with its original name inside the subdir
+    print(json_file)
+    local_path = subdir / json_file
+    cmd = f'gsutil cp "{gcs_path}" "{local_path}"'
+    print(f"Running: {cmd}")
+    try:
+        run_command(cmd)
+        return (gcs_path, True, None)
+    except Exception as e:
+        return (gcs_path, False, str(e))
+
+
+def download_youtube_video(youtube_id, output_path=None):
+    try:
+        if output_path is None:
+            output_path = os.path.join(
+                config["yt1b_path"], "downloaded_videos", f"video_{youtube_id}.mp4"
+            )
+        url = f"https://www.youtube.com/watch?v={youtube_id}"
+        if os.path.exists(output_path):
+            return youtube_id, None
+        format = "best[height<=720][fps<=30]/best[height<=720]/best"  # 720p or lower, max 30fps
+        ydl_opts = {
+            "format": format,
+            "outtmpl": output_path,
+            "merge_output_format": "mp4",
+            "quiet": True,
+            "cookiefile": config["cookies_path"],
+            "socket_timeout": 60,  # Increase timeout to 60 seconds (default is 10)
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+        return youtube_id, None
+    except Exception as e:
+        return youtube_id, str(e)
+
+
+def download_youtube():
+    all_videos_to_download = set()
+    for annotation_file in annotation_files["yt1b"]:
+        ann = load_json(os.path.join(config["path_annotations"], annotation_file))
+        for video_info in ann["images"]:
+            youtube_id = video_info["original_video"]
+            all_videos_to_download.add(youtube_id)
+
+    videos_to_download_still = all_videos_to_download
+    videos_downloaded = set()
+    videos_unavailable = set()
+    num_download_retries = 3
+    for _ in range(num_download_retries):
+        if len(videos_to_download_still) == 0:
+            break
+        videos_error = set()
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(download_youtube_video, youtube_id)
+                for youtube_id in videos_to_download_still
+            ]
+            for future in concurrent.futures.as_completed(futures):
+                youtube_id, exception = future.result()
+                if exception is None:
+                    videos_downloaded.add(youtube_id)
+                elif "unavailable" in exception or "members-only" in exception:
+                    videos_unavailable.add(youtube_id)
+                else:
+                    videos_error.add(youtube_id)
+        videos_to_download_still = (
+            all_videos_to_download - videos_downloaded - videos_unavailable
+        )
+        assert videos_to_download_still == videos_error
+
+    if len(videos_unavailable) + len(videos_to_download_still) > 0:
+        message = "Some videos are either no longer available on YouTube, or are set to private, or resulted in some other error. "
+        if config["update_annotation_yt1b"]:
+            message += "The unavailable videos will be ***REMOVED*** from the annotation file. This will make the test results NOT DIRECTLY COMPARABLE to other reported results."
+            print(message)
+            update_annotations("yt1b", videos_downloaded)
+        else:
+            message += "You may want to either re-try the download, or remove these videos from the evaluation json"
+            print(message)
+
+
+def download_droid():
+    ann_dir = Path(config["path_annotations"])
+    dst_dir = Path(config["droid_path"]) / "downloaded_videos"
+    json_files = annotation_files["droid"]
+
+    download_tasks = []
+    original_videos = set()
+    for json_file in json_files:
+        json_path = ann_dir / json_file
+        data = load_json(json_path)
+        for img in data["images"]:
+            original_video = img["original_video"]
+            original_videos.add(original_video)
+
+    print(len(original_videos))
+    for original_video in original_videos:
+        gcs_path = construct_gcs_path(original_video)
+        download_tasks.append((gcs_path, dst_dir, original_video))
+
+    max_workers = min(16, len(download_tasks))
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_task = {
+            executor.submit(download_video, task): task for task in download_tasks
+        }
+        for future in as_completed(future_to_task):
+            gcs_path, success, error = future.result()
+            if not success:
+                print(f"Failed to download {gcs_path}: {error}")
+
+
+def download_ego4d():
+    output_dir = os.path.join(config["ego4d_path"], "downloaded_videos")
+
+    ann_dir = Path(config["path_annotations"])
+    json_files = annotation_files["ego4d"]
+    original_videos = set()
+    for json_file in json_files:
+        json_path = ann_dir / json_file
+        data = load_json(json_path)
+        for img in data["images"]:
+            original_video = img["original_video"]
+            original_videos.add(original_video)
+
+    original_video_uids = [
+        video_uid.replace(".mp4", "") for video_uid in original_videos
+    ]
+    video_ids_download = original_video_uids
+    num_download_retries = 2
+    download_correct = False
+    message = ""
+    for _ in range(num_download_retries):
+        cmd = (
+            [
+                # "python", "-m", "ego4d.cli.cli",
+                "ego4d",
+                "--output_directory",
+                output_dir,
+                "--datasets",
+                "clips",
+                "--version",
+                "v1",
+                "--video_uids",
+            ]
+            + video_ids_download
+            + ["--yes"]
+        )
+
+        # Run the command
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        message = result.stderr
+        if (
+            "RuntimeError: The following requested video UIDs could not be found in the manifest for version:"
+            in result.stderr
+        ):
+            not_findable_videos = ast.literal_eval(result.stderr.split("\n")[-2])
+            video_ids_download = [
+                video_uid
+                for video_uid in video_ids_download
+                if video_uid not in not_findable_videos
+            ]
+        else:
+            download_correct = True
+            break
+
+    if not download_correct:
+        print(f"There was an error downloading the Ego4D data: {message}")
+
+    if len(video_ids_download) != len(original_video_uids):
+        message = "Some videos are no longer available. "
+        if config["update_annotation_ego4d"]:
+            message += "The unavailable videos will be ***REMOVED*** from the annotation file. This will make the test results NOT DIRECTLY COMPARABLE to other reported results."
+            print(message)
+            update_annotations("ego4d", video_ids_download)
+        else:
+            message += "You may want to either re-try the download, or remove these videos from the evaluation json"
+            print(message)
+
+
+def download_sav():
+    tar_url = config["sav_videos_fps_6_download_path"]
+    tar_file = "videos_fps_6.tar"
+    sav_data_dir = os.path.join(config["sav_path"], "downloaded_videos")
+    os.makedirs(sav_data_dir, exist_ok=True)
+
+    subprocess.run(["wget", tar_url, "-O", tar_file], cwd=sav_data_dir, check=True)
+    subprocess.run(["tar", "-xvf", tar_file], cwd=sav_data_dir, check=True)
+    subprocess.run(["rm", tar_file], cwd=sav_data_dir, check=True)
+
+
+def main():
+    assert len(sys.argv) > 1, "You have to provide the name of the dataset"
+    dataset_name = sys.argv[1]
+    assert (
+        dataset_name in annotation_files
+    ), f"The dataset can be one of {list(annotation_files.keys())}"
+
+    if dataset_name == "yt1b":
+        download_youtube()
+    elif dataset_name == "droid":
+        download_droid()
+    elif dataset_name == "ego4d":
+        download_ego4d()
+    elif dataset_name == "sav":
+        download_sav()
+
+
+if __name__ == "__main__":
+    main()