first commit
This commit is contained in:
83
scripts/eval/silver/download_inaturalist.py
Normal file
83
scripts/eval/silver/download_inaturalist.py
Normal file
@@ -0,0 +1,83 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
||||
|
||||
# pyre-unsafe
|
||||
import argparse
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def download_archive(url, dest_dir):
|
||||
dest_dir = Path(dest_dir)
|
||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||
archive_path = dest_dir / url.split("/")[-1]
|
||||
if not archive_path.exists():
|
||||
print(f"Downloading archive to {archive_path}...")
|
||||
result = subprocess.run(["wget", "-O", str(archive_path), url])
|
||||
if result.returncode != 0:
|
||||
print("Download failed.")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"Archive already exists at {archive_path}")
|
||||
return archive_path
|
||||
|
||||
|
||||
def extract_archive(archive_path, dest_dir):
|
||||
print(f"Extracting {archive_path} to {dest_dir}...")
|
||||
with tarfile.open(archive_path, "r:gz") as tar:
|
||||
tar.extractall(path=dest_dir)
|
||||
print("Extraction complete.")
|
||||
|
||||
|
||||
def copy_images(subset_json, untar_dir, output_dir):
|
||||
with open(subset_json, "r") as f:
|
||||
image_dict = json.load(f)
|
||||
output_dir = Path(output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
for target_name, rel_path in tqdm(image_dict.items(), "Copying image subset"):
|
||||
src = Path(untar_dir) / rel_path
|
||||
dst = output_dir / target_name
|
||||
if not src.exists():
|
||||
print(f"Warning: Source image {src} does not exist, skipping.")
|
||||
continue
|
||||
shutil.copy2(src, dst)
|
||||
print(f"Copied {len(image_dict)} images to {output_dir}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download, extract, and copy subset of iNaturalist images from archive."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--raw_images_folder", help="Path to downloaded and extract the archive"
|
||||
)
|
||||
parser.add_argument("--processed_images_folder", help="Path to processed images")
|
||||
parser.add_argument(
|
||||
"--subset-json",
|
||||
default="inaturalist_image_subset.json",
|
||||
help="Path to iNaturalist images subset",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--archive-url",
|
||||
default="https://ml-inat-competition-datasets.s3.amazonaws.com/2017/train_val_images.tar.gz",
|
||||
help="URL of the archive to download",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
dest_dir = Path(args.raw_images_folder)
|
||||
images_dir = Path(args.processed_images_folder)
|
||||
|
||||
archive_path = download_archive(args.archive_url, dest_dir)
|
||||
extract_archive(archive_path, dest_dir)
|
||||
|
||||
untar_dir = dest_dir / "train_val_images"
|
||||
copy_images(args.subset_json, untar_dir, images_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user