73 lines
2.7 KiB
Python
73 lines
2.7 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
|
|
|
|
# pyre-unsafe
|
|
import argparse
|
|
from multiprocessing import Pool
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import utils
|
|
from tqdm import tqdm
|
|
|
|
|
|
def main(args, n_workers=20):
    """Copy the annotated images of one dataset into a single flat folder.

    Builds a list of ``(source_path, destination_path)`` pairs for the dataset
    selected by ``args.dataset_name`` and passes each pair to
    ``utils.copy_file`` through a process pool.

    Args:
        args: Parsed command-line namespace providing ``annotation_file``,
            ``raw_images_folder``, ``processed_images_folder`` and
            ``dataset_name``.
        n_workers: Number of worker processes used for copying.

    Raises:
        ValueError: If ``args.dataset_name`` is not ``"geode"``,
            ``"bdd100k"`` or ``"food_rec"``.
    """
    raw_folder = Path(args.raw_images_folder)
    processed_folder = Path(args.processed_images_folder)
    # NOTE(review): presumably prepares the output folder -- confirm in utils.
    utils.setup(processed_folder)
    img_ids = utils.get_image_ids(args.annotation_file)

    if args.dataset_name == "geode":
        # index.csv lists every image by a path relative to <raw>/images.
        metadata = pd.read_csv(raw_folder / "index.csv")
        # Flatten the nested relative path into a single file name.
        metadata["flat_filepath"] = metadata.file_path.apply(
            lambda x: x.replace("/", "_")
        )
        metadata["original_absolute_path"] = metadata.file_path.apply(
            lambda x: str((raw_folder / "images") / x)
        )
        metadata["new_absolute_path"] = metadata.flat_filepath.apply(
            lambda x: str(processed_folder / x)
        )
        # Image ids are looked up by the stem of the flattened file name.
        metadata["filestem"] = metadata.new_absolute_path.apply(
            lambda x: Path(x).stem
        )
        # Only the two path columns are needed, so convert just those.
        img_id_mapping = metadata.set_index("filestem")[
            ["original_absolute_path", "new_absolute_path"]
        ].to_dict()
        paths = [
            (
                img_id_mapping["original_absolute_path"][each],
                img_id_mapping["new_absolute_path"][each],
            )
            for each in img_ids
        ]
    elif args.dataset_name == "bdd100k":
        bdd_subfolder = "100k/train"
        img_filenames = utils.get_filenames(args.annotation_file)
        raw_folder_bdd_images = raw_folder / bdd_subfolder
        paths = [
            (raw_folder_bdd_images / each, processed_folder / each)
            for each in img_filenames
        ]
    elif args.dataset_name == "food_rec":
        food_subfolder = "public_validation_set_2.0/images"
        img_filenames = utils.get_filenames(args.annotation_file)
        raw_folder_food_images = raw_folder / food_subfolder
        paths = [
            (
                # The raw file is named after the last "_"-separated piece of
                # the annotated file's stem, with the same extension.
                raw_folder_food_images
                / f"{Path(each).stem.split('_')[-1]}{Path(each).suffix}",
                processed_folder / each,
            )
            for each in img_filenames
        ]
    else:
        # Without this, `paths` would be unbound and fail below with an
        # opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported dataset_name {args.dataset_name!r}; "
            "expected one of 'geode', 'bdd100k', 'food_rec'"
        )

    print("Preparing to copy and flatten filename for", len(paths), "images")
    # Honour the caller-supplied worker count instead of a hard-coded 20.
    with Pool(n_workers) as p:
        # Drain the iterator so every copy runs and tqdm shows progress.
        for _ in tqdm(p.imap(utils.copy_file, paths), total=len(paths)):
            pass
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the paths and dataset name, then run
    # the copy/flatten step.
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation_file", help="Path to annotation file")
    parser.add_argument("--raw_images_folder", help="Path to downloaded images")
    parser.add_argument("--processed_images_folder", help="Path to processed images")
    parser.add_argument(
        "--dataset_name",
        # Restrict to the datasets main() knows how to handle so a typo is
        # reported by argparse instead of failing later.
        choices=("geode", "bdd100k", "food_rec"),
        help="Name of the dataset to process",
    )
    args = parser.parse_args()
    main(args)
|