From c6b3c797aba485ba4da979ce82ff839c3aca9c0b Mon Sep 17 00:00:00 2001 From: zhiltsov-max Date: Mon, 13 Jul 2020 07:13:00 +0300 Subject: [PATCH] [Datumaro] CLI updates (#1824) * updates * diff fix --- .../datumaro/cli/contexts/project/__init__.py | 16 ++++-- .../datumaro/cli/contexts/project/diff.py | 19 +++++-- datumaro/datumaro/components/project.py | 50 +++++++++++-------- datumaro/datumaro/plugins/transforms.py | 35 +++++++++++++ 4 files changed, 88 insertions(+), 32 deletions(-) diff --git a/datumaro/datumaro/cli/contexts/project/__init__.py b/datumaro/datumaro/cli/contexts/project/__init__.py index 6ee6dbaa..f116c797 100644 --- a/datumaro/datumaro/cli/contexts/project/__init__.py +++ b/datumaro/datumaro/cli/contexts/project/__init__.py @@ -550,11 +550,17 @@ def diff_command(args): dst_dir = osp.abspath(dst_dir) log.info("Saving diff to '%s'" % dst_dir) - visualizer = DiffVisualizer(save_dir=dst_dir, comparator=comparator, - output_format=args.format) - visualizer.save_dataset_diff( - first_project.make_dataset(), - second_project.make_dataset()) + dst_dir_existed = osp.exists(dst_dir) + try: + visualizer = DiffVisualizer(save_dir=dst_dir, comparator=comparator, + output_format=args.format) + visualizer.save_dataset_diff( + first_project.make_dataset(), + second_project.make_dataset()) + except BaseException: + if not dst_dir_existed and osp.isdir(dst_dir): + shutil.rmtree(dst_dir, ignore_errors=True) + raise return 0 diff --git a/datumaro/datumaro/cli/contexts/project/diff.py b/datumaro/datumaro/cli/contexts/project/diff.py index 06684c03..785c6c8e 100644 --- a/datumaro/datumaro/cli/contexts/project/diff.py +++ b/datumaro/datumaro/cli/contexts/project/diff.py @@ -83,11 +83,20 @@ class DiffVisualizer: if self.output_format is Format.tensorboard: self.file_writer.reopen() - for i, (item_a, item_b) in enumerate(zip(extractor_a, extractor_b)): - if item_a.id != item_b.id or not item_a.id or not item_b.id: - print("Dataset items #%s '%s' '%s' do not match" % \ - 
(i + 1, item_a.id, item_b.id)) - continue + ids_a = set((item.id, item.subset) for item in extractor_a) + ids_b = set((item.id, item.subset) for item in extractor_b) + ids = ids_a & ids_b + + if len(ids) != len(ids_a): + print("Unmatched items in the first dataset: ") + print(ids_a - ids) + if len(ids) != len(ids_b): + print("Unmatched items in the second dataset: ") + print(ids_b - ids) + + for item_id, item_subset in ids: + item_a = extractor_a.get(item_id, item_subset) + item_b = extractor_b.get(item_id, item_subset) label_diff = self.comparator.compare_item_labels(item_a, item_b) self.update_label_confusion(label_diff) diff --git a/datumaro/datumaro/components/project.py b/datumaro/datumaro/components/project.py index c99b514f..b8273cd5 100644 --- a/datumaro/datumaro/components/project.py +++ b/datumaro/datumaro/components/project.py @@ -611,33 +611,39 @@ class ProjectDataset(Dataset): project.config.remove('sources') save_dir = osp.abspath(save_dir) - os.makedirs(save_dir, exist_ok=True) - dataset_save_dir = osp.join(save_dir, project.config.dataset_dir) - os.makedirs(dataset_save_dir, exist_ok=True) converter_kwargs = { 'save_images': save_images, } - if merge: - # merge and save the resulting dataset - converter = self.env.make_converter( - DEFAULT_FORMAT, **converter_kwargs) - converter(self, dataset_save_dir) - else: - if recursive: - # children items should already be updated - # so we just save them recursively - for source in self._sources.values(): - if isinstance(source, ProjectDataset): - source.save(**converter_kwargs) - - converter = self.env.make_converter( - DEFAULT_FORMAT, **converter_kwargs) - converter(self.iterate_own(), dataset_save_dir) + save_dir_existed = osp.exists(save_dir) + try: + os.makedirs(save_dir, exist_ok=True) + os.makedirs(dataset_save_dir, exist_ok=True) - project.save(save_dir) + if merge: + # merge and save the resulting dataset + converter = self.env.make_converter( + DEFAULT_FORMAT, **converter_kwargs) + converter(self, 
dataset_save_dir) + else: + if recursive: + # children items should already be updated + # so we just save them recursively + for source in self._sources.values(): + if isinstance(source, ProjectDataset): + source.save(**converter_kwargs) + + converter = self.env.make_converter( + DEFAULT_FORMAT, **converter_kwargs) + converter(self.iterate_own(), dataset_save_dir) + + project.save(save_dir) + except BaseException: + if not save_dir_existed and osp.isdir(save_dir): + shutil.rmtree(save_dir, ignore_errors=True) + raise @property def env(self): @@ -705,7 +711,7 @@ class ProjectDataset(Dataset): try: os.makedirs(save_dir, exist_ok=True) converter(dataset, save_dir) - except Exception: + except BaseException: if not save_dir_existed: shutil.rmtree(save_dir) raise @@ -750,7 +756,7 @@ class Project: config_path = osp.join(save_dir, config.project_filename) config.dump(config_path) - except Exception: + except BaseException: if not env_dir_existed: shutil.rmtree(save_dir, ignore_errors=True) if not project_dir_existed: diff --git a/datumaro/datumaro/plugins/transforms.py b/datumaro/datumaro/plugins/transforms.py index 9d1baeda..520d6afd 100644 --- a/datumaro/datumaro/plugins/transforms.py +++ b/datumaro/datumaro/plugins/transforms.py @@ -7,6 +7,7 @@ from enum import Enum import logging as log import os.path as osp import random +import re import pycocotools.mask as mask_utils @@ -372,6 +373,40 @@ class IdFromImageName(Transform, CliPlugin): "item has no image info" % item.id) return item +class Rename(Transform, CliPlugin): + """ + Renames items in the dataset. Supports regular expressions. + The first character in the expression is a delimiter for + the pattern and replacement parts. 
Replacement part can also + contain string.format tokens with 'item' object available.|n + |n + Examples:|n + - Replace 'pattern' with 'replacement':|n + |s|srename -e '|pattern|replacement|'|n + - Remove 'frame_' from item ids:|n + |s|srename -e '|frame_(\\d+)|\\1|' + """ + + @classmethod + def build_cmdline_parser(cls, **kwargs): + parser = super().build_cmdline_parser(**kwargs) + parser.add_argument('-e', '--regex', + help="Regex for renaming.") + return parser + + def __init__(self, extractor, regex): + super().__init__(extractor) + + assert regex and isinstance(regex, str) + parts = regex.split(regex[0], maxsplit=3) + regex, sub = parts[1:3] + self._re = re.compile(regex) + self._sub = sub + + def transform_item(self, item): + return self.wrap_item(item, id=self._re.sub(self._sub, item.id) \ + .format(item=item)) + class RemapLabels(Transform, CliPlugin): DefaultAction = Enum('DefaultAction', ['keep', 'delete'])