[Datumaro] CLI updates (#1824)

* updates * diff fix
6 years ago · c6b3c797ab
parent 575c93ff2c
commit c6b3c797ab
4 changed files with 88 additions and 32 deletions
--- a/datumaro/datumaro/cli/contexts/project/__init__.py
+++ b/datumaro/datumaro/cli/contexts/project/__init__.py
@@ -550,11 +550,17 @@ def diff_command(args):
    dst_dir = osp.abspath(dst_dir)
    log.info("Saving diff to '%s'" % dst_dir)

-    visualizer = DiffVisualizer(save_dir=dst_dir, comparator=comparator,
-        output_format=args.format)
-    visualizer.save_dataset_diff(
-        first_project.make_dataset(),
-        second_project.make_dataset())
+    dst_dir_existed = osp.exists(dst_dir)
+    try:
+        visualizer = DiffVisualizer(save_dir=dst_dir, comparator=comparator,
+            output_format=args.format)
+        visualizer.save_dataset_diff(
+            first_project.make_dataset(),
+            second_project.make_dataset())
+    except BaseException:
+        if not dst_dir_existed and osp.isdir(dst_dir):
+            shutil.rmtree(dst_dir, ignore_errors=True)
+        raise

    return 0

--- a/datumaro/datumaro/cli/contexts/project/diff.py
+++ b/datumaro/datumaro/cli/contexts/project/diff.py
@@ -83,11 +83,20 @@ class DiffVisualizer:
        if self.output_format is Format.tensorboard:
            self.file_writer.reopen()

-        for i, (item_a, item_b) in enumerate(zip(extractor_a, extractor_b)):
-            if item_a.id != item_b.id or not item_a.id or not item_b.id:
-                print("Dataset items #%s '%s' '%s' do not match" % \
-                    (i + 1, item_a.id, item_b.id))
-                continue
+        ids_a = set((item.id, item.subset) for item in extractor_a)
+        ids_b = set((item.id, item.subset) for item in extractor_b)
+        ids = ids_a & ids_b
+
+        if len(ids) != len(ids_a):
+            print("Unmatched items in the first dataset: ")
+            print(ids_a - ids)
+        if len(ids) != len(ids_b):
+            print("Unmatched items in the second dataset: ")
+            print(ids_b - ids)
+
+        for item_id, item_subset in ids:
+            item_a = extractor_a.get(item_id, item_subset)
+            item_b = extractor_a.get(item_id, item_subset)

            label_diff = self.comparator.compare_item_labels(item_a, item_b)
            self.update_label_confusion(label_diff)
--- a/datumaro/datumaro/components/project.py
+++ b/datumaro/datumaro/components/project.py
@@ -611,33 +611,39 @@ class ProjectDataset(Dataset):
            project.config.remove('sources')

        save_dir = osp.abspath(save_dir)
-        os.makedirs(save_dir, exist_ok=True)
-
        dataset_save_dir = osp.join(save_dir, project.config.dataset_dir)
-        os.makedirs(dataset_save_dir, exist_ok=True)

        converter_kwargs = {
            'save_images': save_images,
        }

-        if merge:
-            # merge and save the resulting dataset
-            converter = self.env.make_converter(
-                DEFAULT_FORMAT, **converter_kwargs)
-            converter(self, dataset_save_dir)
-        else:
-            if recursive:
-                # children items should already be updated
-                # so we just save them recursively
-                for source in self._sources.values():
-                    if isinstance(source, ProjectDataset):
-                        source.save(**converter_kwargs)
-
-            converter = self.env.make_converter(
-                DEFAULT_FORMAT, **converter_kwargs)
-            converter(self.iterate_own(), dataset_save_dir)
+        save_dir_existed = osp.exists(save_dir)
+        try:
+            os.makedirs(save_dir, exist_ok=True)
+            os.makedirs(dataset_save_dir, exist_ok=True)

-        project.save(save_dir)
+            if merge:
+                # merge and save the resulting dataset
+                converter = self.env.make_converter(
+                    DEFAULT_FORMAT, **converter_kwargs)
+                converter(self, dataset_save_dir)
+            else:
+                if recursive:
+                    # children items should already be updated
+                    # so we just save them recursively
+                    for source in self._sources.values():
+                        if isinstance(source, ProjectDataset):
+                            source.save(**converter_kwargs)
+
+                converter = self.env.make_converter(
+                    DEFAULT_FORMAT, **converter_kwargs)
+                converter(self.iterate_own(), dataset_save_dir)
+
+            project.save(save_dir)
+        except BaseException:
+            if not save_dir_existed and osp.isdir(save_dir):
+                shutil.rmtree(save_dir, ignore_errors=True)
+            raise

    @property
    def env(self):
@@ -705,7 +711,7 @@ class ProjectDataset(Dataset):
        try:
            os.makedirs(save_dir, exist_ok=True)
            converter(dataset, save_dir)
-        except Exception:
+        except BaseException:
            if not save_dir_existed:
                shutil.rmtree(save_dir)
            raise
@@ -750,7 +756,7 @@ class Project:

            config_path = osp.join(save_dir, config.project_filename)
            config.dump(config_path)
-        except Exception:
+        except BaseException:
            if not env_dir_existed:
                shutil.rmtree(save_dir, ignore_errors=True)
            if not project_dir_existed:
--- a/datumaro/datumaro/plugins/transforms.py
+++ b/datumaro/datumaro/plugins/transforms.py
@@ -7,6 +7,7 @@ from enum import Enum
 import logging as log
 import os.path as osp
 import random
+import re

 import pycocotools.mask as mask_utils

@@ -372,6 +373,40 @@ class IdFromImageName(Transform, CliPlugin):
                "item has no image info" % item.id)
            return item

+class Rename(Transform, CliPlugin):
+    """
+    Renames items in the dataset. Supports regular expressions.
+    The first character in the expression is a delimiter for
+    the pattern and replacement parts. Replacement part can also
+    contain string.format tokens with 'item' object available.|n
+    |n
+    Examples:|n
+    - Replace 'pattern' with 'replacement':|n
+    |s|srename -e '|pattern|replacement|'|n
+    - Remove 'frame_' from item ids:|n
+    |s|srename -e '|frame_(\d+)|\\1|'
+    """
+
+    @classmethod
+    def build_cmdline_parser(cls, **kwargs):
+        parser = super().build_cmdline_parser(**kwargs)
+        parser.add_argument('-e', '--regex',
+            help="Regex for renaming.")
+        return parser
+
+    def __init__(self, extractor, regex):
+        super().__init__(extractor)
+
+        assert regex and isinstance(regex, str)
+        parts = regex.split(regex[0], maxsplit=3)
+        regex, sub = parts[1:3]
+        self._re = re.compile(regex)
+        self._sub = sub
+
+    def transform_item(self, item):
+        return self.wrap_item(item, id=self._re.sub(self._sub, item.id) \
+            .format(item=item))
+
 class RemapLabels(Transform, CliPlugin):
    DefaultAction = Enum('DefaultAction', ['keep', 'delete'])