[Datumaro] Convert command (#1837)

* Add convert command

* fixes

* update readme

* update changelog

Co-authored-by: Andrey Zhavoronkov <41117609+azhavoro@users.noreply.github.com>
main
zhiltsov-max 6 years ago committed by GitHub
parent 0b1c3a31dd
commit 575c93ff2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -54,6 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added new tag annotation workspace (<https://github.com/opencv/cvat/pull/1570>)
- Appearance block in attribute annotation mode (<https://github.com/opencv/cvat/pull/1820>)
- Keyframe navigations and some switchers in attribute annotation mode (<https://github.com/opencv/cvat/pull/1820>)
- [Datumaro] Added `convert` command to convert datasets directly (<https://github.com/opencv/cvat/pull/1837>)
### Changed
- Removed information about e-mail from the basic user information (<https://github.com/opencv/cvat/pull/1627>)

@@ -133,8 +133,8 @@ project = Project.load('directory')
```bash
# Download VOC dataset:
# http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
datum project import --format voc --input-path <path/to/voc>
datum project export --format coco --filter '/item[annotation/label="cat"]'
datum convert --input-format voc --input-path <path/to/voc> \
--output-format coco --filter '/item[annotation/label="cat"]'
```
- Convert only non-occluded annotations from a CVAT-annotated project to TFrecord:
@@ -161,8 +161,8 @@ project = Project.load('directory')
- Annotate instance polygons in CVAT, export as masks in COCO:
```bash
datum project import --format cvat --input-path <path/to/cvat.xml>
datum project export --format coco -- --segmentation-mode masks
datum convert --input-format cvat --input-path <path/to/cvat.xml> \
--output-format coco -- --segmentation-mode masks
```
- Apply an OpenVINO detection model to some COCO-like dataset,

@@ -68,6 +68,7 @@ def make_parser():
('remove', commands.remove, "Remove source from project"),
('export', commands.export, "Export project"),
('explain', commands.explain, "Run Explainable AI algorithm for model"),
('convert', commands.convert, "Convert dataset"),
]
# Argparse doesn't support subparser groups:

@@ -3,4 +3,4 @@
#
# SPDX-License-Identifier: MIT
from . import add, create, explain, export, remove
from . import add, create, explain, export, remove, convert

@@ -0,0 +1,137 @@
# Copyright (C) 2019 Intel Corporation
#
# SPDX-License-Identifier: MIT
import argparse
import logging as log
import os
import os.path as osp
from datumaro.components.project import Environment
from ..contexts.project import FilterModes
from ..util import CliException, MultilineFormatter, make_file_name
from ..util.project import generate_next_dir_name
def build_parser(parser_ctor=argparse.ArgumentParser):
    """Create the CLI parser for the standalone 'convert' command.

    parser_ctor lets the caller supply a subparser factory
    (e.g. subparsers.add_parser) instead of a bare ArgumentParser,
    which is why the 'help' keyword is passed to it.
    """
    # Advertise the formats registered in the default environment.
    import_formats = ', '.join(sorted(Environment().importers.items))
    export_formats = ', '.join(sorted(Environment().converters.items))

    parser = parser_ctor(help="Convert an existing dataset to another format",
        description="""
        Converts a dataset from one format to another.
        You can add your own formats using a project.|n
        |n
        Supported input formats: %s|n
        |n
        Supported output formats: %s|n
        |n
        Examples:|n
        - Export a dataset as a PASCAL VOC dataset, include images:|n
        |s|sconvert -i src/path -f voc -- --save-images|n
        |n
        - Export a dataset as a COCO dataset to a specific directory:|n
        |s|sconvert -i src/path -f coco -o path/I/like/
        """ % (import_formats, export_formats),
        formatter_class=MultilineFormatter)

    # Input and output selection.
    parser.add_argument('-i', '--input-path', default='.', dest='source',
        help="Path to look for a dataset")
    parser.add_argument('-if', '--input-format',
        help="Input dataset format. Will try to detect, if not specified.")
    parser.add_argument('-f', '--output-format', required=True,
        help="Output format")
    parser.add_argument('-o', '--output-dir', dest='dst_dir',
        help="Directory to save output (default: a subdir in the current one)")
    parser.add_argument('--overwrite', action='store_true',
        help="Overwrite existing files in the save directory")

    # Optional dataset item filtering.
    parser.add_argument('-e', '--filter',
        help="Filter expression for dataset items")
    parser.add_argument('--filter-mode', default=FilterModes.i.name,
        type=FilterModes.parse,
        help="Filter mode (options: %s; default: %s)" % \
            (', '.join(FilterModes.list_options()) , '%(default)s'))

    # Everything after '--' is forwarded to the output format plugin.
    parser.add_argument('extra_args', nargs=argparse.REMAINDER,
        help="Additional arguments for output format (pass '-- -h' for help)")

    parser.set_defaults(command=convert_command)

    return parser
def convert_command(args):
    """Convert a dataset from one format to another.

    Uses args.input_format when given, otherwise probes every registered
    importer against args.source to auto-detect the format. The converted
    dataset is written to args.dst_dir (or a generated subdirectory).

    Returns:
        0 on success, 1 when the input format could not be detected,
        2 when detection is ambiguous.

    Raises:
        CliException: unknown input/output format, or non-empty output
            directory without --overwrite.
    """
    env = Environment()

    # Resolve the output converter; unknown names are a user error.
    try:
        converter = env.converters.get(args.output_format)
    except KeyError:
        raise CliException("Converter for format '%s' is not found" % \
            args.output_format)
    # Converters may parse their own trailing CLI args (after '--').
    if hasattr(converter, 'from_cmdline'):
        extra_args = converter.from_cmdline(args.extra_args)
        converter = converter(**extra_args)

    filter_args = FilterModes.make_filter_args(args.filter_mode)

    if not args.input_format:
        # Auto-detection: probe every known importer against the source.
        matches = []
        for format_name in env.importers.items:
            log.debug("Checking '%s' format...", format_name)
            importer = env.make_importer(format_name)
            try:
                match = importer.detect(args.source)
                if match:
                    log.debug("format matched")
                    matches.append((format_name, importer))
            except NotImplementedError:
                log.debug("Format '%s' does not support auto detection.",
                    format_name)

        if len(matches) == 0:
            log.error("Failed to detect dataset format. "
                "Try to specify format with '-if/--input-format' parameter.")
            return 1
        elif len(matches) != 1:
            log.error("Multiple formats match the dataset: %s. "
                "Try to specify format with '-if/--input-format' parameter.",
                ', '.join(m[0] for m in matches))
            return 2

        format_name, importer = matches[0]
        args.input_format = format_name
        log.info("Source dataset format detected as '%s'", args.input_format)
    else:
        try:
            importer = env.make_importer(args.input_format)
        except KeyError:
            raise CliException("Importer for format '%s' is not found" % \
                args.input_format)
        # Fix: the original also called importer.from_cmdline() here with no
        # arguments and discarded the result — dead code that could fail at
        # runtime. Importer-specific extra args are not supported by this
        # command; only the converter receives args.extra_args (see above).

    source = osp.abspath(args.source)

    dst_dir = args.dst_dir
    if dst_dir:
        # Refuse to clobber a non-empty directory unless --overwrite is set.
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        # Default output: a fresh '<source-name>-<format>' subdirectory.
        dst_dir = generate_next_dir_name('%s-%s' % \
            (osp.basename(source), make_file_name(args.output_format)))
    dst_dir = osp.abspath(dst_dir)

    project = importer(source)
    dataset = project.make_dataset()

    log.info("Exporting the dataset")
    dataset.export_project(
        save_dir=dst_dir,
        converter=converter,
        filter_expr=args.filter,
        **filter_args)

    log.info("Dataset exported to '%s' as '%s'" % \
        (dst_dir, args.output_format))

    return 0
Loading…
Cancel
Save