[Datumaro] Convert command (#1837)

* Add convert command * fixes * update readme * update changelog Co-authored-by: Andrey Zhavoronkov <41117609+azhavoro@users.noreply.github.com>
6 years ago · 575c93ff2c
parent 0b1c3a31dd
commit 575c93ff2c
5 changed files with 144 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -54,6 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added new tag annotation workspace (<https://github.com/opencv/cvat/pull/1570>)
 - Appearance block in attribute annotation mode (<https://github.com/opencv/cvat/pull/1820>)
 - Keyframe navigations and some switchers in attribute annotation mode (<https://github.com/opencv/cvat/pull/1820>)
 - [Datumaro] Added `convert` command to convert datasets directly (<https://github.com/opencv/cvat/pull/1837>)
 ### Changed
 - Removed information about e-mail from the basic user information (<https://github.com/opencv/cvat/pull/1627>)
--- a/datumaro/README.md
+++ b/datumaro/README.md
@ -133,8 +133,8 @@ project = Project.load('directory')
  ```bash
  # Download VOC dataset:
  # http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
-  datum project import --format voc --input-path <path/to/voc>
+  datum convert --input-format voc --input-path <path/to/voc> \
-  datum project export --format coco --filter '/item[annotation/label="cat"]'
+                --output-format coco --filter '/item[annotation/label="cat"]'
  ```
 - Convert only non-occluded annotations from a CVAT-annotated project to TFrecord:
@ -161,8 +161,8 @@ project = Project.load('directory')
 - Annotate instance polygons in CVAT, export as masks in COCO:
  ```bash
-  datum project import --format cvat --input-path <path/to/cvat.xml>
+  datum convert --input-format cvat --input-path <path/to/cvat.xml> \
-  datum project export --format coco -- --segmentation-mode masks
+                --output-format coco -- --segmentation-mode masks
  ```
 - Apply an OpenVINO detection model to some COCO-like dataset,
--- a/datumaro/datumaro/cli/main.py
+++ b/datumaro/datumaro/cli/main.py
@ -68,6 +68,7 @@ def make_parser():
        ('remove', commands.remove, "Remove source from project"),
        ('export', commands.export, "Export project"),
        ('explain', commands.explain, "Run Explainable AI algorithm for model"),
        ('convert', commands.convert, "Convert dataset"),
    ]
    # Argparse doesn't support subparser groups:
--- a/datumaro/datumaro/cli/commands/init.py
+++ b/datumaro/datumaro/cli/commands/init.py
@ -3,4 +3,4 @@
 #
 # SPDX-License-Identifier: MIT
-from . import add, create, explain, export, remove
+from . import add, create, explain, export, remove, convert
--- a/datumaro/datumaro/cli/commands/convert.py
+++ b/datumaro/datumaro/cli/commands/convert.py
@ -0,0 +1,137 @@
 # Copyright (C) 2019 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 import argparse
 import logging as log
 import os
 import os.path as osp
 from datumaro.components.project import Environment
 from ..contexts.project import FilterModes
 from ..util import CliException, MultilineFormatter, make_file_name
 from ..util.project import generate_next_dir_name
 def build_parser(parser_ctor=argparse.ArgumentParser):
    """
    Builds the command-line parser for the 'convert' command.

    Parameters:
        parser_ctor: callable used to create the parser. It is invoked with
            the 'help', 'description' and 'formatter_class' keyword arguments.
            NOTE(review): 'help' is accepted by 'subparsers.add_parser' but
            not by a plain 'argparse.ArgumentParser', so callers are
            presumably expected to pass 'add_parser' here - confirm.

    Returns:
        The configured parser. Its 'command' default is set to
        'convert_command', and all arguments after the known options are
        collected into 'extra_args' for the output format.
    """
    # A single Environment is enough to list both plugin registries;
    # there is no need to construct it once per registry.
    env = Environment()
    builtin_importers = sorted(env.importers.items)
    builtin_converters = sorted(env.converters.items)

    parser = parser_ctor(help="Convert an existing dataset to another format",
        description="""
            Converts a dataset from one format to another.
            You can add your own formats using a project.|n
            |n
            Supported input formats: %s|n
            |n
            Supported output formats: %s|n
            |n
            Examples:|n
            - Export a dataset as a PASCAL VOC dataset, include images:|n
            |s|sconvert -i src/path -f voc -- --save-images|n
            |n
            - Export a dataset as a COCO dataset to a specific directory:|n
            |s|sconvert -i src/path -f coco -o path/I/like/
        """ % (', '.join(builtin_importers), ', '.join(builtin_converters)),
        formatter_class=MultilineFormatter)

    # Input options
    parser.add_argument('-i', '--input-path', default='.', dest='source',
        help="Path to look for a dataset")
    parser.add_argument('-if', '--input-format',
        help="Input dataset format. Will try to detect, if not specified.")

    # Output options
    parser.add_argument('-f', '--output-format', required=True,
        help="Output format")
    parser.add_argument('-o', '--output-dir', dest='dst_dir',
        help="Directory to save output (default: a subdir in the current one)")
    parser.add_argument('--overwrite', action='store_true',
        help="Overwrite existing files in the save directory")

    # Filtering options
    parser.add_argument('-e', '--filter',
        help="Filter expression for dataset items")
    parser.add_argument('--filter-mode', default=FilterModes.i.name,
        type=FilterModes.parse,
        help="Filter mode (options: %s; default: %s)" % \
            (', '.join(FilterModes.list_options()) , '%(default)s'))

    # Everything that is left is handed over to the output format
    parser.add_argument('extra_args', nargs=argparse.REMAINDER,
        help="Additional arguments for output format (pass '-- -h' for help)")

    parser.set_defaults(command=convert_command)
    return parser
 def convert_command(args):
    """
    Converts a dataset from one format to another.

    Reads the following attributes of 'args': 'output_format', 'extra_args',
    'filter_mode', 'filter', 'input_format', 'source', 'dst_dir' and
    'overwrite'. When the input format is detected automatically, it is
    written back to 'args.input_format'.

    Returns:
        0 - the dataset was exported
        1 - the input format was not given and none could be detected
        2 - the input format was not given and several formats matched

    Raises:
        CliException - the requested converter or importer is not found,
            or the output directory is given, is not empty and
            '--overwrite' is not passed.
    """
    env = Environment()

    try:
        converter = env.converters.get(args.output_format)
    except KeyError as e:
        raise CliException("Converter for format '%s' is not found" % \
            args.output_format) from e

    if hasattr(converter, 'from_cmdline'):
        # Format-specific options come from the trailing command line part
        converter_args = converter.from_cmdline(args.extra_args)
        converter = converter(**converter_args)

    filter_args = FilterModes.make_filter_args(args.filter_mode)

    if not args.input_format:
        # Ask every registered importer whether it recognizes the source
        matches = []
        for format_name in env.importers.items:
            log.debug("Checking '%s' format...", format_name)
            importer = env.make_importer(format_name)
            try:
                match = importer.detect(args.source)
                if match:
                    log.debug("format matched")
                    matches.append((format_name, importer))
            except NotImplementedError:
                log.debug("Format '%s' does not support auto detection.",
                    format_name)

        if not matches:
            log.error("Failed to detect dataset format. "
                "Try to specify format with '-if/--input-format' parameter.")
            return 1
        if len(matches) > 1:
            log.error("Multiple formats match the dataset: %s. "
                "Try to specify format with '-if/--input-format' parameter.",
                ', '.join(m[0] for m in matches))
            return 2

        format_name, importer = matches[0]
        args.input_format = format_name
        log.info("Source dataset format detected as '%s'", args.input_format)
    else:
        # Only the lookup is guarded: a KeyError raised anywhere else
        # must not be reported as a missing importer.
        try:
            importer = env.make_importer(args.input_format)
        except KeyError as e:
            raise CliException("Importer for format '%s' is not found" % \
                args.input_format) from e

        if hasattr(importer, 'from_cmdline'):
            # NOTE(review): the parsed importer options were never used by
            # this command; the call is kept, but its result no longer
            # overwrites the converter options. Presumably they should be
            # forwarded to the importer - confirm.
            importer.from_cmdline()

    source = osp.abspath(args.source)

    dst_dir = args.dst_dir
    if dst_dir:
        # Refuse to write into a non-empty directory unless asked to
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_dir_name('%s-%s' % \
            (osp.basename(source), make_file_name(args.output_format)))
    dst_dir = osp.abspath(dst_dir)

    project = importer(source)
    dataset = project.make_dataset()

    log.info("Exporting the dataset")
    dataset.export_project(
        save_dir=dst_dir,
        converter=converter,
        filter_expr=args.filter,
        **filter_args)

    # Let the logging module do the formatting
    log.info("Dataset exported to '%s' as '%s'",
        dst_dir, args.output_format)

    return 0