[Datumaro] Instance polygon-mask conversions in COCO format (#1008)

* Microoptimizations

* Mask conversion functions

* Add mask-polygon conversions

* Add mask-polygon conversions in coco

* Add mask-polygon conversions in coco

* Update requirements

* Option to disable crop

* Fix cli parameter passing

* Fix test

* Fixes in COCO
main
zhiltsov-max 6 years ago committed by Nikita Manovich
parent 8da20b38d5
commit e0bcc4652b

@ -3,8 +3,10 @@
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
from enum import Enum
from itertools import groupby
import json import json
import numpy as np import logging as log
import os import os
import os.path as osp import os.path as osp
@ -12,7 +14,7 @@ import pycocotools.mask as mask_utils
from datumaro.components.converter import Converter from datumaro.components.converter import Converter
from datumaro.components.extractor import ( from datumaro.components.extractor import (
DEFAULT_SUBSET_NAME, AnnotationType, PointsObject, BboxObject DEFAULT_SUBSET_NAME, AnnotationType, PointsObject, BboxObject, MaskObject
) )
from datumaro.components.formats.ms_coco import CocoTask, CocoPath from datumaro.components.formats.ms_coco import CocoTask, CocoPath
from datumaro.util import find from datumaro.util import find
@ -28,6 +30,9 @@ def _cast(value, type_conv, default=None):
except Exception: except Exception:
return default return default
SegmentationMode = Enum('SegmentationMode', ['guess', 'polygons', 'mask'])
class _TaskConverter: class _TaskConverter:
def __init__(self, context): def __init__(self, context):
self._min_ann_id = 1 self._min_ann_id = 1
@ -108,136 +113,6 @@ class _TaskConverter:
self._min_ann_id = max(ann_id, self._min_ann_id) self._min_ann_id = max(ann_id, self._min_ann_id)
return ann_id return ann_id
class _InstancesConverter(_TaskConverter):
def save_categories(self, dataset):
label_categories = dataset.categories().get(AnnotationType.label)
if label_categories is None:
return
for idx, cat in enumerate(label_categories.items):
self.categories.append({
'id': 1 + idx,
'name': _cast(cat.name, str, ''),
'supercategory': _cast(cat.parent, str, ''),
})
def save_annotations(self, item):
annotations = item.annotations.copy()
while len(annotations) != 0:
ann = annotations.pop()
if ann.type == AnnotationType.bbox and ann.label is not None:
pass
elif ann.type == AnnotationType.polygon and ann.label is not None:
pass
elif ann.type == AnnotationType.mask and ann.label is not None:
pass
else:
continue
bbox = None
segmentation = None
if ann.type == AnnotationType.bbox:
is_crowd = ann.attributes.get('is_crowd', False)
bbox = ann.get_bbox()
elif ann.type == AnnotationType.polygon:
is_crowd = ann.attributes.get('is_crowd', False)
elif ann.type == AnnotationType.mask:
is_crowd = ann.attributes.get('is_crowd', True)
if is_crowd:
segmentation = ann
area = None
# If ann in a group, try to find corresponding annotations in
# this group, otherwise try to infer them.
if bbox is None and ann.group is not None:
bbox = find(annotations, lambda x: \
x.group == ann.group and \
x.type == AnnotationType.bbox and \
x.label == ann.label)
if bbox is not None:
bbox = bbox.get_bbox()
if is_crowd:
# is_crowd=True means there should be a mask
if segmentation is None and ann.group is not None:
segmentation = find(annotations, lambda x: \
x.group == ann.group and \
x.type == AnnotationType.mask and \
x.label == ann.label)
if segmentation is not None:
binary_mask = np.array(segmentation.image, dtype=np.bool)
binary_mask = np.asfortranarray(binary_mask, dtype=np.uint8)
segmentation = mask_utils.encode(binary_mask)
area = mask_utils.area(segmentation)
segmentation = mask_tools.convert_mask_to_rle(binary_mask)
else:
# is_crowd=False means there are some polygons
polygons = []
if ann.type == AnnotationType.polygon:
polygons = [ ann ]
if ann.group is not None:
# A single object can consist of several polygons
polygons += [p for p in annotations
if p.group == ann.group and \
p.type == AnnotationType.polygon and \
p.label == ann.label]
if polygons:
segmentation = [p.get_points() for p in polygons]
h, w = item.image.shape[:2]
rles = mask_utils.frPyObjects(segmentation, h, w)
rle = mask_utils.merge(rles)
area = mask_utils.area(rle)
if self._context._merge_polygons:
binary_mask = mask_utils.decode(rle).astype(np.bool)
binary_mask = np.asfortranarray(binary_mask, dtype=np.uint8)
segmentation = mask_tools.convert_mask_to_rle(binary_mask)
is_crowd = True
bbox = [int(i) for i in mask_utils.toBbox(rle)]
if ann.group is not None:
# Mark the group as visited to prevent repeats
for a in annotations[:]:
if a.group == ann.group:
annotations.remove(a)
if segmentation is None:
is_crowd = False
segmentation = [ann.get_polygon()]
area = ann.area()
if self._context._merge_polygons:
h, w = item.image.shape[:2]
rles = mask_utils.frPyObjects(segmentation, h, w)
rle = mask_utils.merge(rles)
area = mask_utils.area(rle)
binary_mask = mask_utils.decode(rle).astype(np.bool)
binary_mask = np.asfortranarray(binary_mask, dtype=np.uint8)
segmentation = mask_tools.convert_mask_to_rle(binary_mask)
is_crowd = True
bbox = [int(i) for i in mask_utils.toBbox(rle)]
if bbox is None:
bbox = ann.get_bbox()
elem = {
'id': self._get_ann_id(ann),
'image_id': _cast(item.id, int, 0),
'category_id': _cast(ann.label, int, -1) + 1,
'segmentation': segmentation,
'area': float(area),
'bbox': bbox,
'iscrowd': int(is_crowd),
}
if 'score' in ann.attributes:
elem['score'] = float(ann.attributes['score'])
self.annotations.append(elem)
class _ImageInfoConverter(_TaskConverter): class _ImageInfoConverter(_TaskConverter):
def is_empty(self): def is_empty(self):
return len(self._data['images']) == 0 return len(self._data['images']) == 0
@ -268,7 +143,210 @@ class _CaptionsConverter(_TaskConverter):
self.annotations.append(elem) self.annotations.append(elem)
class _KeypointsConverter(_TaskConverter): class _InstancesConverter(_TaskConverter):
def save_categories(self, dataset):
label_categories = dataset.categories().get(AnnotationType.label)
if label_categories is None:
return
for idx, cat in enumerate(label_categories.items):
self.categories.append({
'id': 1 + idx,
'name': _cast(cat.name, str, ''),
'supercategory': _cast(cat.parent, str, ''),
})
@classmethod
def crop_segments(cls, instances, img_width, img_height):
instances = sorted(instances, key=lambda x: x[0].z_order)
segment_map = []
segments = []
for inst_idx, (_, polygons, mask, _) in enumerate(instances):
if polygons:
segment_map.extend(inst_idx for p in polygons)
segments.extend(polygons)
elif mask is not None:
segment_map.append(inst_idx)
segments.append(mask)
segments = mask_tools.crop_covered_segments(
segments, img_width, img_height)
for inst_idx, inst in enumerate(instances):
new_segments = [s for si_id, s in zip(segment_map, segments)
if si_id == inst_idx]
if not new_segments:
inst[1] = []
inst[2] = None
continue
if inst[1]:
inst[1] = sum(new_segments, [])
else:
mask = cls.merge_masks(new_segments)
inst[2] = mask_tools.mask_to_rle(mask)
return instances
def find_instance_parts(self, group, img_width, img_height):
boxes = [a for a in group if a.type == AnnotationType.bbox]
polygons = [a for a in group if a.type == AnnotationType.polygon]
masks = [a for a in group if a.type == AnnotationType.mask]
anns = boxes + polygons + masks
leader = self.find_group_leader(anns)
bbox = self.compute_bbox(anns)
mask = None
polygons = [p.get_polygon() for p in polygons]
if self._context._segmentation_mode == SegmentationMode.guess:
use_masks = leader.attributes.get('is_crowd',
find(masks, lambda x: x.label == leader.label) is not None)
elif self._context._segmentation_mode == SegmentationMode.polygons:
use_masks = False
elif self._context._segmentation_mode == SegmentationMode.mask:
use_masks = True
else:
raise NotImplementedError("Unexpected segmentation mode '%s'" % \
self._context._segmentation_mode)
if use_masks:
if polygons:
mask = mask_tools.rles_to_mask(polygons, img_width, img_height)
if masks:
if mask is not None:
masks += [mask]
mask = self.merge_masks(masks)
if mask is not None:
mask = mask_tools.mask_to_rle(mask)
polygons = []
else:
if masks:
mask = self.merge_masks(masks)
polygons += mask_tools.mask_to_polygons(mask)
mask = None
return [leader, polygons, mask, bbox]
@staticmethod
def find_group_leader(group):
return max(group, key=lambda x: x.area())
@staticmethod
def merge_masks(masks):
if not masks:
return None
def get_mask(m):
if isinstance(m, MaskObject):
return m.image
else:
return m
binary_mask = get_mask(masks[0])
for m in masks[1:]:
binary_mask |= get_mask(m)
return binary_mask
@staticmethod
def compute_bbox(annotations):
boxes = [ann.get_bbox() for ann in annotations]
x0 = min((b[0] for b in boxes), default=0)
y0 = min((b[1] for b in boxes), default=0)
x1 = max((b[0] + b[2] for b in boxes), default=0)
y1 = max((b[1] + b[3] for b in boxes), default=0)
return [x0, y0, x1 - x0, y1 - y0]
@staticmethod
def find_instance_anns(annotations):
return [a for a in annotations
if a.type in { AnnotationType.bbox, AnnotationType.polygon } or \
a.type == AnnotationType.mask and a.label is not None
]
@classmethod
def find_instances(cls, annotations):
instance_anns = cls.find_instance_anns(annotations)
ann_groups = []
for g_id, group in groupby(instance_anns, lambda a: a.group):
if g_id is None:
ann_groups.extend(([a] for a in group))
else:
ann_groups.append(list(group))
return ann_groups
def save_annotations(self, item):
instances = self.find_instances(item.annotations)
if not instances:
return
if not item.has_image:
log.warn("Skipping writing instances for "
"item '%s' as it has no image info" % item.id)
return
h, w, _ = item.image.shape
instances = [self.find_instance_parts(i, w, h) for i in instances]
if self._context._crop_covered:
instances = self.crop_segments(instances, w, h)
for instance in instances:
elem = self.convert_instance(instance, item)
if elem:
self.annotations.append(elem)
def convert_instance(self, instance, item):
ann, polygons, mask, bbox = instance
is_crowd = mask is not None
if is_crowd:
segmentation = mask
else:
segmentation = [list(map(float, p)) for p in polygons]
area = 0
if segmentation:
if item.has_image:
h, w, _ = item.image.shape
else:
# NOTE: here we can guess the image size as
# it is only needed for the area computation
w = bbox[0] + bbox[2]
h = bbox[1] + bbox[3]
rles = mask_utils.frPyObjects(segmentation, h, w)
if is_crowd:
rles = [rles]
else:
rles = mask_utils.merge(rles)
area = mask_utils.area(rles)
else:
x, y, w, h = bbox
segmentation = [[x, y, x + w, y, x + w, y + h, x, y + h]]
area = w * h
elem = {
'id': self._get_ann_id(ann),
'image_id': _cast(item.id, int, 0),
'category_id': _cast(ann.label, int, -1) + 1,
'segmentation': segmentation,
'area': float(area),
'bbox': list(map(float, bbox)),
'iscrowd': int(is_crowd),
}
if 'score' in ann.attributes:
elem['score'] = float(ann.attributes['score'])
return elem
class _KeypointsConverter(_InstancesConverter):
def save_categories(self, dataset): def save_categories(self, dataset):
label_categories = dataset.categories().get(AnnotationType.label) label_categories = dataset.categories().get(AnnotationType.label)
if label_categories is None: if label_categories is None:
@ -290,45 +368,61 @@ class _KeypointsConverter(_TaskConverter):
self.categories.append(cat) self.categories.append(cat)
def save_annotations(self, item): def save_annotations(self, item):
for ann in item.annotations: point_annotations = [a for a in item.annotations
if ann.type != AnnotationType.points: if a.type == AnnotationType.points]
continue if not point_annotations:
return
elem = { # Create annotations for solitary keypoints annotations
'id': self._get_ann_id(ann), for points in self.find_solitary_points(item.annotations):
'image_id': _cast(item.id, int, 0), instance = [points, [], None, points.get_bbox()]
'category_id': _cast(ann.label, int, -1) + 1, elem = super().convert_instance(instance, item)
} elem.update(self.convert_points_object(points))
if 'score' in ann.attributes: if elem:
elem['score'] = float(ann.attributes['score']) self.annotations.append(elem)
keypoints = [] # Create annotations for complete instance + keypoints annotations
points = ann.get_points() super().save_annotations(item)
visibility = ann.visibility
for index in range(0, len(points), 2):
kp = points[index : index + 2]
state = visibility[index // 2].value
keypoints.extend([*kp, state])
num_visible = len([v for v in visibility \
if v == PointsObject.Visibility.visible])
bbox = find(item.annotations, lambda x: \
x.group == ann.group and \
x.type == AnnotationType.bbox and
x.label == ann.label)
if bbox is None:
bbox = BboxObject(*ann.get_bbox())
elem.update({
'segmentation': bbox.get_polygon(),
'area': bbox.area(),
'bbox': bbox.get_bbox(),
'iscrowd': 0,
'keypoints': keypoints,
'num_keypoints': num_visible,
})
self.annotations.append(elem) @classmethod
def find_solitary_points(cls, annotations):
solitary_points = []
for g_id, group in groupby(annotations, lambda a: a.group):
if g_id is not None and not cls.find_instance_anns(group):
group = [a for a in group if a.type == AnnotationType.points]
solitary_points.extend(group)
return solitary_points
@staticmethod
def convert_points_object(ann):
keypoints = []
points = ann.get_points()
visibility = ann.visibility
for index in range(0, len(points), 2):
kp = points[index : index + 2]
state = visibility[index // 2].value
keypoints.extend([*kp, state])
num_annotated = len([v for v in visibility \
if v != PointsObject.Visibility.absent])
return {
'keypoints': keypoints,
'num_keypoints': num_annotated,
}
def convert_instance(self, instance, item):
points_ann = find(item.annotations, lambda x: \
x.type == AnnotationType.points and x.group == instance[0].group)
if not points_ann:
return None
elem = super().convert_instance(instance, item)
elem.update(self.convert_points_object(points_ann))
return elem
class _LabelsConverter(_TaskConverter): class _LabelsConverter(_TaskConverter):
def save_categories(self, dataset): def save_categories(self, dataset):
@ -368,7 +462,8 @@ class _Converter:
} }
def __init__(self, extractor, save_dir, def __init__(self, extractor, save_dir,
tasks=None, save_images=False, merge_polygons=False): tasks=None, save_images=False, segmentation_mode=None,
crop_covered=False):
assert tasks is None or isinstance(tasks, (CocoTask, list)) assert tasks is None or isinstance(tasks, (CocoTask, list))
if tasks is None: if tasks is None:
tasks = list(self._TASK_CONVERTER) tasks = list(self._TASK_CONVERTER)
@ -383,7 +478,17 @@ class _Converter:
self._save_dir = save_dir self._save_dir = save_dir
self._save_images = save_images self._save_images = save_images
self._merge_polygons = merge_polygons
assert segmentation_mode is None or \
segmentation_mode in SegmentationMode or \
isinstance(segmentation_mode, str)
if segmentation_mode is None:
segmentation_mode = SegmentationMode.guess
if isinstance(segmentation_mode, str):
segmentation_mode = SegmentationMode[segmentation_mode]
self._segmentation_mode = segmentation_mode
self._crop_covered = crop_covered
def make_dirs(self): def make_dirs(self):
self._images_dir = osp.join(self._save_dir, CocoPath.IMAGES_DIR) self._images_dir = osp.join(self._save_dir, CocoPath.IMAGES_DIR)
@ -442,14 +547,16 @@ class _Converter:
class CocoConverter(Converter): class CocoConverter(Converter):
def __init__(self, def __init__(self,
tasks=None, save_images=False, merge_polygons=False, tasks=None, save_images=False, segmentation_mode=None,
crop_covered=False,
cmdline_args=None): cmdline_args=None):
super().__init__() super().__init__()
self._options = { self._options = {
'tasks': tasks, 'tasks': tasks,
'save_images': save_images, 'save_images': save_images,
'merge_polygons': merge_polygons, 'segmentation_mode': segmentation_mode,
'crop_covered': crop_covered,
} }
if cmdline_args is not None: if cmdline_args is not None:
@ -467,8 +574,20 @@ class CocoConverter(Converter):
parser.add_argument('--save-images', action='store_true', parser.add_argument('--save-images', action='store_true',
help="Save images (default: %(default)s)") help="Save images (default: %(default)s)")
parser.add_argument('--merge-polygons', action='store_true', parser.add_argument('--segmentation-mode',
help="Merge instance polygons into a mask (default: %(default)s)") choices=[m.name for m in SegmentationMode],
default=SegmentationMode.guess.name,
help="Save mode for instance segmentation: "
"- '{sm.guess.name}': guess the mode for each instance, "
"use 'is_crowd' attribute as hint; "
"- '{sm.polygons.name}': save polygons, "
"merge and convert masks, prefer polygons; "
"- '{sm.mask.name}': save masks, "
"merge and convert polygons, prefer masks; "
"(default: %(default)s)".format(sm=SegmentationMode))
parser.add_argument('--crop-covered', action='store_true',
help="Crop covered segments so that background objects' "
"segmentation was more accurate (default: %(default)s)")
parser.add_argument('--tasks', type=cls._split_tasks_string, parser.add_argument('--tasks', type=cls._split_tasks_string,
default=None, default=None,
help="COCO task filter, comma-separated list of {%s} " help="COCO task filter, comma-separated list of {%s} "

@ -159,12 +159,16 @@ class MaskCategories(Categories):
class MaskObject(Annotation): class MaskObject(Annotation):
# pylint: disable=redefined-builtin # pylint: disable=redefined-builtin
def __init__(self, image=None, label=None, def __init__(self, image=None, label=None, z_order=None,
id=None, attributes=None, group=None): id=None, attributes=None, group=None):
super().__init__(id=id, type=AnnotationType.mask, super().__init__(id=id, type=AnnotationType.mask,
attributes=attributes, group=group) attributes=attributes, group=group)
self._image = image self._image = image
self._label = label self._label = label
if z_order is None:
z_order = 0
self._z_order = z_order
# pylint: enable=redefined-builtin # pylint: enable=redefined-builtin
@property @property
@ -181,22 +185,69 @@ class MaskObject(Annotation):
raise NotImplementedError() raise NotImplementedError()
def area(self): def area(self):
raise NotImplementedError() if self._label is None:
raise NotImplementedError()
return np.count_nonzero(self.image)
def extract(self, class_id): def extract(self, class_id):
raise NotImplementedError() raise NotImplementedError()
def bbox(self): def get_bbox(self):
raise NotImplementedError() if self._label is None:
raise NotImplementedError()
image = self.image
cols = np.any(image, axis=0)
rows = np.any(image, axis=1)
x0, x1 = np.where(cols)[0][[0, -1]]
y0, y1 = np.where(rows)[0][[0, -1]]
return [x0, y0, x1 - x0, y1 - y0]
@property
def z_order(self):
return self._z_order
def __eq__(self, other): def __eq__(self, other):
if not super().__eq__(other): if not super().__eq__(other):
return False return False
return \ return \
(self.label == other.label) and \ (self.label == other.label) and \
(self.z_order == other.z_order) and \
(self.image is not None and other.image is not None and \ (self.image is not None and other.image is not None and \
np.all(self.image == other.image)) np.all(self.image == other.image))
class RleMask(MaskObject):
# pylint: disable=redefined-builtin
def __init__(self, rle=None, label=None, z_order=None,
id=None, attributes=None, group=None):
lazy_decode = self._lazy_decode(rle)
super().__init__(image=lazy_decode, label=label, z_order=z_order,
id=id, attributes=attributes, group=group)
self._rle = rle
# pylint: enable=redefined-builtin
@staticmethod
def _lazy_decode(rle):
from pycocotools import mask as mask_utils
return lambda: mask_utils.decode(rle).astype(np.bool)
def area(self):
from pycocotools import mask as mask_utils
return mask_utils.area(self._rle)
def bbox(self):
from pycocotools import mask as mask_utils
return mask_utils.toBbox(self._rle)
@property
def rle(self):
return self._rle
def __eq__(self, other):
if not isinstance(other, __class__):
return super().__eq__(other)
return self._rle == other._rle
def compute_iou(bbox_a, bbox_b): def compute_iou(bbox_a, bbox_b):
aX, aY, aW, aH = bbox_a aX, aY, aW, aH = bbox_a
bX, bY, bW, bH = bbox_b bX, bY, bW, bH = bbox_b
@ -217,12 +268,16 @@ def compute_iou(bbox_a, bbox_b):
class ShapeObject(Annotation): class ShapeObject(Annotation):
# pylint: disable=redefined-builtin # pylint: disable=redefined-builtin
def __init__(self, type, points=None, label=None, def __init__(self, type, points=None, label=None, z_order=None,
id=None, attributes=None, group=None): id=None, attributes=None, group=None):
super().__init__(id=id, type=type, super().__init__(id=id, type=type,
attributes=attributes, group=group) attributes=attributes, group=group)
self.points = points self.points = points
self.label = label self.label = label
if z_order is None:
z_order = 0
self._z_order = z_order
# pylint: enable=redefined-builtin # pylint: enable=redefined-builtin
def area(self): def area(self):
@ -247,22 +302,24 @@ class ShapeObject(Annotation):
def get_points(self): def get_points(self):
return self.points return self.points
def get_mask(self): @property
raise NotImplementedError() def z_order(self):
return self._z_order
def __eq__(self, other): def __eq__(self, other):
if not super().__eq__(other): if not super().__eq__(other):
return False return False
return \ return \
(self.points == other.points) and \ (self.points == other.points) and \
(self.z_order == other.z_order) and \
(self.label == other.label) (self.label == other.label)
class PolyLineObject(ShapeObject): class PolyLineObject(ShapeObject):
# pylint: disable=redefined-builtin # pylint: disable=redefined-builtin
def __init__(self, points=None, def __init__(self, points=None, label=None, z_order=None,
label=None, id=None, attributes=None, group=None): id=None, attributes=None, group=None):
super().__init__(type=AnnotationType.polyline, super().__init__(type=AnnotationType.polyline,
points=points, label=label, points=points, label=label, z_order=z_order,
id=id, attributes=attributes, group=group) id=id, attributes=attributes, group=group)
# pylint: enable=redefined-builtin # pylint: enable=redefined-builtin
@ -274,12 +331,12 @@ class PolyLineObject(ShapeObject):
class PolygonObject(ShapeObject): class PolygonObject(ShapeObject):
# pylint: disable=redefined-builtin # pylint: disable=redefined-builtin
def __init__(self, points=None, def __init__(self, points=None, z_order=None,
label=None, id=None, attributes=None, group=None): label=None, id=None, attributes=None, group=None):
if points is not None: if points is not None:
assert len(points) % 2 == 0 and 3 <= len(points) // 2, "Wrong polygon points: %s" % points assert len(points) % 2 == 0 and 3 <= len(points) // 2, "Wrong polygon points: %s" % points
super().__init__(type=AnnotationType.polygon, super().__init__(type=AnnotationType.polygon,
points=points, label=label, points=points, label=label, z_order=z_order,
id=id, attributes=attributes, group=group) id=id, attributes=attributes, group=group)
# pylint: enable=redefined-builtin # pylint: enable=redefined-builtin
@ -291,15 +348,15 @@ class PolygonObject(ShapeObject):
_, _, w, h = self.get_bbox() _, _, w, h = self.get_bbox()
rle = mask_utils.frPyObjects([self.get_points()], h, w) rle = mask_utils.frPyObjects([self.get_points()], h, w)
area = mask_utils.area(rle) area = mask_utils.area(rle)[0]
return area return area
class BboxObject(ShapeObject): class BboxObject(ShapeObject):
# pylint: disable=redefined-builtin # pylint: disable=redefined-builtin
def __init__(self, x=0, y=0, w=0, h=0, def __init__(self, x=0, y=0, w=0, h=0, label=None, z_order=None,
label=None, id=None, attributes=None, group=None): id=None, attributes=None, group=None):
super().__init__(type=AnnotationType.bbox, super().__init__(type=AnnotationType.bbox,
points=[x, y, x + w, y + h], label=label, points=[x, y, x + w, y + h], label=label, z_order=z_order,
id=id, attributes=attributes, group=group) id=id, attributes=attributes, group=group)
# pylint: enable=redefined-builtin # pylint: enable=redefined-builtin
@ -368,7 +425,7 @@ class PointsObject(ShapeObject):
]) ])
# pylint: disable=redefined-builtin # pylint: disable=redefined-builtin
def __init__(self, points=None, visibility=None, label=None, def __init__(self, points=None, visibility=None, label=None, z_order=None,
id=None, attributes=None, group=None): id=None, attributes=None, group=None):
if points is not None: if points is not None:
assert len(points) % 2 == 0 assert len(points) % 2 == 0
@ -381,10 +438,10 @@ class PointsObject(ShapeObject):
else: else:
visibility = [] visibility = []
for _ in range(len(points) // 2): for _ in range(len(points) // 2):
visibility.append(self.Visibility.absent) visibility.append(self.Visibility.visible)
super().__init__(type=AnnotationType.points, super().__init__(type=AnnotationType.points,
points=points, label=label, points=points, label=label, z_order=z_order,
id=id, attributes=attributes, group=group) id=id, attributes=attributes, group=group)
self.visibility = visibility self.visibility = visibility
@ -393,6 +450,17 @@ class PointsObject(ShapeObject):
def area(self): def area(self):
return 0 return 0
def get_bbox(self):
xs = [p for p, v in zip(self.points[0::2], self.visibility)
if v != __class__.Visibility.absent]
ys = [p for p, v in zip(self.points[1::2], self.visibility)
if v != __class__.Visibility.absent]
x0 = min(xs, default=0)
x1 = max(xs, default=0)
y0 = min(ys, default=0)
y1 = max(ys, default=0)
return [x0, y0, x1 - x0, y1 - y0]
def __eq__(self, other): def __eq__(self, other):
if not super().__eq__(other): if not super().__eq__(other):
return False return False

@ -4,8 +4,6 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
from collections import OrderedDict from collections import OrderedDict
from itertools import chain
import numpy as np
import os.path as osp import os.path as osp
from pycocotools.coco import COCO from pycocotools.coco import COCO
@ -13,7 +11,7 @@ import pycocotools.mask as mask_utils
from datumaro.components.extractor import (Extractor, DatasetItem, from datumaro.components.extractor import (Extractor, DatasetItem,
DEFAULT_SUBSET_NAME, AnnotationType, DEFAULT_SUBSET_NAME, AnnotationType,
LabelObject, MaskObject, PointsObject, PolygonObject, LabelObject, RleMask, PointsObject, PolygonObject,
BboxObject, CaptionObject, BboxObject, CaptionObject,
LabelCategories, PointsCategories LabelCategories, PointsCategories
) )
@ -21,28 +19,6 @@ from datumaro.components.formats.ms_coco import CocoTask, CocoPath
from datumaro.util.image import lazy_image from datumaro.util.image import lazy_image
class RleMask(MaskObject):
# pylint: disable=redefined-builtin
def __init__(self, rle=None, label=None,
id=None, attributes=None, group=None):
lazy_decode = lambda: mask_utils.decode(rle).astype(np.bool)
super().__init__(image=lazy_decode, label=label,
id=id, attributes=attributes, group=group)
self._rle = rle
# pylint: enable=redefined-builtin
def area(self):
return mask_utils.area(self._rle)
def bbox(self):
return mask_utils.toBbox(self._rle)
def __eq__(self, other):
if not isinstance(other, __class__):
return super().__eq__(other)
return self._rle == other._rle
class CocoExtractor(Extractor): class CocoExtractor(Extractor):
def __init__(self, path, task, merge_instance_polygons=False): def __init__(self, path, task, merge_instance_polygons=False):
super().__init__() super().__init__()
@ -144,8 +120,7 @@ class CocoExtractor(Extractor):
anns = loader.getAnnIds(imgIds=img_id) anns = loader.getAnnIds(imgIds=img_id)
anns = loader.loadAnns(anns) anns = loader.loadAnns(anns)
anns = list(chain(*( anns = sum((self._load_annotations(a, image_info) for a in anns), [])
self._load_annotations(ann, image_info) for ann in anns)))
items[img_id] = DatasetItem(id=img_id, subset=self._subset, items[img_id] = DatasetItem(id=img_id, subset=self._subset,
image=image, annotations=anns) image=image, annotations=anns)
@ -167,17 +142,26 @@ class CocoExtractor(Extractor):
if 'score' in ann: if 'score' in ann:
attributes['score'] = ann['score'] attributes['score'] = ann['score']
if self._task is CocoTask.instances: group = ann_id # make sure all tasks' annotations are merged
if self._task in [CocoTask.instances, CocoTask.person_keypoints]:
x, y, w, h = ann['bbox'] x, y, w, h = ann['bbox']
label_id = self._get_label_id(ann) label_id = self._get_label_id(ann)
group = None
is_crowd = bool(ann['iscrowd']) is_crowd = bool(ann['iscrowd'])
attributes['is_crowd'] = is_crowd attributes['is_crowd'] = is_crowd
if self._task is CocoTask.person_keypoints:
keypoints = ann['keypoints']
points = [p for i, p in enumerate(keypoints) if i % 3 != 2]
visibility = keypoints[2::3]
parsed_annotations.append(
PointsObject(points, visibility, label=label_id,
id=ann_id, attributes=attributes, group=group)
)
segmentation = ann.get('segmentation') segmentation = ann.get('segmentation')
if segmentation is not None: if segmentation is not None:
group = ann_id
rle = None rle = None
if isinstance(segmentation, list): if isinstance(segmentation, list):
@ -185,7 +169,7 @@ class CocoExtractor(Extractor):
for polygon_points in segmentation: for polygon_points in segmentation:
parsed_annotations.append(PolygonObject( parsed_annotations.append(PolygonObject(
points=polygon_points, label=label_id, points=polygon_points, label=label_id,
id=ann_id, group=group, attributes=attributes id=ann_id, attributes=attributes, group=group
)) ))
if self._merge_instance_polygons: if self._merge_instance_polygons:
@ -204,7 +188,7 @@ class CocoExtractor(Extractor):
if rle is not None: if rle is not None:
parsed_annotations.append(RleMask(rle=rle, label=label_id, parsed_annotations.append(RleMask(rle=rle, label=label_id,
id=ann_id, group=group, attributes=attributes id=ann_id, attributes=attributes, group=group
)) ))
parsed_annotations.append( parsed_annotations.append(
@ -214,30 +198,14 @@ class CocoExtractor(Extractor):
elif self._task is CocoTask.labels: elif self._task is CocoTask.labels:
label_id = self._get_label_id(ann) label_id = self._get_label_id(ann)
parsed_annotations.append( parsed_annotations.append(
LabelObject(label=label_id, id=ann_id, attributes=attributes) LabelObject(label=label_id,
)
elif self._task is CocoTask.person_keypoints:
keypoints = ann['keypoints']
points = [p for i, p in enumerate(keypoints) if i % 3 != 2]
visibility = keypoints[2::3]
bbox = ann.get('bbox')
label_id = self._get_label_id(ann)
group = None
if bbox is not None:
group = ann_id
parsed_annotations.append(
PointsObject(points, visibility, label=label_id,
id=ann_id, attributes=attributes, group=group) id=ann_id, attributes=attributes, group=group)
) )
if bbox is not None:
parsed_annotations.append(
BboxObject(*bbox, label=label_id, group=group)
)
elif self._task is CocoTask.captions: elif self._task is CocoTask.captions:
caption = ann['caption'] caption = ann['caption']
parsed_annotations.append( parsed_annotations.append(
CaptionObject(caption, CaptionObject(caption,
id=ann_id, attributes=attributes) id=ann_id, attributes=attributes, group=group)
) )
else: else:
raise NotImplementedError() raise NotImplementedError()

@ -34,14 +34,14 @@ def load_image(path):
from PIL import Image from PIL import Image
image = Image.open(path) image = Image.open(path)
image = np.asarray(image, dtype=np.float32) image = np.asarray(image, dtype=np.float32)
if len(image.shape) == 3 and image.shape[2] in [3, 4]: if len(image.shape) == 3 and image.shape[2] in {3, 4}:
image[:, :, :3] = image[:, :, 2::-1] # RGB to BGR image[:, :, :3] = image[:, :, 2::-1] # RGB to BGR
else: else:
raise NotImplementedError() raise NotImplementedError()
assert len(image.shape) in [2, 3] assert len(image.shape) in {2, 3}
if len(image.shape) == 3: if len(image.shape) == 3:
assert image.shape[2] in [3, 4] assert image.shape[2] in {3, 4}
return image return image
def save_image(path, image, params=None): def save_image(path, image, params=None):
@ -60,7 +60,7 @@ def save_image(path, image, params=None):
params = {} params = {}
image = image.astype(np.uint8) image = image.astype(np.uint8)
if len(image.shape) == 3 and image.shape[2] in [3, 4]: if len(image.shape) == 3 and image.shape[2] in {3, 4}:
image[:, :, :3] = image[:, :, 2::-1] # BGR to RGB image[:, :, :3] = image[:, :, 2::-1] # BGR to RGB
image = Image.fromarray(image) image = Image.fromarray(image)
image.save(path, **params) image.save(path, **params)
@ -92,7 +92,7 @@ def encode_image(image, ext, params=None):
params = {} params = {}
image = image.astype(np.uint8) image = image.astype(np.uint8)
if len(image.shape) == 3 and image.shape[2] in [3, 4]: if len(image.shape) == 3 and image.shape[2] in {3, 4}:
image[:, :, :3] = image[:, :, 2::-1] # BGR to RGB image[:, :, :3] = image[:, :, 2::-1] # BGR to RGB
image = Image.fromarray(image) image = Image.fromarray(image)
with BytesIO() as buffer: with BytesIO() as buffer:
@ -111,14 +111,14 @@ def decode_image(image_bytes):
from PIL import Image from PIL import Image
image = Image.open(BytesIO(image_bytes)) image = Image.open(BytesIO(image_bytes))
image = np.asarray(image, dtype=np.float32) image = np.asarray(image, dtype=np.float32)
if len(image.shape) == 3 and image.shape[2] in [3, 4]: if len(image.shape) == 3 and image.shape[2] in {3, 4}:
image[:, :, :3] = image[:, :, 2::-1] # RGB to BGR image[:, :, :3] = image[:, :, 2::-1] # RGB to BGR
else: else:
raise NotImplementedError() raise NotImplementedError()
assert len(image.shape) in [2, 3] assert len(image.shape) in {2, 3}
if len(image.shape) == 3: if len(image.shape) == 3:
assert image.shape[2] in [3, 4] assert image.shape[2] in {3, 4}
return image return image
@ -131,7 +131,7 @@ class lazy_image:
# - False: do not cache # - False: do not cache
# - None: use default (don't store in a class variable) # - None: use default (don't store in a class variable)
# - object: use this object as a cache # - object: use this object as a cache
assert cache in [None, False] or isinstance(cache, object) assert cache in {None, False} or isinstance(cache, object)
self.cache = cache self.cache = cache
def __call__(self): def __call__(self):

@ -91,7 +91,7 @@ def lazy_mask(path, colormap=None):
return lazy_image(path, lambda path: load_mask(path, colormap)) return lazy_image(path, lambda path: load_mask(path, colormap))
def convert_mask_to_rle(binary_mask): def mask_to_rle(binary_mask):
counts = [] counts = []
for i, (value, elements) in enumerate( for i, (value, elements) in enumerate(
groupby(binary_mask.ravel(order='F'))): groupby(binary_mask.ravel(order='F'))):
@ -103,4 +103,136 @@ def convert_mask_to_rle(binary_mask):
return { return {
'counts': counts, 'counts': counts,
'size': list(binary_mask.shape) 'size': list(binary_mask.shape)
} }
def mask_to_polygons(mask, tolerance=1.0, area_threshold=1):
    """
    Convert an instance mask to polygons

    Args:
        mask: a 2d binary mask
        tolerance: maximum distance from original points of
            a polygon to the approximated ones
        area_threshold: minimal area (in pixels, as computed by COCO RLE)
            of generated polygons; smaller fragments are discarded

    Returns:
        A list of polygons like [[x1,y1, x2,y2 ...], [...]]
    """
    from pycocotools import mask as mask_utils
    from skimage import measure

    polygons = []

    # pad mask with 0 around borders, so contours touching the image
    # edge are still closed curves for find_contours()
    padded_mask = np.pad(mask, pad_width=1, mode='constant', constant_values=0)
    contours = measure.find_contours(padded_mask, 0.5)
    # Fix coordinates after padding. Note: find_contours() returns a list
    # of (N, 2) arrays with differing N, so subtract per contour instead of
    # np.subtract() on the whole list (ragged array creation is an error
    # in NumPy >= 1.24).
    contours = [contour - 1 for contour in contours]

    for contour in contours:
        if not np.array_equal(contour[0], contour[-1]):
            contour = np.vstack((contour, contour[0])) # make polygon closed
        contour = measure.approximate_polygon(contour, tolerance)
        if len(contour) <= 2:
            # a degenerate polygon (a point or a segment), skip it
            continue

        # (row, col) -> (x, y), flatten to [x0, y0, x1, y1, ...] and
        # clamp negatives introduced by the padding shift
        contour = np.flip(contour, axis=1).flatten().clip(0) # [x0, y0, ...]

        # Check if the polygon is big enough
        rle = mask_utils.frPyObjects([contour], mask.shape[0], mask.shape[1])
        area = sum(mask_utils.area(rle))
        if area_threshold <= area:
            polygons.append(contour)
    return polygons
def crop_covered_segments(segments, width, height,
        iou_threshold=0.0, ratio_tolerance=0.001, area_threshold=1,
        return_masks=False):
    """
    Find all segments occluded by others and crop them to the visible part only.
    Input segments are expected to be sorted from background to foreground.
    Args:
        segments: 1d list of segment RLEs (in COCO format)
        width: width of the image
        height: height of the image
        iou_threshold: IoU threshold for objects to be counted as intersected
            By default is set to 0 to process any intersected objects
        ratio_tolerance: an IoU "handicap" value for a situation
            when an object is (almost) fully covered by another one and we
            don't want make a "hole" in the background object
        area_threshold: minimal area of included segments
        return_masks: if True, always return cropped results as binary masks
            instead of polygons (polygon inputs are otherwise re-polygonized)
    Returns:
        A list of input segments' parts (in the same order as input):
            [
                [[x1,y1, x2,y2 ...], ...], # input segment #0 parts
                mask1, # input segment #1 mask (if source segment is mask)
                [], # when source segment is too small
                ...
            ]
    """
    from pycocotools import mask as mask_utils
    # frPyObjects() expects a list of polygons/RLEs per object,
    # so wrap each input segment into its own 1-element list
    segments = [[s] for s in segments]
    input_rles = [mask_utils.frPyObjects(s, height, width) for s in segments]
    for i, rle_bottom in enumerate(input_rles):
        area_bottom = sum(mask_utils.area(rle_bottom))
        if area_bottom < area_threshold:
            # too small to keep: empty polygon list, or None in mask mode
            segments[i] = [] if not return_masks else None
            continue
        # collect RLEs of all later (foreground) segments that overlap this one
        rles_top = []
        for j in range(i + 1, len(input_rles)):
            rle_top = input_rles[j]
            iou = sum(mask_utils.iou(rle_bottom, rle_top, [0, 0]))[0]
            if iou <= iou_threshold:
                continue
            area_top = sum(mask_utils.area(rle_top))
            area_ratio = area_top / area_bottom
            # If a segment is fully inside another one, skip this segment
            # (when top is fully inside bottom, IoU == area_top / area_bottom)
            if abs(area_ratio - iou) < ratio_tolerance:
                continue
            # Check if the bottom segment is fully covered by the top one.
            # There is a mistake in the annotation, keep the background one
            if abs(1 / area_ratio - iou) < ratio_tolerance:
                rles_top = []
                break
            rles_top += rle_top
        # nothing covers this segment and it came in as polygons
        # (not a dict RLE) -> leave the original polygons untouched
        if not rles_top and not isinstance(segments[i][0], dict) \
                and not return_masks:
            continue
        rle_bottom = rle_bottom[0]
        bottom_mask = mask_utils.decode(rle_bottom).astype(np.uint8)
        if rles_top:
            rle_top = mask_utils.merge(rles_top)
            top_mask = mask_utils.decode(rle_top).astype(np.uint8)
            # subtract the covering area; uint8 underflow (0 - 1 == 255)
            # is cleaned up by the next line, which keeps only exact 1s
            bottom_mask -= top_mask
            bottom_mask[bottom_mask != 1] = 0
        if not return_masks and not isinstance(segments[i][0], dict):
            # polygon input -> convert the cropped mask back to polygons
            segments[i] = mask_to_polygons(bottom_mask,
                area_threshold=area_threshold)
        else:
            segments[i] = bottom_mask
    return segments
def rles_to_mask(rles, width, height):
    """Merge a list of COCO-format segments (polygons or RLEs)
    into a single 2d binary mask of the given image size."""
    from pycocotools import mask as mask_utils

    encoded = mask_utils.frPyObjects(rles, height, width)
    merged = mask_utils.merge(encoded)
    return mask_utils.decode(merged)

@ -6,5 +6,6 @@ opencv-python>=4.1.0.25
Pillow>=6.1.0 Pillow>=6.1.0
pycocotools>=2.0.0 pycocotools>=2.0.0
PyYAML>=5.1.1 PyYAML>=5.1.1
scikit-image>=0.15.0
tensorboardX>=1.8 tensorboardX>=1.8
tensorflow>=1.12.0 tensorflow>=1.12.0

@ -56,6 +56,7 @@ setuptools.setup(
'Pillow', 'Pillow',
'PyYAML', 'PyYAML',
'pycocotools', 'pycocotools',
'scikit-image',
'tensorboardX', 'tensorboardX',
'tensorflow', 'tensorflow',
], ],
@ -64,4 +65,4 @@ setuptools.setup(
'datum=datumaro:main', 'datum=datumaro:main',
], ],
}, },
) )

@ -160,28 +160,33 @@ class CocoConverterTest(TestCase):
self.assertFalse(item_b is None) self.assertFalse(item_b is None)
self.assertEqual(len(item_a.annotations), len(item_b.annotations)) self.assertEqual(len(item_a.annotations), len(item_b.annotations))
for ann_a in item_a.annotations: for ann_a in item_a.annotations:
ann_b = find(item_b.annotations, lambda x: \ # We might find few corresponding items, so check them all
x.id == ann_a.id and \ ann_b_matches = [x for x in item_b.annotations
x.type == ann_a.type and x.group == ann_a.group) if x.id == ann_a.id and \
self.assertEqual(ann_a, ann_b, 'id: ' + str(ann_a.id)) x.type == ann_a.type and x.group == ann_a.group]
self.assertFalse(len(ann_b_matches) == 0, 'aid: %s' % ann_a.id)
ann_b = find(ann_b_matches, lambda x: x == ann_a)
self.assertEqual(ann_a, ann_b, 'aid: %s' % ann_a.id)
item_b.annotations.remove(ann_b) # avoid repeats
def test_can_save_and_load_captions(self): def test_can_save_and_load_captions(self):
class TestExtractor(Extractor): class TestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, subset='train', DatasetItem(id=1, subset='train',
annotations=[ annotations=[
CaptionObject('hello', id=1), CaptionObject('hello', id=1, group=1),
CaptionObject('world', id=2), CaptionObject('world', id=2, group=2),
]), ]),
DatasetItem(id=1, subset='train', DatasetItem(id=2, subset='train',
annotations=[ annotations=[
CaptionObject('test', id=3), CaptionObject('test', id=3, group=3),
]), ]),
DatasetItem(id=2, subset='val', DatasetItem(id=3, subset='val',
annotations=[ annotations=[
CaptionObject('word', id=1), CaptionObject('word', id=1, group=1),
] ]
), ),
]) ])
@ -191,95 +196,185 @@ class CocoConverterTest(TestCase):
CocoCaptionsConverter(), test_dir) CocoCaptionsConverter(), test_dir)
def test_can_save_and_load_instances(self): def test_can_save_and_load_instances(self):
label_categories = LabelCategories()
for i in range(10):
label_categories.add(str(i))
categories = { AnnotationType.label: label_categories }
class TestExtractor(Extractor): class TestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, subset='train', image=np.ones((4, 4, 3)), DatasetItem(id=1, subset='train', image=np.ones((4, 4, 3)),
annotations=[ annotations=[
# Bbox + single polygon # Bbox + single polygon
BboxObject(0, 1, 2, 3, label=2, group=1, id=1, BboxObject(0, 1, 2, 2,
label=2, group=1, id=1,
attributes={ 'is_crowd': False }), attributes={ 'is_crowd': False }),
PolygonObject([0, 1, 2, 1, 2, 3, 0, 3], PolygonObject([0, 1, 2, 1, 2, 3, 0, 3],
attributes={ 'is_crowd': False }, attributes={ 'is_crowd': False },
label=2, group=1, id=1), label=2, group=1, id=1),
]), ]),
DatasetItem(id=1, subset='train', DatasetItem(id=2, subset='train', image=np.ones((4, 4, 3)),
annotations=[ annotations=[
# Mask + bbox # Mask + bbox
MaskObject(np.array([[0, 0, 0, 0], [1, 0, 1, 0], MaskObject(np.array([
[1, 1, 0, 0], [0, 0, 1, 0]], [0, 1, 0, 0],
dtype=np.bool), [0, 1, 0, 0],
[0, 1, 1, 1],
[0, 0, 0, 0]],
),
attributes={ 'is_crowd': True }, attributes={ 'is_crowd': True },
label=4, group=3, id=3), label=4, group=3, id=3),
BboxObject(0, 1, 3, 3, label=4, group=3, id=3, BboxObject(1, 0, 2, 2, label=4, group=3, id=3,
attributes={ 'is_crowd': True }), attributes={ 'is_crowd': True }),
]), ]),
DatasetItem(id=3, subset='val', DatasetItem(id=3, subset='val', image=np.ones((4, 4, 3)),
annotations=[ annotations=[
# Bbox + mask # Bbox + mask
BboxObject(0, 1, 3, 2, label=4, group=3, id=3, BboxObject(0, 1, 2, 2, label=4, group=3, id=3,
attributes={ 'is_crowd': True }), attributes={ 'is_crowd': True }),
MaskObject(np.array([[0, 0, 0, 0], [1, 0, 1, 0], MaskObject(np.array([
[1, 1, 0, 0], [0, 0, 0, 0]], [0, 0, 0, 0],
dtype=np.bool), [1, 1, 1, 0],
[1, 1, 0, 0],
[0, 0, 0, 0]],
),
attributes={ 'is_crowd': True }, attributes={ 'is_crowd': True },
label=4, group=3, id=3), label=4, group=3, id=3),
]), ]),
]) ])
def categories(self): def categories(self):
label_categories = LabelCategories() return categories
for i in range(10):
label_categories.add(str(i))
return {
AnnotationType.label: label_categories,
}
with TestDir() as test_dir: with TestDir() as test_dir:
self._test_save_and_load(TestExtractor(), self._test_save_and_load(TestExtractor(),
CocoInstancesConverter(), test_dir) CocoInstancesConverter(), test_dir)
def test_can_save_and_load_instances_with_mask_conversion(self): def test_can_merge_polygons_on_loading(self):
label_categories = LabelCategories()
for i in range(10):
label_categories.add(str(i))
categories = { AnnotationType.label: label_categories }
class TestExtractor(Extractor): class TestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, image=np.zeros((5, 5, 3)), subset='train', DatasetItem(id=1, image=np.zeros((6, 10, 3)),
annotations=[ annotations=[
BboxObject(0, 0, 5, 5, label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
PolygonObject([0, 0, 4, 0, 4, 4], PolygonObject([0, 0, 4, 0, 4, 4],
label=3, id=4, group=4, label=3, id=4, group=4),
attributes={ 'is_crowd': False }), PolygonObject([5, 0, 9, 0, 5, 5],
MaskObject(np.array([
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 0],
[0, 0, 0, 1, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]],
# only internal fragment (without the border),
# but not everywhere...
dtype=np.bool),
attributes={ 'is_crowd': False },
label=3, id=4, group=4), label=3, id=4, group=4),
] ]
), ),
]) ])
def categories(self): def categories(self):
label_categories = LabelCategories() return categories
for i in range(10):
label_categories.add(str(i)) class TargetExtractor(TestExtractor):
return { def __iter__(self):
AnnotationType.label: label_categories, items = list(super().__iter__())
} items[0]._annotations = [
BboxObject(0, 0, 9, 5,
label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
PolygonObject([0, 0, 4, 0, 4, 4],
label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
PolygonObject([5, 0, 9, 0, 5, 5],
label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
MaskObject(np.array([
[0, 1, 1, 1, 0, 1, 1, 1, 1, 0],
[0, 0, 1, 1, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
# only internal fragment (without the border),
# but not everywhere...
),
label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
]
return iter(items)
with TestDir() as test_dir: with TestDir() as test_dir:
self._test_save_and_load(TestExtractor(), self._test_save_and_load(TestExtractor(),
CocoInstancesConverter(), test_dir, CocoInstancesConverter(), test_dir,
{'merge_instance_polygons': True}) importer_params={'merge_instance_polygons': True},
target_dataset=TargetExtractor())
def test_can_crop_covered_segments(self):
label_categories = LabelCategories()
for i in range(10):
label_categories.add(str(i))
class SrcTestExtractor(Extractor):
def __iter__(self):
items = [
DatasetItem(id=1, image=np.zeros((5, 5, 3)),
annotations=[
MaskObject(np.array([
[0, 0, 1, 1, 1],
[0, 0, 1, 1, 1],
[1, 1, 0, 1, 1],
[1, 1, 1, 0, 0],
[1, 1, 1, 0, 0]],
),
label=2, id=1, z_order=0),
PolygonObject([1, 1, 4, 1, 4, 4, 1, 4],
label=1, id=2, z_order=1),
]
),
]
return iter(items)
def categories(self):
return { AnnotationType.label: label_categories }
class DstTestExtractor(Extractor):
def __iter__(self):
items = [
DatasetItem(id=1, image=np.zeros((5, 5, 3)),
annotations=[
BboxObject(0, 0, 4, 4,
label=2, id=1, group=1,
attributes={ 'is_crowd': True }),
MaskObject(np.array([
[0, 0, 1, 1, 1],
[0, 0, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 0, 0, 0, 0],
[1, 1, 1, 0, 0]],
),
attributes={ 'is_crowd': True },
label=2, id=1, group=1),
BboxObject(1, 1, 3, 3,
label=1, id=2, group=2,
attributes={ 'is_crowd': False }),
PolygonObject([1, 1, 4, 1, 4, 4, 1, 4],
label=1, id=2, group=2,
attributes={ 'is_crowd': False }),
# NOTE: Why it's 4 in COCOapi?..
]
),
]
return iter(items)
def test_can_merge_instance_polygons_to_mask_in_coverter(self): def categories(self):
return { AnnotationType.label: label_categories }
with TestDir() as test_dir:
self._test_save_and_load(SrcTestExtractor(),
CocoInstancesConverter(crop_covered=True), test_dir,
target_dataset=DstTestExtractor())
def test_can_convert_polygons_to_mask(self):
label_categories = LabelCategories() label_categories = LabelCategories()
for i in range(10): for i in range(10):
label_categories.add(str(i)) label_categories.add(str(i))
@ -287,14 +382,12 @@ class CocoConverterTest(TestCase):
class SrcTestExtractor(Extractor): class SrcTestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, image=np.zeros((5, 10, 3)), DatasetItem(id=1, image=np.zeros((6, 10, 3)),
annotations=[ annotations=[
PolygonObject([0, 0, 4, 0, 4, 4], PolygonObject([0, 0, 4, 0, 4, 4],
label=3, id=4, group=4, label=3, id=4, group=4),
attributes={ 'is_crowd': False }),
PolygonObject([5, 0, 9, 0, 5, 5], PolygonObject([5, 0, 9, 0, 5, 5],
label=3, id=4, group=4, label=3, id=4, group=4),
attributes={ 'is_crowd': False }),
] ]
), ),
]) ])
@ -305,19 +398,20 @@ class CocoConverterTest(TestCase):
class DstTestExtractor(Extractor): class DstTestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, image=np.zeros((5, 10, 3)), DatasetItem(id=1, image=np.zeros((6, 10, 3)),
annotations=[ annotations=[
BboxObject(1, 0, 8, 4, label=3, id=4, group=4, BboxObject(0, 0, 9, 5, label=3, id=4, group=4,
attributes={ 'is_crowd': True }), attributes={ 'is_crowd': True }),
MaskObject(np.array([ MaskObject(np.array([
[0, 1, 1, 1, 0, 1, 1, 1, 1, 0], [0, 1, 1, 1, 0, 1, 1, 1, 1, 0],
[0, 0, 1, 1, 0, 1, 1, 1, 0, 0], [0, 0, 1, 1, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 1, 0, 1, 1, 0, 0, 0], [0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
# only internal fragment (without the border), # only internal fragment (without the border),
# but not everywhere... # but not everywhere...
dtype=np.bool), ),
attributes={ 'is_crowd': True }, attributes={ 'is_crowd': True },
label=3, id=4, group=4), label=3, id=4, group=4),
] ]
@ -329,15 +423,69 @@ class CocoConverterTest(TestCase):
with TestDir() as test_dir: with TestDir() as test_dir:
self._test_save_and_load(SrcTestExtractor(), self._test_save_and_load(SrcTestExtractor(),
CocoInstancesConverter(merge_polygons=True), test_dir, CocoInstancesConverter(segmentation_mode='mask'), test_dir,
target_dataset=DstTestExtractor())
def test_can_convert_masks_to_polygons(self):
label_categories = LabelCategories()
for i in range(10):
label_categories.add(str(i))
class SrcTestExtractor(Extractor):
def __iter__(self):
items = [
DatasetItem(id=1, image=np.zeros((5, 10, 3)),
annotations=[
MaskObject(np.array([
[0, 1, 1, 1, 0, 1, 1, 1, 1, 0],
[0, 0, 1, 1, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
],
),
label=3, id=4, group=4),
]
),
]
return iter(items)
def categories(self):
return { AnnotationType.label: label_categories }
class DstTestExtractor(Extractor):
def __iter__(self):
return iter([
DatasetItem(id=1, image=np.zeros((5, 10, 3)),
annotations=[
BboxObject(1, 0, 7, 3, label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
PolygonObject(
[3.0, 2.5, 1.0, 0.0, 3.5, 0.0, 3.0, 2.5],
label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
PolygonObject(
[5.0, 3.5, 4.5, 0.0, 8.0, 0.0, 5.0, 3.5],
label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
]
),
])
def categories(self):
return { AnnotationType.label: label_categories }
with TestDir() as test_dir:
self._test_save_and_load(SrcTestExtractor(),
CocoInstancesConverter(segmentation_mode='polygons'), test_dir,
target_dataset=DstTestExtractor()) target_dataset=DstTestExtractor())
def test_can_save_and_load_images(self): def test_can_save_and_load_images(self):
class TestExtractor(Extractor): class TestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, subset='train'),
DatasetItem(id=1, subset='train'), DatasetItem(id=1, subset='train'),
DatasetItem(id=2, subset='train'),
DatasetItem(id=2, subset='val'), DatasetItem(id=2, subset='val'),
DatasetItem(id=3, subset='val'), DatasetItem(id=3, subset='val'),
@ -354,19 +502,19 @@ class CocoConverterTest(TestCase):
class TestExtractor(Extractor): class TestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, subset='train', DatasetItem(id=1, subset='train',
annotations=[ annotations=[
LabelObject(4, id=1), LabelObject(4, id=1, group=1),
LabelObject(9, id=2), LabelObject(9, id=2, group=2),
]), ]),
DatasetItem(id=1, subset='train', DatasetItem(id=2, subset='train',
annotations=[ annotations=[
LabelObject(4, id=4), LabelObject(4, id=4, group=4),
]), ]),
DatasetItem(id=2, subset='val', DatasetItem(id=3, subset='val',
annotations=[ annotations=[
LabelObject(2, id=1), LabelObject(2, id=1, group=1),
]), ]),
]) ])
@ -383,62 +531,114 @@ class CocoConverterTest(TestCase):
CocoLabelsConverter(), test_dir) CocoLabelsConverter(), test_dir)
def test_can_save_and_load_keypoints(self): def test_can_save_and_load_keypoints(self):
label_categories = LabelCategories()
points_categories = PointsCategories()
for i in range(10):
label_categories.add(str(i))
points_categories.add(i, [])
categories = {
AnnotationType.label: label_categories,
AnnotationType.points: points_categories,
}
class TestExtractor(Extractor): class TestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=0, subset='train', DatasetItem(id=1, subset='train', image=np.zeros((5, 5, 3)),
annotations=[ annotations=[
PointsObject([1, 2, 0, 2, 4, 1], [0, 1, 2], # Full instance annotations: polygon + keypoints
PointsObject([0, 0, 0, 2, 4, 1], [0, 1, 2],
label=3, group=1, id=1),
PolygonObject([0, 0, 4, 0, 4, 4],
label=3, group=1, id=1), label=3, group=1, id=1),
BboxObject(1, 2, 3, 4, label=3, group=1),
PointsObject([5, 6, 0, 7], group=2, id=2), # Full instance annotations: bbox + keypoints
BboxObject(1, 2, 3, 4, group=2), PointsObject([1, 2, 3, 4, 2, 3], group=2, id=2),
BboxObject(1, 2, 2, 2, group=2, id=2),
]), ]),
DatasetItem(id=1, subset='train', DatasetItem(id=2, subset='train',
annotations=[ annotations=[
PointsObject([1, 2, 0, 2, 4, 1], label=5, # Solitary keypoints
group=3, id=3), PointsObject([1, 2, 0, 2, 4, 1], label=5, id=3),
BboxObject(1, 2, 3, 4, label=5, group=3),
]), ]),
DatasetItem(id=2, subset='val', DatasetItem(id=3, subset='val',
annotations=[ annotations=[
PointsObject([0, 2, 0, 2, 4, 1], label=2, # Solitary keypoints with no label
group=3, id=3), PointsObject([0, 0, 1, 2, 3, 4], [0, 1, 2], id=3),
BboxObject(0, 2, 4, 4, label=2, group=3),
]), ]),
]) ])
def categories(self): def categories(self):
label_categories = LabelCategories() return categories
points_categories = PointsCategories()
for i in range(10):
label_categories.add(str(i))
points_categories.add(i, [])
return { class DstTestExtractor(TestExtractor):
AnnotationType.label: label_categories, def __iter__(self):
AnnotationType.points: points_categories, return iter([
} DatasetItem(id=1, subset='train', image=np.zeros((5, 5, 3)),
annotations=[
PointsObject([0, 0, 0, 2, 4, 1], [0, 1, 2],
label=3, group=1, id=1,
attributes={'is_crowd': False}),
PolygonObject([0, 0, 4, 0, 4, 4],
label=3, group=1, id=1,
attributes={'is_crowd': False}),
BboxObject(0, 0, 4, 4,
label=3, group=1, id=1,
attributes={'is_crowd': False}),
PointsObject([1, 2, 3, 4, 2, 3],
group=2, id=2,
attributes={'is_crowd': False}),
PolygonObject([1, 2, 3, 2, 3, 4, 1, 4],
group=2, id=2,
attributes={'is_crowd': False}),
BboxObject(1, 2, 2, 2,
group=2, id=2,
attributes={'is_crowd': False}),
]),
DatasetItem(id=2, subset='train',
annotations=[
PointsObject([1, 2, 0, 2, 4, 1],
label=5, group=3, id=3,
attributes={'is_crowd': False}),
PolygonObject([0, 1, 4, 1, 4, 2, 0, 2],
label=5, group=3, id=3,
attributes={'is_crowd': False}),
BboxObject(0, 1, 4, 1,
label=5, group=3, id=3,
attributes={'is_crowd': False}),
]),
DatasetItem(id=3, subset='val',
annotations=[
PointsObject([0, 0, 1, 2, 3, 4], [0, 1, 2],
group=3, id=3,
attributes={'is_crowd': False}),
PolygonObject([1, 2, 3, 2, 3, 4, 1, 4],
group=3, id=3,
attributes={'is_crowd': False}),
BboxObject(1, 2, 2, 2,
group=3, id=3,
attributes={'is_crowd': False}),
]),
])
with TestDir() as test_dir: with TestDir() as test_dir:
self._test_save_and_load(TestExtractor(), self._test_save_and_load(TestExtractor(),
CocoPersonKeypointsConverter(), test_dir) CocoPersonKeypointsConverter(), test_dir,
target_dataset=DstTestExtractor())
def test_can_save_dataset_with_no_subsets(self): def test_can_save_dataset_with_no_subsets(self):
class TestExtractor(Extractor): class TestExtractor(Extractor):
def __iter__(self): def __iter__(self):
return iter([ return iter([
DatasetItem(id=1, annotations=[ DatasetItem(id=1, annotations=[
LabelObject(2, id=1), LabelObject(2, id=1, group=1),
]), ]),
DatasetItem(id=2, image=np.zeros((5, 5, 3)), annotations=[ DatasetItem(id=2, annotations=[
LabelObject(3, id=3), LabelObject(3, id=2, group=2),
BboxObject(0, 0, 5, 5, label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
PolygonObject([0, 0, 4, 0, 4, 4], label=3, id=4, group=4,
attributes={ 'is_crowd': False }),
]), ]),
]) ])

@ -0,0 +1,69 @@
import numpy as np
from unittest import TestCase
import datumaro.util.mask_tools as mask_tools
class PolygonConversionsTest(TestCase):
    # Unit tests for datumaro.util.mask_tools mask/polygon conversions.

    def test_mask_can_be_converted_to_polygon(self):
        # Two separate connected components should yield two polygons
        mask = np.array([
            [0, 1, 1, 1, 0, 1, 1, 1, 1, 0],
            [0, 0, 1, 1, 0, 1, 0, 1, 0, 0],
            [0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
            [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ])
        expected = [
            [1, 0, 3, 0, 3, 2, 1, 0],
            [5, 0, 8, 0, 5, 3],
        ]
        computed = mask_tools.mask_to_polygons(mask)
        # only the polygon count is checked; exact vertices depend on
        # the contour approximation and are not pinned here
        self.assertEqual(len(expected), len(computed))
    def test_can_crop_covered_segments(self):
        # (height, width) of the synthetic image
        image_size = [7, 7]
        # background-to-foreground order: polygon, RLE mask, polygon
        initial = [
            [1, 1, 6, 1, 6, 6, 1, 6], # rectangle
            mask_tools.mask_to_rle(np.array([
                [0, 0, 0, 0, 0, 0, 0],
                [0, 0, 1, 0, 1, 1, 0],
                [0, 1, 1, 0, 1, 1, 0],
                [0, 0, 0, 0, 0, 1, 0],
                [0, 1, 1, 0, 0, 1, 0],
                [0, 1, 1, 1, 1, 1, 0],
                [0, 0, 0, 0, 0, 0, 0],
            ])),
            [1, 1, 6, 6, 1, 6], # lower-left triangle
        ]
        # expected visible parts after cropping by the segments above them
        expected = [
            np.array([
                [0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0, 0, 0],
                [0, 0, 0, 1, 0, 0, 0],
                [0, 0, 0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0],
            ]), # half-covered
            np.array([
                [0, 0, 0, 0, 0, 0, 0],
                [0, 0, 1, 0, 1, 1, 0],
                [0, 0, 0, 0, 1, 1, 0],
                [0, 0, 0, 0, 0, 1, 0],
                [0, 0, 0, 0, 0, 1, 0],
                [0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0],
            ]), # half-covered
            mask_tools.rles_to_mask([initial[2]], *image_size), # unchanged
        ]
        # ratio_tolerance=0 disables the "fully covered" shortcut,
        # return_masks=True makes results comparable as binary masks
        computed = mask_tools.crop_covered_segments(initial, *image_size,
            ratio_tolerance=0, return_masks=True)
        self.assertEqual(len(initial), len(computed))
        for i, (e_mask, c_mask) in enumerate(zip(expected, computed)):
            self.assertTrue(np.array_equal(e_mask, c_mask),
                '#%s: %s\n%s\n' % (i, e_mask, c_mask))
Loading…
Cancel
Save