diff --git a/utils/README.md b/utils/README.md
index 275a32db..68735adf 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -4,6 +4,7 @@ ## Description
 This folder contains some useful utilities for Computer Vision Annotation Tool (CVAT).
 To read about a certain utility please choose a link:
-* [Convert CVAT XML to PASCAL VOC](voc/converter.md)
-* [Convert CVAT XML to MS COCO](coco/converter.md)
-* [Convert CVAT XML to PNG mask](mask/converter.md)
+- [Convert CVAT XML to PASCAL VOC](voc/converter.md)
+- [Convert CVAT XML to MS COCO](coco/converter.md)
+- [Convert CVAT XML to PNG mask](mask/converter.md)
+- [Convert CVAT XML to TFRECORDS](tfrecords/converter.md)
diff --git a/utils/tfrecords/__init__.py b/utils/tfrecords/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/utils/tfrecords/converter.md b/utils/tfrecords/converter.md
new file mode 100644
index 00000000..96d56ffc
--- /dev/null
+++ b/utils/tfrecords/converter.md
@@ -0,0 +1,77 @@
+# Utility for converting CVAT XML annotation file to TFRECORDS format
+
+## Description
+
+Given a CVAT XML file and a directory with the image dataset, this script reads
+the CVAT XML and writes the annotations in tfrecords format into a given
+directory, in addition to the label map required by the tensorflow object
+detection API.
+
+This implementation supports **annotated images only**. Make sure to dump the
+**XML annotations and NOT interpolations** from CVAT.
+
+## Installation
+
+The conversion script depends on the tensorflow object detection API;
+the installation steps are described below.
+
+### 1. Install the necessary packages (including tensorflow)
+
+```bash
+sudo apt-get update
+sudo apt-get install -y --no-install-recommends python3-pip python3-dev
+```
+
+```bash
+pip3 install -r requirements.txt
+```
+
+### 2. Install the tensorflow object detection API
+
+If it's already installed, you can check your `$PYTHONPATH` and move on to the
+usage section. Here's a quick (unofficial) guide on how to do that. For more
+details follow the official guide:
+[INSTALL TENSORFLOW OBJECT DETECTION API](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md).
+
+```bash
+# clone the models repository
+git clone https://github.com/tensorflow/models.git
+```
+```bash
+# install some dependencies
+pip3 install --user Cython
+pip3 install --user contextlib2
+pip3 install --user pillow
+pip3 install --user lxml
+pip3 install --user jupyter
+pip3 install --user matplotlib
+```
+```bash
+# clone and compile the cocoapi
+git clone https://github.com/cocodataset/cocoapi.git
+cd cocoapi/PythonAPI
+make
+# copy pycocotools into the models repository cloned above
+cp -r pycocotools ../../models/research/
+```
+```bash
+# Protobuf compilation
+cd ../../models/research/
+protoc object_detection/protos/*.proto --python_out=.
+```
+```bash
+# set up the PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim
+```
+
+## Usage
+
+Run the script:
+
+```bash
+python3 converter.py --cvat-xml <FILE> --image-dir <DIRECTORY> \
+    --output-dir <DIRECTORY> --attribute <NAME>
+```
+
+Omit the `--attribute` argument if you want CVAT labels to be used as the
+tfrecords labels; otherwise, specify the name of the attribute to use instead,
+e.g. `--attribute <NAME>`.
+
+Please run `python3 converter.py --help` for more details.
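+## Example
+
+A minimal end-to-end sketch; `annotations.xml`, `images/` and `out/` are
+hypothetical names for the CVAT dump, the image directory and the output
+directory:
+
+```bash
+python3 converter.py --cvat-xml annotations.xml --image-dir images \
+    --output-dir out
+```
+
+The script writes `train.tfrecord`, `eval.tfrecord` and `label_map.pbtxt`
+into `out/`. The label map contains one `item` entry per label that occurs
+often enough to be kept (see `--min-train`); for a hypothetical label `car`
+it would look like:
+
+```
+item {
+	id: 1
+	name: 'car'
+}
+```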
diff --git a/utils/tfrecords/converter.py b/utils/tfrecords/converter.py
new file mode 100644
index 00000000..7633dcba
--- /dev/null
+++ b/utils/tfrecords/converter.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# SPDX-License-Identifier: MIT
+"""
+Given a CVAT XML and a directory with the image dataset, this script reads the
+CVAT XML and writes the annotations in tfrecords format into a given
+directory.
+
+This implementation supports annotated images only.
+"""
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import hashlib
+import os
+import string
+import xml.etree.ElementTree as ET
+from collections import Counter
+from pathlib import Path
+
+import tensorflow as tf
+from object_detection.utils import dataset_util
+
+# we need this to filter out non-ASCII characters, otherwise
+# training will crash
+printable = set(string.printable)
+
+
+def parse_args():
+    """Parse command line arguments"""
+    parser = argparse.ArgumentParser(
+        description='Convert CVAT XML annotations to tfrecords format'
+    )
+
+    parser.add_argument(
+        '--cvat-xml', metavar='FILE', required=True,
+        help='input file with CVAT annotation in xml format'
+    )
+
+    parser.add_argument(
+        '--image-dir', metavar='DIRECTORY', required=True,
+        help='directory which contains original images'
+    )
+
+    parser.add_argument(
+        '--output-dir', metavar='DIRECTORY', required=True,
+        help='directory for output annotations in tfrecords format'
+    )
+
+    parser.add_argument(
+        '--train-percentage', metavar='PERCENTAGE', required=False,
+        default=90, type=int,
+        help='the percentage of training data to total data (default: 90)'
+    )
+
+    parser.add_argument(
+        '--min-train', metavar='NUM', required=False, default=10, type=int,
+        help='the minimum number of occurrences above which a label is '
+             'considered (default: 10)'
+    )
+
+    parser.add_argument(
+        '--attribute', metavar='NAME', required=False, default="", type=str,
+        help='the attribute name based on which the object can be identified'
+    )
+
+    return parser.parse_args()
+
+
+def process_cvat_xml(args):
+    """Transforms a single XML in CVAT format to tfrecords."""
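+    # What follows: count the annotated images, build a label map from the
+    # labels that occur at least `min_train` times, then write the first
+    # `train_percentage`% of the images (in XML document order, i.e. without
+    # shuffling) to train.tfrecord and the remaining images to eval.tfrecord.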
+ """ + + train_percentage = int(args.train_percentage) + assert (train_percentage<=100 and train_percentage>=0) + + cvat_xml = ET.parse(args.cvat_xml).getroot() + + output_dir = Path(args.output_dir) + if not output_dir.exists(): + print("Creating the output directory because it doesn't exist") + output_dir.mkdir() + + cvat_name, output_dir, min_train = \ + args.attribute, output_dir.absolute(), args.min_train + + # Open the tfrecord files for writing + writer_train = tf.python_io.TFRecordWriter( + os.path.join(output_dir.absolute(), 'train.tfrecord')) + writer_eval = tf.python_io.TFRecordWriter( + os.path.join(output_dir.absolute(), 'eval.tfrecord')) + + # extract the object names + object_names = [] + num_imgs = 0 + for img in cvat_xml.findall('image'): + num_imgs += 1 + for box in img: + if cvat_name == "" : + obj_name = ''.join(filter(lambda x: x in printable, + box.attrib['label'])) + object_names.append(obj_name) + else : + for attribute in box : + if attribute.attrib['name'] == cvat_name : + obj_name = ''.join(filter(lambda x: x in printable, + attribute.text.lower())) + object_names.append(obj_name) + + labels, values = zip(*Counter(object_names).items()) + + # Create the label map file + saved_dict = dict() + reverse_dict = dict() + with codecs.open(os.path.join(output_dir,'label_map.pbtxt'), + 'w', encoding='utf8') as f: + counter = 1 + for iii, label in enumerate(labels): + if values[iii] < min_train : + continue + saved_dict[label] = counter + reverse_dict[counter] = label + f.write(u'item {\n') + f.write(u'\tid: {}\n'.format(counter)) + f.write(u"\tname: '{}'\n".format(label)) + f.write(u'}\n\n') + counter+=1 + + num_iter = num_imgs + eval_num = num_iter * (100 - train_percentage) // 100 + train_num = num_iter - eval_num + + + for counter,example in enumerate(cvat_xml.findall('image')): + tf_example = create_tf_example(example, args.attribute, saved_dict, args.image_dir) + if tf_example is None: + continue + if(counter < train_num): + writer_train.write(tf_example.SerializeToString()) + else : + writer_eval.write(tf_example.SerializeToString()) + + writer_train.close() + writer_eval.close() + + + return saved_dict, num_imgs + + +# Defining the main conversion function +def create_tf_example(example, cvat_name, saved_dict, img_dir): + # Process one image data per run + height = int(example.attrib['height']) # Image height + width = int(example.attrib['width']) # Image width + filename = os.path.join(img_dir, example.attrib['name']) + _, ext = os.path.splitext(example.attrib['name']) + + filename = filename.encode('utf8') + with tf.gfile.GFile(filename,'rb') as fid: + encoded_jpg = fid.read() + + key = hashlib.sha256(encoded_jpg).hexdigest() + + if ext.lower() in ['.jpg','.jpeg'] : + image_format = 'jpeg'.encode('utf8') + elif ext.lower() == '.png' : + image_format = 'png'.encode('utf8') + else: + print('File Format not supported, Skipping') + return None + + xmins = [] # List of normalized left x coordinates in bounding box (1 per box) + xmaxs = [] # List of normalized right x coordinates in bounding box + # (1 per box) + ymins = [] # List of normalized top y coordinates in bounding box (1 per box) + ymaxs = [] # List of normalized bottom y coordinates in bounding box + # (1 per box) + classes_text = [] # List of string class name of bounding box (1 per box) + classes = [] # List of integer class id of bounding box (1 per box) + + # Loop oer the boxes and fill the above fields + for box in example: + box_name = '' + if cvat_name == "" : + box_name = box.attrib['label'] + else 
+    # Loop over the boxes and fill the above fields
+    for box in example:
+        box_name = ''
+        if cvat_name == "":
+            box_name = box.attrib['label']
+        else:
+            for attr in box:
+                if attr.attrib['name'] == cvat_name:
+                    box_name = attr.text.lower()
+
+        # filter out non-ASCII characters
+        box_name = ''.join(filter(lambda x: x in printable, box_name))
+
+        if box_name in saved_dict.keys():
+            xmins.append(float(box.attrib['xtl']) / width)
+            xmaxs.append(float(box.attrib['xbr']) / width)
+            ymins.append(float(box.attrib['ytl']) / height)
+            ymaxs.append(float(box.attrib['ybr']) / height)
+            classes_text.append(box_name.encode('utf8'))
+            classes.append(saved_dict[box_name])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature={
+        'image/height': dataset_util.int64_feature(height),
+        'image/width': dataset_util.int64_feature(width),
+        'image/filename': dataset_util.bytes_feature(filename),
+        'image/source_id': dataset_util.bytes_feature(filename),
+        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
+        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
+        'image/format': dataset_util.bytes_feature(image_format),
+        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
+        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
+        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
+        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
+        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
+        'image/object/class/label': dataset_util.int64_list_feature(classes),
+    }))
+    return tf_example
+
+
+def main():
+    args = parse_args()
+    process_cvat_xml(args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/tfrecords/requirements.txt b/utils/tfrecords/requirements.txt
new file mode 100644
index 00000000..616c0401
--- /dev/null
+++ b/utils/tfrecords/requirements.txt
@@ -0,0 +1,3 @@
+argparse==1.1
+tensorflow==1.13.1
+pathlib==1.0.1