From 935d380d36cb1074eddc3b0018c856a3997804de Mon Sep 17 00:00:00 2001
From: Andrey Zhavoronkov <41117609+azhavoro@users.noreply.github.com>
Date: Thu, 15 Aug 2019 18:02:35 +0300
Subject: [PATCH] tfrecord format support (#644)

* added tfrecord loader/dumper

* add comment

* remove unused import

* use the latest version of tensorflow (1.12.3), which supports CUDA 9.0;
  update the cuDNN library; install tensorflow by default and replace it with
  tensorflow-gpu when CUDA support is enabled

* Updated changelog
---
 CHANGELOG.md                        |   2 +-
 Dockerfile                          |  24 ++--
 README.md                           |   1 +
 components/cuda/install.sh          |   5 +-
 components/tf_annotation/install.sh |   7 +-
 cvat/apps/annotation/settings.py    |   1 +
 cvat/apps/annotation/tfrecord.py    | 182 ++++++++++++++++++++++++++++
 cvat/requirements/base.txt          |   1 +
 8 files changed, 201 insertions(+), 22 deletions(-)
 create mode 100644 cvat/apps/annotation/tfrecord.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f578b59..10e6a6ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Ability to create a custom extractors for unsupported media types
 - Added in PDF extractor
 - Added in a command line model manager tester
-- Ability to dump/load annotations in several formats from UI (CVAT, Pascal VOC, YOLO, MS COCO, png mask)
+- Ability to dump/load annotations in several formats from UI (CVAT, Pascal VOC, YOLO, MS COCO, png mask, TFRecord)
 
 ### Changed
 - Outside and keyframe buttons in the side panel for all interpolation shapes (they were only for boxes before)
diff --git a/Dockerfile b/Dockerfile
index 8837fdd0..6b5d416b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -58,15 +58,12 @@ COPY components /tmp/components
 # OpenVINO toolkit support
 ARG OPENVINO_TOOLKIT
 ENV OPENVINO_TOOLKIT=${OPENVINO_TOOLKIT}
+ENV REID_MODEL_DIR=${HOME}/reid
 RUN if [ "$OPENVINO_TOOLKIT" = "yes" ]; then \
-        /tmp/components/openvino/install.sh; \
-    fi
-
-# CUDA support
-ARG CUDA_SUPPORT
-ENV CUDA_SUPPORT=${CUDA_SUPPORT}
-RUN if [ "$CUDA_SUPPORT" = "yes" ]; then \
-        /tmp/components/cuda/install.sh; \
+        /tmp/components/openvino/install.sh && \
+        mkdir ${REID_MODEL_DIR} && \
+        wget https://download.01.org/openvinotoolkit/2018_R5/open_model_zoo/person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.xml -O reid/reid.xml && \
+        wget https://download.01.org/openvinotoolkit/2018_R5/open_model_zoo/person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.bin -O reid/reid.bin; \
     fi
 
 # Tensorflow annotation support
@@ -120,12 +117,11 @@ RUN apt-get update && \
         echo export "GIT_SSH_COMMAND=\"ssh -o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ProxyCommand='nc -X 5 -x ${socks_proxy} %h %p'\"" >> ${HOME}/.bashrc; \
     fi
 
-# Download model for re-identification app
-ENV REID_MODEL_DIR=${HOME}/reid
-RUN if [ "$OPENVINO_TOOLKIT" = "yes" ]; then \
-    mkdir ${HOME}/reid && \
-    wget https://download.01.org/openvinotoolkit/2018_R5/open_model_zoo/person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.xml -O reid/reid.xml && \
-    wget https://download.01.org/openvinotoolkit/2018_R5/open_model_zoo/person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.bin -O reid/reid.bin; \
+# CUDA support
+ARG CUDA_SUPPORT
+ENV CUDA_SUPPORT=${CUDA_SUPPORT}
+RUN if [ "$CUDA_SUPPORT" = "yes" ]; then \
+    /tmp/components/cuda/install.sh; \
 fi
 
 # TODO: CHANGE URL
diff --git a/README.md b/README.md
index bf5d6a6b..2bca82a5 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ Format selection is possible after clicking on the Upload annotation / Dump anno
 | [YOLO](https://pjreddie.com/darknet/yolo/)                      | X    | X    |
 | [MS COCO Object Detection](http://cocodataset.org/#format-data) | X    | X    |
 | PNG mask                                                        | X    |      |
+| [TFRecord](https://www.tensorflow.org/tutorials/load_data/tf_records) | X | X |
 
 ## Links
 - [Intel AI blog: New Computer Vision Tool Accelerates Annotation of Digital Images and Video](https://www.intel.ai/introducing-cvat)
diff --git a/components/cuda/install.sh b/components/cuda/install.sh
index dd689f77..2cda99fc 100755
--- a/components/cuda/install.sh
+++ b/components/cuda/install.sh
@@ -16,7 +16,7 @@ echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ub
 CUDA_VERSION=9.0.176
 NCCL_VERSION=2.1.15
-CUDNN_VERSION=7.0.5.15
+CUDNN_VERSION=7.6.2.24
 CUDA_PKG_VERSION="9-0=${CUDA_VERSION}-1"
 echo 'export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}' >> ${HOME}/.bashrc
 echo 'export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}' >> ${HOME}/.bashrc
@@ -32,3 +32,6 @@ apt-get update && apt-get install -y --no-install-recommends --allow-unauthentic
     ln -s cuda-9.0 /usr/local/cuda && \
     rm -rf /var/lib/apt/lists/* \
         /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/cuda.list
+
+pip3 uninstall -y tensorflow
+pip3 install --no-cache-dir tensorflow-gpu==1.12.3
diff --git a/components/tf_annotation/install.sh b/components/tf_annotation/install.sh
index bdd1c6e2..fc5ed6b6 100755
--- a/components/tf_annotation/install.sh
+++ b/components/tf_annotation/install.sh
@@ -12,9 +12,4 @@ tar -xzf model.tar.gz && rm model.tar.gz && \
 mv faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28 ${HOME}/rcnn && cd ${HOME} && \
 mv rcnn/frozen_inference_graph.pb rcnn/inference_graph.pb
 
-if [[ "$CUDA_SUPPORT" = "yes" ]]
-then
-    pip3 install --no-cache-dir tensorflow-gpu==1.7.0
-else
-    pip3 install --no-cache-dir tensorflow==1.7.0
-fi
+# tensorflow is installed globally
diff --git a/cvat/apps/annotation/settings.py b/cvat/apps/annotation/settings.py
index f9b28d4d..0ac2a38c 100644
--- a/cvat/apps/annotation/settings.py
+++ b/cvat/apps/annotation/settings.py
@@ -11,4 +11,5 @@ BUILTIN_FORMATS = (
     os.path.join(path_prefix, 'yolo.py'),
     os.path.join(path_prefix, 'coco.py'),
     os.path.join(path_prefix, 'mask.py'),
+    os.path.join(path_prefix, 'tfrecord.py'),
 )
diff --git a/cvat/apps/annotation/tfrecord.py b/cvat/apps/annotation/tfrecord.py
new file mode 100644
index 00000000..a911625e
--- /dev/null
+++ b/cvat/apps/annotation/tfrecord.py
@@ -0,0 +1,182 @@
+# Copyright (C) 2019 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+format_spec = {
+    "name": "TFRecord",
+    "dumpers": [
+        {
+            "display_name": "{name} {format} {version}",
+            "format": "ZIP",
+            "version": "1.0",
+            "handler": "dump"
+        },
+    ],
+    "loaders": [
+        {
+            "display_name": "{name} {format} {version}",
+            "format": "ZIP",
+            "version": "1.0",
+            "handler": "load"
+        },
+    ],
+}
+
+def dump(file_object, annotations):
+    import tensorflow as tf
+    import os
+    import string
+    from zipfile import ZipFile
+    import codecs
+    from tempfile import TemporaryDirectory
+    from collections import OrderedDict
+
+    # we need it to filter out non-ASCII characters, otherwise
+    # training will crash
+    printable = set(string.printable)
+
+    def int64_feature(value):
+        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+
+    def int64_list_feature(value):
+        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+
+    def bytes_feature(value):
+        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+    def bytes_list_feature(value):
+        return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
+
+    def float_list_feature(value):
+        return tf.train.Feature(float_list=tf.train.FloatList(value=value))
+
+    # Define the main conversion function
+    def create_tf_example(img_id, img_size, image_name, boxes, label_ids):
+        # Process one image's data per call
+        height = img_size[0]
+        width = img_size[1]
+
+        xmins = [] # List of normalized left x coordinates in bounding box (1 per box)
+        xmaxs = [] # List of normalized right x coordinates in bounding box
+                   # (1 per box)
+        ymins = [] # List of normalized top y coordinates in bounding box (1 per box)
+        ymaxs = [] # List of normalized bottom y coordinates in bounding box
+                   # (1 per box)
+        classes_text = [] # List of string class name of bounding box (1 per box)
+        classes = [] # List of integer class id of bounding box (1 per box)
+
+        # Loop over the boxes and fill the above fields
+        for box in boxes:
+            # filter out non-ASCII characters
+            box_name = ''.join(filter(lambda x: x in printable, box.label))
+
+            xmins.append(box.points[0] / width)
+            xmaxs.append(box.points[2] / width)
+            ymins.append(box.points[1] / height)
+            ymaxs.append(box.points[3] / height)
+            classes_text.append(box_name.encode('utf8'))
+            classes.append(label_ids[box.label])
+
+        tf_example = tf.train.Example(features=tf.train.Features(feature={
+            'image/height': int64_feature(height),
+            'image/width': int64_feature(width),
+            'image/filename': bytes_feature(image_name.encode('utf8')),
+            'image/source_id': int64_feature(img_id),
+            'image/object/bbox/xmin': float_list_feature(xmins),
+            'image/object/bbox/xmax': float_list_feature(xmaxs),
+            'image/object/bbox/ymin': float_list_feature(ymins),
+            'image/object/bbox/ymax': float_list_feature(ymaxs),
+            'image/object/class/text': bytes_list_feature(classes_text),
+            'image/object/class/label': int64_list_feature(classes),
+        }))
+        return tf_example
+
+    # Create the label map file
+    label_ids = OrderedDict((label[1]["name"], idx) for idx, label in enumerate(annotations.meta["task"]["labels"]))
+    with TemporaryDirectory() as out_dir:
+        labelmap_file = 'label_map.pbtxt'
+        with codecs.open(os.path.join(out_dir, labelmap_file), 'w', encoding='utf8') as f:
+            for label, idx in label_ids.items():
+                f.write(u'item {\n')
+                f.write(u'\tid: {}\n'.format(idx))
+                f.write(u"\tname: '{}'\n".format(label))
+                f.write(u'}\n\n')
+
+        annotation_file = '{}.tfrecord'.format(annotations.meta['task']['name'])
+        with tf.io.TFRecordWriter(os.path.join(out_dir, annotation_file)) as writer:
+            for frame_annotation in annotations.group_by_frame():
+                boxes = [shape for shape in frame_annotation.labeled_shapes if shape.type == 'rectangle']
+                if not boxes:
+                    continue
+                tf_example = create_tf_example(
+                    img_id=frame_annotation.frame,
+                    img_size=(frame_annotation.height, frame_annotation.width),
+                    image_name=frame_annotation.name,
+                    boxes=boxes,
+                    label_ids=label_ids,
+                )
+                writer.write(tf_example.SerializeToString())
+
+        with ZipFile(file_object, 'w') as output_zip:
+            output_zip.write(filename=os.path.join(out_dir, labelmap_file), arcname=labelmap_file)
+            output_zip.write(filename=os.path.join(out_dir, annotation_file), arcname=annotation_file)
+
+def load(file_object, annotations):
+    from pyunpack import Archive
+    from tempfile import TemporaryDirectory
+    import os
+    import tensorflow as tf
+    from glob import glob
+    import numpy as np
+
+    tf.enable_eager_execution()
+
+    def parse_tfrecord_file(filenames):
+        def clamp(value, _min, _max):
+            return max(min(_max, value), _min)
+
+        dataset = tf.data.TFRecordDataset(filenames)
+        image_feature_description = {
+            'image/filename': tf.io.FixedLenFeature([], tf.string),
+            'image/source_id': tf.io.FixedLenFeature([], tf.int64),
+            'image/height': tf.io.FixedLenFeature([], tf.int64),
+            'image/width': tf.io.FixedLenFeature([], tf.int64),
+            # Object boxes and classes.
+            'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
+            'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
+            'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
+            'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
+            'image/object/class/label': tf.io.VarLenFeature(tf.int64),
+            'image/object/class/text': tf.io.VarLenFeature(tf.string),
+        }
+
+        for record in dataset:
+            parsed_record = tf.io.parse_single_example(record, image_feature_description)
+            frame_number = tf.cast(parsed_record['image/source_id'], tf.int64).numpy().item()
+            frame_height = tf.cast(parsed_record['image/height'], tf.int64).numpy().item()
+            frame_width = tf.cast(parsed_record['image/width'], tf.int64).numpy().item()
+            xmins = tf.sparse.to_dense(parsed_record['image/object/bbox/xmin']).numpy()
+            ymins = tf.sparse.to_dense(parsed_record['image/object/bbox/ymin']).numpy()
+            xmaxs = tf.sparse.to_dense(parsed_record['image/object/bbox/xmax']).numpy()
+            ymaxs = tf.sparse.to_dense(parsed_record['image/object/bbox/ymax']).numpy()
+            labels = tf.sparse.to_dense(parsed_record['image/object/class/text'], default_value='').numpy()
+            for shape in np.dstack((labels, xmins, ymins, xmaxs, ymaxs))[0]:
+                annotations.add_shape(annotations.LabeledShape(
+                    type='rectangle',
+                    frame=frame_number,
+                    label=shape[0].decode("utf-8"),
+                    points=[
+                        clamp(shape[1] * frame_width, 0, frame_width),
+                        clamp(shape[2] * frame_height, 0, frame_height),
+                        clamp(shape[3] * frame_width, 0, frame_width),
+                        clamp(shape[4] * frame_height, 0, frame_height),
+                    ],
+                    occluded=False,
+                    attributes=[],
+                ))
+
+    archive_file = getattr(file_object, 'name')
+    with TemporaryDirectory() as tmp_dir:
+        Archive(archive_file).extractall(tmp_dir)
+        filenames = glob(os.path.join(tmp_dir, '*.tfrecord'))
+        parse_tfrecord_file(filenames)
diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt
index 04a201ee..a3f22e82 100644
--- a/cvat/requirements/base.txt
+++ b/cvat/requirements/base.txt
@@ -38,3 +38,4 @@ pascal_voc_writer==0.1.4
 cython==0.29.13
 matplotlib==3.0.3
 scikit-image>=0.14.0
+tensorflow==1.12.3
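
A quick way to sanity-check a dumped archive: the minimal sketch below (not part
of the patch) extracts the zip and prints each record's filename and class
labels. The 'task_annotations.zip' path is a hypothetical example; the feature
keys and the tensorflow 1.12 eager-mode calls simply mirror the loader above.

# Hedged sketch: inspect a TFRecord archive produced by the dumper above.
# Assumes tensorflow==1.12.3 as pinned in this patch; 'task_annotations.zip'
# is a hypothetical archive name.
import os
import zipfile
from glob import glob
from tempfile import TemporaryDirectory

import tensorflow as tf

tf.enable_eager_execution()

# A subset of the feature keys that the dumper writes and the loader parses.
features = {
    'image/filename': tf.io.FixedLenFeature([], tf.string),
    'image/source_id': tf.io.FixedLenFeature([], tf.int64),
    'image/height': tf.io.FixedLenFeature([], tf.int64),
    'image/width': tf.io.FixedLenFeature([], tf.int64),
    'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
    'image/object/class/text': tf.io.VarLenFeature(tf.string),
}

with TemporaryDirectory() as tmp_dir:
    # The archive also contains label_map.pbtxt; only the .tfrecord is read here.
    with zipfile.ZipFile('task_annotations.zip') as archive:
        archive.extractall(tmp_dir)
    dataset = tf.data.TFRecordDataset(glob(os.path.join(tmp_dir, '*.tfrecord')))
    for record in dataset:
        parsed = tf.io.parse_single_example(record, features)
        labels = tf.sparse.to_dense(parsed['image/object/class/text'], default_value='')
        print(parsed['image/filename'].numpy().decode('utf-8'),
              [l.decode('utf-8') for l in labels.numpy()])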