cvat/utils/yolo/converter.py

#!/usr/bin/env python
#
# SPDX-License-Identifier: MIT
"""
Given a CVAT XML, this script reads the CVAT XML and writes the
annotations in YOLO format into a given directory.

This implementation supports both interpolation tracks from video and
annotated images.
"""

import os
import argparse
import glog as log
from lxml import etree
import requests


def parse_args():
    """
    Build the command-line parser and parse the process arguments

    Every option is registered from a single table of
    (flag, metavar, required, help) entries, in the order they are listed.

    :return: namespace with `cvat_xml`, `image_dir`, `output_dir`,
             `username`, `password` and `labels` attributes
    """
    cli = argparse.ArgumentParser(
        description='Convert CVAT XML annotations to YOLO format'
    )

    option_table = (
        ('--cvat-xml', 'FILE', True,
         'input file with CVAT annotation in xml format'),
        ('--image-dir', 'DIRECTORY', False,
         'directory which contains original images'),
        ('--output-dir', 'DIRECTORY', True,
         'directory for output annotations in YOLO format'),
        ('--username', 'USERNAME', False,
         'Username from CVAT Login page, required to download images'),
        ('--password', 'PASSWORD', False,
         'Password from CVAT Login page, required to download images'),
        ('--labels', 'ILABELS', False,
         'Labels (separated by comma) to extract. Example: car,truck,motorcycle'),
    )

    for flag, placeholder, mandatory, description in option_table:
        cli.add_argument(
            flag, metavar=placeholder, required=mandatory, help=description
        )

    return cli.parse_args()


def _yolo_line(label_id, xtl, ytl, xbr, ybr, width, height):
    """
    Format one bounding box as a line of a YOLO annotation file

    :param label_id: numeric index of the box label
    :param xtl: x coordinate of the top-left corner
    :param ytl: y coordinate of the top-left corner
    :param xbr: x coordinate of the bottom-right corner
    :param ybr: y coordinate of the bottom-right corner
    :param width: image width, used to normalize x values
    :param height: image height, used to normalize y values
    :return: "<label_id> <x_center> <y_center> <box_width> <box_height>",
             the four values divided by the image size, with 6 decimals
    """
    box_w = xbr - xtl
    box_h = ybr - ytl
    return "{} {:.6f} {:.6f} {:.6f} {:.6f}".format(
        label_id,
        (xtl + box_w / 2) / width,
        (ytl + box_h / 2) / height,
        box_w / width,
        box_h / height,
    )


def _write_yolo_annotation(image_dir, image_name, boxes, labels, width, height):
    """
    Write the YOLO annotation file of one image into image_dir

    :param image_dir: directory receiving the `.txt` file
    :param image_name: image file name; its base name with a `.txt`
                       extension is used as the annotation file name
    :param boxes: iterable of objects exposing `get()` for the keys
                  'label', 'xtl', 'ytl', 'xbr' and 'ybr' (XML elements or dicts)
    :param labels: mapping from label name to numeric index
    :param width: image width
    :param height: image height
    :raises Exception: when a box carries a label missing from `labels`
    """
    lines = []
    for box in boxes:
        label = box.get('label')
        xtl = float(box.get('xtl'))
        ytl = float(box.get('ytl'))
        xbr = float(box.get('xbr'))
        ybr = float(box.get('ybr'))

        if label not in labels:
            raise Exception('Unexpected label name {}'.format(label))

        lines.append(_yolo_line(labels[label], xtl, ytl, xbr, ybr, width, height))

    anno_name = os.path.basename(os.path.splitext(image_name)[0] + '.txt')
    anno_path = os.path.join(image_dir, anno_name)

    # One box per line, no trailing newline; an image without boxes gets an
    # empty file.
    with open(anno_path, "w", newline="\n") as anno_file:
        anno_file.write("\n".join(lines))


class _FrameDownloader:
    """
    Download frames of the task described by a CVAT XML, logging in on demand

    Neither the network nor the task id / segment URL of the XML is touched
    before the first call to `fetch`, so a conversion whose images are all
    available locally needs no server and no such metadata.
    """

    def __init__(self, cvat_xml, username, password):
        self._cvat_xml = cvat_xml
        self._username = username
        self._password = password
        self._session = None
        self._urlbase = None
        self._taskid = None
        # "none" marks a token / session id that has not been obtained yet.
        self._csrftoken = "none"
        self._sessionid = "none"

    def _connect(self):
        """Read the task id and server URL from the XML and open the session"""
        self._taskid = int(self._cvat_xml.find('.//task/id').text)
        urlsegment = self._cvat_xml.find(".//segments/segment/url").text
        self._urlbase = urlsegment.split("?")[0]

        self._session = requests.session()
        # Initial request; the login step reads the CSRF cookie (if the
        # server set one) from this session.
        self._session.get(self._urlbase)

    def _login(self):
        """Post the credentials and remember the session id cookie, if any"""
        cookies = self._session.cookies
        if "csrftoken" in cookies:
            self._csrftoken = cookies["csrftoken"]
        elif "csrf" in cookies:
            self._csrftoken = cookies["csrf"]

        login_data = dict(username=self._username, password=self._password,
                          csrfmiddlewaretoken=self._csrftoken, next='/dashboard')

        urllogin = self._urlbase + "/auth/login"
        self._session.post(urllogin, data=login_data,
                           headers=dict(Referer=urllogin))

        if "sessionid" in self._session.cookies:
            self._sessionid = self._session.cookies["sessionid"]

    def fetch(self, frameid, image_path):
        """
        Download frame `frameid` of the task and store it at `image_path`

        Nothing is written when the server answers with an HTTP error status;
        a warning is logged instead, so that an error page is never saved as
        an image.
        """
        if self._session is None:
            self._connect()

        log.info('{} image cannot be found. Downloading from task ID {}\n'.format(image_path, self._taskid))

        # Retried on every frame for as long as no session id was obtained.
        if self._sessionid == "none":
            self._login()

        url = self._urlbase + "/api/v1/tasks/" + str(self._taskid) + "/frames/" + str(frameid)
        req = self._session.get(url, headers=dict(
            csrftoken=self._csrftoken, sessionid=self._sessionid))

        if not req.ok:
            log.warn('Cannot download {}: HTTP status {}'.format(url, req.status_code))
            return

        with open(image_path, 'wb') as fo:
            fo.write(req.content)
        print('Url saved as %s\n' % image_path)

    def close(self):
        """Release the HTTP session if one was opened"""
        if self._session is not None:
            self._session.close()
            self._session = None


def process_cvat_xml(xml_file, image_dir, output_dir, username, password, ilabels):
    """
    Transforms a single XML in CVAT format to YOLO TXT files and download images when not in IMAGE_DIR

    When the XML contains tracks (video interpolation), one annotation file
    per frame is produced and the frame images are expected to be named
    `<xml base name>_<8-digit frame id>.jpg`; missing ones are downloaded from
    the CVAT server when a username is given. Otherwise the `image` elements
    of the XML are converted. In both cases the annotation files are written
    into `image_dir`, and `<output_dir>/train.txt` lists the path of every
    processed image, one per line.

    :param xml_file: CVAT format XML
    :param image_dir: image directory of the dataset; defaults to
                      `<output_dir>/data/obj` when None. Created if missing
    :param output_dir: directory of annotations with YOLO format
    :param username: Username used to login CVAT. Required to download images
    :param password: Password used to login CVAT. Required to download images
    :param ilabels: Comma separated ordered labels (surrounding whitespace of
                    each label is ignored); when None the labels declared in
                    the XML metadata are used, in document order
    :raises Exception: when an annotation uses a label that is not known
    :return:
    """
    KNOWN_TAGS = {'box', 'image', 'attribute'}

    if image_dir is None:
        image_dir = os.path.join(output_dir, "data/obj")

    # Annotation files and downloaded frames are written into image_dir, so it
    # has to exist even when the caller supplied it.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    cvat_xml = etree.parse(xml_file)
    basename = os.path.splitext(os.path.basename(xml_file))[0]

    if ilabels is not None:
        label_names = [name.strip() for name in ilabels.split(',')]
    else:
        label_names = [node.text for node in cvat_xml.findall('meta/task/labels/label/name')]
    current_labels = {name: idx for idx, name in enumerate(label_names)}

    train_images = []
    tracks = cvat_xml.findall('.//track')

    if tracks:
        # frame id -> {track id -> box}; a frame whose boxes are all marked
        # "outside" is kept with no boxes and yields an empty annotation file.
        frames = {}

        for track in tracks:
            trackid = int(track.get("id"))
            label = track.get("label")
            for box in track.findall('./box'):
                frameid = int(box.get('frame'))
                outside = int(box.get('outside'))
                xtl = float(box.get('xtl'))
                ytl = float(box.get('ytl'))
                xbr = float(box.get('xbr'))
                ybr = float(box.get('ybr'))

                frame = frames.setdefault(frameid, {})
                if outside == 0:
                    frame[trackid] = {'xtl': xtl, 'ytl': ytl, 'xbr': xbr, 'ybr': ybr, 'label': label}

        width = int(cvat_xml.find('.//original_size/width').text)
        height = int(cvat_xml.find('.//original_size/height').text)

        downloader = _FrameDownloader(cvat_xml, username, password)
        try:
            for frameid in sorted(frames):
                image_name = "%s_%08d.jpg" % (basename, frameid)
                image_path = os.path.join(image_dir, image_name)
                if not os.path.exists(image_path):
                    if username is None:
                        log.warn('{} image cannot be found. Is `{}` image directory correct?\n'.format(image_path, image_dir))
                    else:
                        downloader.fetch(frameid, image_path)

                frame = frames[frameid]
                # Boxes are emitted in ascending track id order.
                boxes = [frame[objid] for objid in sorted(frame)]
                _write_yolo_annotation(image_dir, image_name, boxes, current_labels, width, height)
                train_images.append(image_path)
        finally:
            downloader.close()

    else:
        for img_tag in cvat_xml.findall('image'):
            image_name = img_tag.get('name')
            width = int(img_tag.get('width'))
            height = int(img_tag.get('height'))
            image_path = os.path.join(image_dir, image_name)
            if not os.path.exists(image_path):
                log.warn('{} image cannot be found. Is `{}` image directory correct?'.
                    format(image_path, image_dir))

            unknown_tags = {x.tag for x in img_tag.iter()}.difference(KNOWN_TAGS)
            if unknown_tags:
                log.warn('Ignoring tags for image {}: {}'.format(image_path, unknown_tags))

            _write_yolo_annotation(image_dir, image_name, img_tag.findall('box'),
                                   current_labels, width, height)
            train_images.append(image_path)

    with open(os.path.join(output_dir, "train.txt"), "w", newline="\n") as traintxt_file:
        traintxt_file.write("\n".join(train_images))


def main():
    """Command-line entry point: parse the options and run the conversion"""
    options = parse_args()
    process_cvat_xml(
        xml_file=options.cvat_xml,
        image_dir=options.image_dir,
        output_dir=options.output_dir,
        username=options.username,
        password=options.password,
        ilabels=options.labels,
    )


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()