Fix upload of big XML files for tasks (#199)

* Fix #123 (Uploading large XML Files)
* Integrate SILK for profiling.
* Don't try to call save_job without objects.
Nikita Manovich committed via GitHub 7 years ago
parent 076899afa3
commit 1a040a0608
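The heart of the fix is chunked uploading: instead of sending one huge POST with every parsed object, the client first issues a DELETE for the task's previous annotations and then POSTs the objects in fixed-size slices (asyncSaveChunk in the diff below). A minimal Python sketch of that slicing scheme, assuming parsed maps each shape type to a list the way the client parser produces it; the helper is illustrative and not part of the commit:

CHUNK_SIZE = 100000

def iter_chunks(parsed, chunk_size=CHUNK_SIZE):
    """Yield per-type slices until every list in parsed is exhausted."""
    start = 0
    while True:
        chunk = {prop: values[start:start + chunk_size]
                 for prop, values in parsed.items()}
        if not any(chunk.values()):
            break  # no shape type has objects left
        yield chunk
        start += chunk_size

# Shape types of different lengths drain together: 250000 boxes
# produce three requests of 100000 + 100000 + 50000 objects.
parsed = {'boxes': list(range(250000)), 'polygons': list(range(50000))}
assert sum(len(c['boxes']) for c in iter_chunks(parsed)) == 250000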

.gitignore

@@ -7,6 +7,7 @@
/keys
/logs
/components/openvino/*.tgz
/profiles
# Ignore temporary files
docker-compose.override.yml

@@ -538,29 +538,62 @@ function uploadAnnotationRequest() {
}
catch(error) {
overlay.remove();
showMessage("Parsing errors was occured. " + error);
showMessage("Parsing errors was occurred. " + error);
return;
}
const exportData = createExportContainer();
exportData.create = parsed;
let asyncSave = function() {
$.ajax({
url: '/save/annotation/task/' + window.cvat.dashboard.taskID,
type: 'POST',
data: JSON.stringify(exportData),
contentType: 'application/json',
url: '/delete/annotation/task/' + window.cvat.dashboard.taskID,
type: 'DELETE',
success: function() {
let message = 'Annotation successfully uploaded';
showMessage(message);
asyncSaveChunk(0);
},
error: function(response) {
let message = 'Annotation uploading errors was occured. ' + response.responseText;
let message = 'Previous annotations cannot be deleted: ' +
response.responseText;
showMessage(message);
overlay.remove();
},
complete: () => overlay.remove()
});
}
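// Send the parsed objects in slices of CHUNK_SIZE items per shape type,
// recursing until every per-type array is exhausted; the overlay is only
// removed once the final (empty) slice is reached or a request fails.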
let asyncSaveChunk = function(start) {
const CHUNK_SIZE = 100000;
let end = start + CHUNK_SIZE;
let chunk = {};
let next = false;
for (let prop in parsed) {
if (parsed.hasOwnProperty(prop)) {
chunk[prop] = parsed[prop].slice(start, end);
next |= chunk[prop].length > 0;
}
}
if (next) {
let exportData = createExportContainer();
exportData.create = chunk;
$.ajax({
url: '/save/annotation/task/' + window.cvat.dashboard.taskID,
type: 'POST',
data: JSON.stringify(exportData),
contentType: 'application/json',
success: function() {
asyncSaveChunk(end);
},
error: function(response) {
let message = 'Annotation upload errors occurred: ' +
response.responseText;
showMessage(message);
overlay.remove();
},
});
} else {
let message = 'Annotations were uploaded successfully';
showMessage(message);
overlay.remove();
}
};
overlay.setMessage('Annotation is being saved...');

@@ -19,6 +19,7 @@ import django_rq
from django.conf import settings
from django.db import transaction
from cvat.apps.profiler import silk_profile
from . import models
from .task import get_frame_path, get_image_meta_cache
from .log import slogger
@@ -70,16 +71,17 @@ def get(jid):
return annotation.to_client()
@silk_profile(name="Save job")
@transaction.atomic
def save_job(jid, data, delete_old_data=False):
def save_job(jid, data):
"""
Save new annotations for the job.
"""
db_job = models.Job.objects.select_for_update().get(id=jid)
slogger.job[jid].info("Enter save_job API: jid = {}".format(jid))
db_job = models.Job.objects.select_related('segment__task') \
.select_for_update().get(id=jid)
annotation = _AnnotationForJob(db_job)
if delete_old_data:
annotation.delete_all_shapes_from_db()
annotation.validate_data_from_client(data)
annotation.delete_from_db(data['delete'])
@@ -88,12 +90,33 @@ def save_job(jid, data, delete_old_data=False):
db_job.segment.task.updated_date = timezone.now()
db_job.segment.task.save()
slogger.job[jid].info("Leave save_job API: jid = {}".format(jid))
@silk_profile(name="Clear job")
@transaction.atomic
def clear_job(jid):
"""
Clear annotations for the job.
"""
slogger.job[jid].info("Enter clear_job API: jid = {}".format(jid))
db_job = models.Job.objects.select_related('segment__task') \
.select_for_update().get(id=jid)
annotation = _AnnotationForJob(db_job)
annotation.delete_all_shapes_from_db()
annotation.delete_all_paths_from_db()
db_job.segment.task.updated_date = timezone.now()
db_job.segment.task.save()
slogger.job[jid].info("Leave clear_job API: jid = {}".format(jid))
# pylint: disable=unused-argument
@silk_profile(name="Save task")
def save_task(tid, data):
"""
Save new annotations for the task.
"""
slogger.task[tid].info("Enter save_task API: tid = {}".format(tid))
db_task = models.Task.objects.get(id=tid)
db_segments = list(db_task.segment_set.prefetch_related('job_set').all())
@@ -117,7 +140,35 @@ def save_task(tid, data):
}
for jid, _data in splitted_data.items():
save_job(jid, _data, True)
# If any item inside _data is non-empty, we need to call save_job
isNonEmpty = False
for action in ['create', 'update', 'delete']:
for objects in _data[action].values():
if objects:
isNonEmpty = True
break
if isNonEmpty:
save_job(jid, _data)
slogger.task[tid].info("Leave save_task API: tid = {}".format(tid))
# pylint: disable=unused-argument
@silk_profile(name="Clear task")
def clear_task(tid):
"""
Clear annotations for the task.
"""
slogger.task[tid].info("Enter clear_task API: tid = {}".format(tid))
db_task = models.Task.objects.get(id=tid)
db_segments = list(db_task.segment_set.prefetch_related('job_set').all())
for db_segment in db_segments:
for db_job in list(db_segment.job_set.all()):
clear_job(db_job.id)
slogger.task[tid].info("Leave clear_task API: tid = {}".format(tid))
# pylint: disable=unused-argument
def rq_handler(job, exc_type, exc_value, traceback):
@@ -345,6 +396,14 @@ class _Annotation:
self.points = []
self.points_paths = []
def has_data(self):
non_empty = False
for attr in ['boxes', 'box_paths', 'polygons', 'polygon_paths',
'polylines', 'polyline_paths', 'points', 'points_paths']:
non_empty |= bool(getattr(self, attr))
return non_empty
def get_max_client_id(self):
max_client_id = -1
@@ -473,6 +532,18 @@ class _Annotation:
def to_points_paths(self):
return self._to_poly_paths('points') + self.points_paths
def bulk_create(db_model, objects, flt_param = {}):
if objects:
if flt_param:
if 'postgresql' in settings.DATABASES["default"]["ENGINE"]:
return db_model.objects.bulk_create(objects)
else:
ids = list(db_model.objects.filter(**flt_param).values_list('id', flat=True))
db_model.objects.bulk_create(objects)
return list(db_model.objects.exclude(id__in=ids).filter(**flt_param))
else:
return db_model.objects.bulk_create(objects)
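The bulk_create helper above hides a backend difference: PostgreSQL fills in primary keys on the objects returned by bulk_create, while SQLite does not, so when a filter is supplied on non-PostgreSQL engines the helper snapshots the ids matching flt_param before inserting and afterwards returns the rows that match but were not in the snapshot. (The mutable {} default for flt_param is a well-known Python pitfall, though harmless here because the argument is never mutated.) A database-free sketch of the id-diffing trick, with a plain list standing in for the table and all names hypothetical:

def bulk_insert(db, rows, flt):
    """Snapshot matching ids, insert, then return only the new matches."""
    before_ids = {row['id'] for row in db if flt(row)}
    next_id = max((row['id'] for row in db), default=0) + 1
    for row in rows:              # the backend assigns ids at insert time
        row['id'] = next_id
        next_id += 1
        db.append(row)
    return [row for row in db if flt(row) and row['id'] not in before_ids]

db = [{'id': 1, 'job': 7}]
created = bulk_insert(db, [{'job': 7}, {'job': 8}], flt=lambda r: r['job'] == 7)
assert [r['id'] for r in created] == [2]   # only the new job-7 row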
class _AnnotationForJob(_Annotation):
def __init__(self, db_job):
@@ -487,26 +558,19 @@ class _AnnotationForJob(_Annotation):
self.db_attributes = {db_attr.id:db_attr
for db_attr in models.AttributeSpec.objects.filter(
label__task__id=db_job.segment.task.id)}
self.saved_db_ids = {}
self.saved_client_ids = set()
def _collect_saved_ids(self):
self.saved_db_ids = {}
self.saved_client_ids = set()
def append_ids(shape_type, shape_ids):
for db_id, client_id in shape_ids:
self.saved_db_ids[shape_type].append(db_id)
self.saved_client_ids.add(client_id)
def _get_client_ids_from_db(self):
client_ids = set()
for shape_type in ['polygons', 'polylines', 'points', 'boxes', 'paths']:
self.saved_db_ids[shape_type] = []
saved_path_ids = list(self.db_job.objectpath_set.values_list('id', 'client_id'))
append_ids('paths', saved_path_ids)
ids = list(self.db_job.objectpath_set.values_list('client_id', flat=True))
client_ids.update(ids)
for shape_type in ['polygons', 'polylines', 'points', 'boxes']:
saved_shapes_ids = list(self._get_shape_class(shape_type).objects.filter(job_id=self.db_job.id).values_list('id', 'client_id'))
append_ids(shape_type, saved_shapes_ids)
ids = list(self._get_shape_class(shape_type).objects.filter(
job_id=self.db_job.id).values_list('client_id', flat=True))
client_ids.update(ids)
return client_ids
def _merge_table_rows(self, rows, keys_for_merge, field_id):
"""dot.notation access to dictionary attributes"""
@@ -1033,6 +1097,8 @@ class _AnnotationForJob(_Annotation):
getattr(self, poly_path_type).append(poly_path)
return self.has_data()
def _get_shape_class(self, shape_type):
if shape_type == 'polygons':
return models.LabeledPolygon
@@ -1075,8 +1141,6 @@ class _AnnotationForJob(_Annotation):
db_path_attrvals = []
db_shapes = []
db_shape_attrvals = []
# Need to be sure saved_db_ids is up to date.
self._collect_saved_ids()
shapes = getattr(self, shape_type)
for path in shapes:
@@ -1137,38 +1201,18 @@ class _AnnotationForJob(_Annotation):
db_shapes.append(db_shape)
db_paths.append(db_path)
db_paths = models.ObjectPath.objects.bulk_create(db_paths)
if db_paths and db_paths[0].id == None:
# Try to get primary keys. The code probably works for sqlite,
# but it definitely doesn't work for Postgres. Note that for
# Postgres bulk_create returns objects with ids even when the ids
# are auto-incremented, so we will never enter this 'if'.
if shape_type == 'polygon_paths':
db_paths = list(self.db_job.objectpath_set.exclude(id__in=self.saved_db_ids['paths']))
elif shape_type == 'polyline_paths':
db_paths = list(self.db_job.objectpath_set.exclude(id__in=self.saved_db_ids['paths']))
elif shape_type == 'box_paths':
db_paths = list(self.db_job.objectpath_set.exclude(id__in=self.saved_db_ids['paths']))
elif shape_type == 'points_paths':
db_paths = list(self.db_job.objectpath_set.exclude(id__in=self.saved_db_ids['paths']))
db_paths = bulk_create(models.ObjectPath, db_paths,
{"job_id": self.db_job.id})
for db_attrval in db_path_attrvals:
db_attrval.track_id = db_paths[db_attrval.track_id].id
models.ObjectPathAttributeVal.objects.bulk_create(db_path_attrvals)
bulk_create(models.ObjectPathAttributeVal, db_path_attrvals)
for db_shape in db_shapes:
db_shape.track_id = db_paths[db_shape.track_id].id
db_shapes_ids = list(self._get_shape_class(shape_type).objects.filter(track__job_id=self.db_job.id).values_list('id', flat=True))
db_shapes = self._get_shape_class(shape_type).objects.bulk_create(db_shapes)
if db_shapes and db_shapes[0].id == None:
# Try to get primary keys. The code probably works for sqlite,
# but it definitely doesn't work for Postgres. Note that for
# Postgres bulk_create returns objects with ids even when the ids
# are auto-incremented, so we will never enter this 'if'.
db_shapes = list(self._get_shape_class(shape_type).objects.exclude(id__in=db_shapes_ids).filter(track__job_id=self.db_job.id))
db_shapes = bulk_create(self._get_shape_class(shape_type), db_shapes,
{"track__job_id": self.db_job.id})
for db_attrval in db_shape_attrvals:
if shape_type == 'polygon_paths':
@@ -1180,7 +1224,7 @@ class _AnnotationForJob(_Annotation):
elif shape_type == 'points_paths':
db_attrval.points_id = db_shapes[db_attrval.points_id].id
self._get_shape_attr_class(shape_type).objects.bulk_create(db_shape_attrvals)
bulk_create(self._get_shape_attr_class(shape_type), db_shape_attrvals)
def _get_shape_set(self, shape_type):
if shape_type == 'polygons':
@@ -1193,8 +1237,6 @@ class _AnnotationForJob(_Annotation):
return self.db_job.labeledpoints_set
def _save_shapes_to_db(self):
# Need to be sure saved_db_ids is up to date.
self._collect_saved_ids()
for shape_type in ['polygons', 'polylines', 'points', 'boxes']:
db_shapes = []
db_attrvals = []
@@ -1234,14 +1276,8 @@ class _AnnotationForJob(_Annotation):
db_shapes.append(db_shape)
db_shapes = self._get_shape_class(shape_type).objects.bulk_create(db_shapes)
if db_shapes and db_shapes[0].id == None:
# Try to get primary keys. The code probably works for sqlite,
# but it definitely doesn't work for Postgres. Note that for
# Postgres bulk_create returns objects with ids even when the ids
# are auto-incremented, so we will never enter this 'if'.
db_shapes = list(self._get_shape_set(shape_type).exclude(id__in=self.saved_db_ids[shape_type]))
db_shapes = bulk_create(self._get_shape_class(shape_type), db_shapes,
{"job_id": self.db_job.id})
for db_attrval in db_attrvals:
if shape_type == 'polygons':
@@ -1253,7 +1289,7 @@ class _AnnotationForJob(_Annotation):
else:
db_attrval.points_id = db_shapes[db_attrval.points_id].id
self._get_shape_attr_class(shape_type).objects.bulk_create(db_attrvals)
bulk_create(self._get_shape_attr_class(shape_type), db_attrvals)
def _update_shapes_in_db(self):
client_ids_to_delete = {}
@@ -1278,38 +1314,35 @@
raise Exception('Number of deleted objects doesn\'t match the requested number')
def _delete_paths_from_db(self, data):
client_ids_to_delete = []
for shape_type in ['polygon_paths', 'polyline_paths', 'points_paths', 'box_paths']:
client_ids_to_delete = data[shape_type]
deleted = self.db_job.objectpath_set.filter(client_id__in=client_ids_to_delete).delete()
class_name = 'engine.ObjectPath'
if not (deleted[0] == 0 and len(client_ids_to_delete) == 0) and \
(class_name in deleted[1] and deleted[1][class_name] != len(client_ids_to_delete)):
raise Exception('Number of deleted objects doesn\'t match the requested number')
client_ids_to_delete.extend(data[shape_type])
deleted = self.db_job.objectpath_set.filter(client_id__in=client_ids_to_delete).delete()
class_name = 'engine.ObjectPath'
if not (deleted[0] == 0 and len(client_ids_to_delete) == 0) and \
(class_name in deleted[1] and deleted[1][class_name] != len(client_ids_to_delete)):
raise Exception('Number of deleted objects doesn\'t match the requested number')
def _delete_all_shapes_from_db(self):
def delete_all_shapes_from_db(self):
for shape_type in ['polygons', 'polylines', 'points', 'boxes']:
self._get_shape_set(shape_type).all().delete()
def _delete_all_paths_from_db(self):
def delete_all_paths_from_db(self):
self.db_job.objectpath_set.all().delete()
def delete_all_shapes_from_db(self):
self._delete_all_shapes_from_db()
self._delete_all_paths_from_db()
def delete_from_db(self, data):
self._delete_shapes_from_db(data)
self._delete_paths_from_db(data)
def update_in_db(self, data):
self.init_from_client(data)
self._update_shapes_in_db()
self._update_paths_in_db()
if self.init_from_client(data):
self._update_shapes_in_db()
self._update_paths_in_db()
def save_to_db(self, data):
self.init_from_client(data)
self._save_shapes_to_db()
self._save_paths_to_db()
if self.init_from_client(data):
self._save_shapes_to_db()
self._save_paths_to_db()
def to_client(self):
data = {
@@ -1396,7 +1429,7 @@ class _AnnotationForJob(_Annotation):
return data
def validate_data_from_client(self, data):
self._collect_saved_ids()
db_client_ids = self._get_client_ids_from_db()
client_ids = {
'create': set(),
'update': set(),
@@ -1428,15 +1461,15 @@ class _AnnotationForJob(_Annotation):
if tmp_res:
raise Exception('More than one action for shape(s) with id={}'.format(tmp_res))
tmp_res = (self.saved_client_ids - client_ids['delete']) & client_ids['create']
tmp_res = (db_client_ids - client_ids['delete']) & client_ids['create']
if tmp_res:
raise Exception('Trying to create new shape(s) with existing client id {}'.format(tmp_res))
tmp_res = client_ids['delete'] - self.saved_client_ids
tmp_res = client_ids['delete'] - db_client_ids
if tmp_res:
raise Exception('Trying to delete shape(s) with nonexistent client id {}'.format(tmp_res))
tmp_res = client_ids['update'] - (self.saved_client_ids - client_ids['delete'])
tmp_res = client_ids['update'] - (db_client_ids - client_ids['delete'])
if tmp_res:
raise Exception('Trying to update shape(s) with nonexistent client id {}'.format(tmp_res))
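The three checks above are plain set algebra over client ids. A tiny worked example with hypothetical ids, for which all three checks pass (i.e. the request is valid): it deletes shape 3, re-creates it, adds a new shape 4, and updates shape 2.

# Ids already stored for the job, plus the ids named in the request.
db_client_ids = {1, 2, 3}
create, update, delete = {3, 4}, {2}, {3}

# 1. May not create an id that still exists after the deletions.
assert (db_client_ids - delete) & create == set()
# 2. May not delete an id the job never had.
assert delete - db_client_ids == set()
# 3. May not update an id that is absent or deleted in the same request.
assert update - (db_client_ids - delete) == set()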

@@ -20,6 +20,7 @@ urlpatterns = [
path('download/annotation/task/<int:tid>', views.download_annotation),
path('save/annotation/job/<int:jid>', views.save_annotation_for_job),
path('save/annotation/task/<int:tid>', views.save_annotation_for_task),
path('delete/annotation/task/<int:tid>', views.delete_annotation_for_task),
path('get/annotation/job/<int:jid>', views.get_annotation),
path('get/username', views.get_username),
path('save/exception/<int:jid>', views.catch_client_exception),

@@ -281,6 +281,20 @@ def save_annotation_for_task(request, tid):
return HttpResponse()
@login_required
@permission_required(perm=['engine.task.change'],
fn=objectgetter(models.Task, 'tid'), raise_exception=True)
def delete_annotation_for_task(request, tid):
try:
slogger.task[tid].info("delete annotation request")
annotation.clear_task(tid)
except Exception as e:
slogger.task[tid].error("cannot delete annotation", exc_info=True)
return HttpResponseBadRequest(str(e))
return HttpResponse()
@login_required
@permission_required(perm=['engine.job.change'],
fn=objectgetter(models.Job, 'jid'), raise_exception=True)

@@ -0,0 +1,13 @@
from django.apps import apps
if apps.is_installed('silk'):
from silk.profiling.profiler import silk_profile
else:
from functools import wraps
def silk_profile(name=None):
def profile(f):
@wraps(f)
def wrapped(*args, **kwargs):
return f(*args, **kwargs)
return wrapped
return profile
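The new cvat.apps.profiler module keeps call sites identical whether or not django-silk is installed: without silk, silk_profile degrades into the no-op decorator shown above. A self-contained demonstration of that fallback branch (the decorated function is a hypothetical stand-in):

from functools import wraps

def silk_profile(name=None):        # the fallback branch from the module
    def profile(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            return f(*args, **kwargs)
        return wrapped
    return profile

@silk_profile(name="Save job")
def save_job(jid):                  # hypothetical stand-in
    return jid

assert save_job(42) == 42
assert save_job.__name__ == 'save_job'   # wraps() preserves metadata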

@@ -12,3 +12,4 @@ wrapt==1.10.11
django-extensions==2.0.6
Werkzeug==0.14.1
snakeviz==0.4.2
django-silk==3.0.1

@@ -2,4 +2,3 @@
# Copyright (C) 2018 Intel Corporation
#
# SPDX-License-Identifier: MIT

@@ -1,4 +1,3 @@
# Copyright (C) 2018 Intel Corporation
#
# SPDX-License-Identifier: MIT
@@ -55,7 +54,7 @@ INSTALLED_APPS = [
'sendfile',
'dj_pagination',
'revproxy',
'rules'
'rules',
]
if 'yes' == os.environ.get('TF_ANNOTATION', 'no'):
@@ -174,6 +173,10 @@ CACHEOPS = {
# Automatically cache any Task.objects.get() calls for 15 minutes
# This also includes .first() and .last() calls.
'engine.task': {'ops': 'get', 'timeout': 60*15},
# Automatically cache any Job.objects.get() calls for 15 minutes
# This also includes .first() and .last() calls.
'engine.job': {'ops': 'get', 'timeout': 60*15},
}
CACHEOPS_DEGRADE_ON_FAILURE = True

@@ -1,4 +1,3 @@
# Copyright (C) 2018 Intel Corporation
#
# SPDX-License-Identifier: MIT
@@ -8,13 +7,36 @@ from .base import *
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/2.0/howto/deployment/checklist/
INSTALLED_APPS += [
'django_extensions',
'silk'
]
MIDDLEWARE += [
'silk.middleware.SilkyMiddleware',
]
# Django profiler
# https://github.com/jazzband/django-silk
SILKY_PYTHON_PROFILER = True
SILKY_PYTHON_PROFILER_BINARY = True
SILKY_PYTHON_PROFILER_RESULT_PATH = os.path.join(BASE_DIR, 'profiles/')
os.makedirs(SILKY_PYTHON_PROFILER_RESULT_PATH, exist_ok=True)
SILKY_AUTHENTICATION = True
SILKY_AUTHORISATION = True
SILKY_MAX_REQUEST_BODY_SIZE = 1024
SILKY_MAX_RESPONSE_BODY_SIZE = 1024
SILKY_IGNORE_PATHS = ['/admin', '/documentation', '/django-rq', '/auth']
SILKY_MAX_RECORDED_REQUESTS = 10**4
def SILKY_INTERCEPT_FUNC(request):
# Ignore all requests which try to get a frame (too many of them)
if request.method == 'GET' and '/frame/' in request.path:
return False
return True
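SILKY_INTERCEPT_FUNC is just a predicate over the incoming request; a quick check of the frame-skipping rule with a hypothetical request stub (the paths are made up):

from types import SimpleNamespace

def silky_intercept(request):
    # Same rule as above: skip the very frequent frame GET requests.
    if request.method == 'GET' and '/frame/' in request.path:
        return False
    return True

assert not silky_intercept(SimpleNamespace(method='GET', path='/task/1/frame/0'))
assert silky_intercept(SimpleNamespace(method='POST', path='/save/annotation/task/1'))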
# Django-sendfile:
# https://github.com/johnsensible/django-sendfile
SENDFILE_BACKEND = 'sendfile.backends.development'

@@ -1,4 +1,3 @@
# Copyright (C) 2018 Intel Corporation
#
# SPDX-License-Identifier: MIT

@@ -5,4 +5,7 @@
from .production import *
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

@@ -31,7 +31,7 @@ urlpatterns = [
path('dashboard/', include('cvat.apps.dashboard.urls')),
path('django-rq/', include('django_rq.urls')),
path('auth/', include('cvat.apps.authentication.urls')),
path('documentation/', include('cvat.apps.documentation.urls'))
path('documentation/', include('cvat.apps.documentation.urls')),
]
if apps.is_installed('cvat.apps.tf_annotation'):
@@ -39,3 +39,6 @@ if apps.is_installed('cvat.apps.tf_annotation'):
if apps.is_installed('cvat.apps.log_viewer'):
urlpatterns.append(path('analytics/', include('cvat.apps.log_viewer.urls')))
if apps.is_installed('silk'):
urlpatterns.append(path('profiler/', include('silk.urls')))