Manifest fixes (#3146)

* Add logger, reverse func * Fix image filtering * Fix upload video manifest Co-authored-by: Nikita Manovich <nikita.manovich@intel.com>
5 years ago · e7cca0ec64
parent 73b85a9465
commit e7cca0ec64
3 changed files with 97 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Changing a label on canvas does not work when 'Show object details' enabled (<https://github.com/openvinotoolkit/cvat/pull/3084>)
 - Make sure frame unzip web worker correctly terminates after unzipping all images in a requested chunk (<https://github.com/openvinotoolkit/cvat/pull/3096>)
 - Reset password link was unavailable before login (<https://github.com/openvinotoolkit/cvat/pull/3140>)
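 - Manifest: migration fixes — added migration logging and a reverse migration, fixed image filtering and video manifest upload (<https://github.com/openvinotoolkit/cvat/pull/3146>)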
 ### Security
--- a/cvat/apps/engine/migrations/0038_manifest.py
+++ b/cvat/apps/engine/migrations/0038_manifest.py
@ -1,7 +1,10 @@
 # Generated by Django 3.1.1 on 2021-02-20 08:36
 import glob
 import itertools
 import logging
 import os
 import sys
 from re import search
 from django.conf import settings
@ -9,40 +12,109 @@ from django.db import migrations
 from cvat.apps.engine.models import (DimensionType, StorageChoice,
                                     StorageMethodChoice)
 from cvat.apps.engine.media_extractors import get_mime
 from utils.dataset_manifest import ImageManifestManager, VideoManifestManager
-def migrate_data(apps, shema_editor):
def get_logger():
    """Return the logger used by this migration, configuring it on first use.

    The logger is named after this migration module (file name up to the
    first dot), is set to INFO level, and writes to
    ``<settings.MIGRATIONS_LOGS_ROOT>/<migration>.log`` as well as to stdout
    and stderr.

    ``logging.getLogger`` hands back the same object for the same name, so
    handlers are attached only while the logger has none. Without this guard
    every additional call would add three more handlers, repeating each
    message and opening another handle on the log file.

    Returns:
        logging.Logger: the configured migration logger.
    """
    migration = os.path.basename(__file__).split(".")[0]
    logger = logging.getLogger(name=migration)
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        file_handler = logging.FileHandler(
            os.path.join(settings.MIGRATIONS_LOGS_ROOT, f"{migration}.log"))
        # Only the file handler gets the timestamped format; no formatter is
        # set on the two console handlers below.
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(file_handler)
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.addHandler(logging.StreamHandler(sys.stderr))
    return logger
 def _get_query_set(apps):
    """Select the Data rows whose storage method is CACHE.

    The ``Data`` model of the ``engine`` app is looked up through ``apps``,
    the registry object handed to the migration functions.
    """
    data_model = apps.get_model("engine", "Data")
    return data_model.objects.filter(storage_method=StorageMethodChoice.CACHE)
 def migrate2meta(apps, shema_editor):
    """Reverse migration: replace manifest files with the older meta files.

    For every Data object using the CACHE storage method, removes
    ``manifest.jsonl`` and ``index.json`` from its ``raw`` folder, then writes
    either ``meta_info.txt`` (video) or ``dummy_<n>.txt`` chunk files (images).
    Files that already exist are left untouched. An exception raised while
    handling one data object is logged and the loop moves on to the next one.
    """
    logger = get_logger()
    for db_data in _get_query_set(apps):
        try:
            upload_dir = '{}/{}/raw'.format(settings.MEDIA_DATA_ROOT, db_data.id)
            logger.info('Migrate data({}), folder - {}'.format(db_data.id, upload_dir))
            meta_path = os.path.join(upload_dir, "meta_info.txt")

            # Drop the manifest and its index before writing the meta files.
            stale_files = (
                ('manifest.jsonl', 'A manifest file has been deleted'),
                ('index.json', 'A manifest index file has been deleted'),
            )
            for stale_name, deleted_message in stale_files:
                stale_path = os.path.join(upload_dir, stale_name)
                if os.path.exists(stale_path):
                    os.remove(stale_path)
                    logger.info(deleted_message)

            if db_data.storage == StorageChoice.LOCAL:
                data_dir = upload_dir
            else:
                data_dir = settings.SHARE_ROOT

            if hasattr(db_data, 'video'):
                if os.path.exists(meta_path):
                    logger.info('A meta_info.txt already exists')
                    continue
                media_file = os.path.join(data_dir, db_data.video.path)
                logger.info('Preparing of the video meta has begun')
                video_manifest = VideoManifestManager(manifest_path=upload_dir)
                meta = video_manifest.prepare_meta(media_file=media_file, force=True)
                with open(meta_path, "w") as meta_file:
                    # One "<index> <pts>" line per entry; the third field is unused.
                    meta_file.writelines(f"{idx} {pts}\n" for idx, pts, _ in meta)
            else:
                sources = [db_image.path for db_image in db_data.images.all().order_by('frame')]
                logger.info('Preparing of the dummy chunks has begun')
                # Consecutive images sharing position // chunk_size form one chunk.
                chunks = itertools.groupby(
                    enumerate(sources),
                    lambda numbered: numbered[0] // db_data.chunk_size,
                )
                for idx, numbered_paths in chunks:
                    chunk_name = "dummy_{}.txt".format(idx)
                    chunk_path = os.path.join(upload_dir, chunk_name)
                    if os.path.exists(chunk_path):
                        logger.info(chunk_name + " already exists")
                        continue
                    with open(chunk_path, "w") as dummy_chunk:
                        dummy_chunk.writelines(f"{img_path}\n" for _, img_path in numbered_paths)
            logger.info('Succesfull migration for the data({})'.format(db_data.id))
        except Exception as ex:
            logger.error(str(ex))
 def migrate2manifest(apps, shema_editor):
    logger = get_logger()
    logger.info('The data migration has been started for creating manifest`s files')
    query_set = _get_query_set(apps)
    logger.info('Need to update {} data objects'.format(len(query_set)))
    for db_data in query_set:
        try:
            upload_dir = '{}/{}/raw'.format(settings.MEDIA_DATA_ROOT, db_data.id)
            logger.info('Migrate data({}), folder - {}'.format(db_data.id, upload_dir))
            if os.path.exists(os.path.join(upload_dir, 'meta_info.txt')):
-                    os.remove(os.path.join(upload_dir, 'meta_info.txt'))
+                os.remove(os.path.join(upload_dir, 'meta_info.txt'))
                logger.info('{}/meta_info.txt has been deleted'.format(upload_dir))
            else:
                for path in glob.glob(f'{upload_dir}/dummy_*.txt'):
                    os.remove(path)
                    logger.info(f"{path} has been deleted")
            # it's necessary for case with long data migration
            if os.path.exists(os.path.join(upload_dir, 'manifest.jsonl')):
                logger.info('Manifest file already exists')
                continue
            data_dir = upload_dir if db_data.storage == StorageChoice.LOCAL else settings.SHARE_ROOT
            if hasattr(db_data, 'video'):
                media_file = os.path.join(data_dir, db_data.video.path)
                manifest = VideoManifestManager(manifest_path=upload_dir)
-                meta_info = manifest.prepare_meta(media_file=media_file)
+                logger.info('Preparing of the video meta information has begun')
                meta_info = manifest.prepare_meta(media_file=media_file, force=True)
                logger.info('Manifest creating has begun')
                manifest.create(meta_info)
                logger.info('Index creating has begun')
                manifest.init_index()
            else:
                manifest = ImageManifestManager(manifest_path=upload_dir)
                sources = []
                if db_data.storage == StorageChoice.LOCAL:
                    for (root, _, files) in os.walk(data_dir):
-                        sources.extend([os.path.join(root, f) for f in files])
+                        sources.extend([os.path.join(root, f) for f in files if get_mime(f) == 'image'])
                    sources.sort()
                # using share, this means that we can not explicitly restore the entire data structure
                else:
                    sources = [os.path.join(data_dir, db_image.path) for db_image in db_data.images.all().order_by('frame')]
                if any(list(filter(lambda x: x.dimension==DimensionType.DIM_3D, db_data.tasks.all()))):
                    logger.info('Preparing of images 3d meta information has begun')
                    content = []
                    for source in sources:
                        name, ext = os.path.splitext(os.path.relpath(source, upload_dir))
@ -51,6 +123,7 @@ def migrate_data(apps, shema_editor):
                            'extension': ext
                        })
                else:
                    logger.info('Preparing of 2d images meta information has begun')
                    meta_info = manifest.prepare_meta(sources=sources, data_dir=data_dir)
                    content = meta_info.content
@ -58,6 +131,7 @@ def migrate_data(apps, shema_editor):
                    def _get_frame_step(str_):
                        match = search("step\s*=\s*([1-9]\d*)", str_)
                        return int(match.group(1)) if match else 1
                    logger.info('Data is located on the share, metadata update has been started')
                    step = _get_frame_step(db_data.frame_filter)
                    start = db_data.start_frame
                    stop = db_data.stop_frame + 1
@ -67,10 +141,13 @@ def migrate_data(apps, shema_editor):
                        item = content.pop(0) if i in images_range else dict()
                        result_content.append(item)
                    content = result_content
                logger.info('Manifest creating has begun')
                manifest.create(content)
                logger.info('Index creating has begun')
                manifest.init_index()
            logger.info('Succesfull migration for the data({})'.format(db_data.id))
        except Exception as ex:
-            print(str(ex))
+            logger.error(str(ex))
 class Migration(migrations.Migration):
@ -79,5 +156,8 @@ class Migration(migrations.Migration):
    ]
    operations = [
-        migrations.RunPython(migrate_data)
+        migrations.RunPython(
            code=migrate2manifest,
            reverse_code=migrate2meta
        )
    ]
--- a/utils/dataset_manifest/core.py
+++ b/utils/dataset_manifest/core.py
@ -325,7 +325,7 @@ class _ManifestManager(ABC):
        return self._index
 class VideoManifestManager(_ManifestManager):
-    def __init__(self, manifest_path, *args, **kwargs):
+    def __init__(self, manifest_path):
        super().__init__(manifest_path)
        setattr(self._manifest, 'TYPE', 'video')
        self.BASE_INFORMATION['properties'] = 3
@ -381,9 +381,15 @@ class ManifestValidator:
            assert self._manifest.TYPE != json.loads(manifest_file.readline())['type']
 class VideoManifestValidator(VideoManifestManager):
-    def __init__(self, **kwargs):
+    def __init__(self, source_path, manifest_path):
-        self.source_path = kwargs.pop('source_path')
+        self.source_path = source_path
-        super().__init__(self, **kwargs)
+        super().__init__(manifest_path)
    @staticmethod
    def _get_video_stream(container):
        video_stream = next(stream for stream in container.streams if stream.type == 'video')
        video_stream.thread_type = 'AUTO'
        return video_stream
    def validate_key_frame(self, container, video_stream, key_frame):
        for packet in container.demux(video_stream):