From cde33acf5da2c29b5c70dec7df688af60734e1a7 Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva <maya17grd@gmail.com>
Date: Thu, 16 Dec 2021 13:57:31 +0300
Subject: [PATCH] Fix issue: Original pdf file is deleted (#3967)

---
 CHANGELOG.md                            |  1 +
 cvat/apps/engine/media_extractors.py    | 25 ++++++++++--------
 cvat/apps/engine/task.py                |  9 +++----
 cvat/apps/engine/tests/test_rest_api.py | 34 ++++++++++++++++++++++---
 4 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index afb997fe..1846a53b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed task creating with large files via webpage (<https://github.com/openvinotoolkit/cvat/pull/3692>)
 - Added information to export CVAT_HOST when performing local installation for accessing over network (<https://github.com/openvinotoolkit/cvat/pull/4014>)
 - Fixed possible color collisions in the generated colormap (<https://github.com/openvinotoolkit/cvat/pull/4007>)
+- Fixed deletion of the original PDF file when it is used from the share (<https://github.com/openvinotoolkit/cvat/pull/3967>)
 
 ### Security
 - TDB
diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py
index 740da1ca..a7693c67 100644
--- a/cvat/apps/engine/media_extractors.py
+++ b/cvat/apps/engine/media_extractors.py
@@ -217,14 +217,16 @@ class ArchiveReader(DirectoryReader):
                 start=0,
                 stop=None,
                 dimension=DimensionType.DIM_2D,
-                sorting_method=SortingMethod.LEXICOGRAPHICAL):
+                sorting_method=SortingMethod.LEXICOGRAPHICAL,
+                extract_dir=None):
+
         self._archive_source = source_path[0]
-        extract_dir = source_path[1] if len(source_path) > 1 else os.path.dirname(source_path[0])
-        Archive(self._archive_source).extractall(extract_dir)
-        if extract_dir == os.path.dirname(source_path[0]):
+        tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
+        Archive(self._archive_source).extractall(tmp_dir)
+        if not extract_dir:
             os.remove(self._archive_source)
         super().__init__(
-            source_path=[extract_dir],
+            source_path=[tmp_dir],
             step=step,
             start=start,
             stop=stop,
@@ -239,7 +241,8 @@ class PdfReader(ImageListReader):
                 start=0,
                 stop=None,
                 dimension=DimensionType.DIM_2D,
-                sorting_method=SortingMethod.LEXICOGRAPHICAL):
+                sorting_method=SortingMethod.LEXICOGRAPHICAL,
+                extract_dir=None):
         if not source_path:
             raise Exception('No PDF found')
 
@@ -252,7 +255,7 @@ class PdfReader(ImageListReader):
                 yield '{}{:09d}.jpeg'.format(_basename, page_num)
 
         from pdf2image import convert_from_path
-        self._tmp_dir = os.path.dirname(source_path[0])
+        self._tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
         os.makedirs(self._tmp_dir, exist_ok=True)
 
         # Avoid OOM: https://github.com/openvinotoolkit/cvat/issues/940
@@ -260,7 +263,8 @@ class PdfReader(ImageListReader):
             last_page=stop, paths_only=True,
             output_folder=self._tmp_dir, fmt="jpeg", output_file=_make_name())
 
-        os.remove(source_path[0])
+        if not extract_dir:
+            os.remove(source_path[0])
 
         super().__init__(
             source_path=paths,
@@ -278,9 +282,10 @@ class ZipReader(ImageListReader):
                 start=0,
                 stop=None,
                 dimension=DimensionType.DIM_2D,
-                sorting_method=SortingMethod.LEXICOGRAPHICAL):
+                sorting_method=SortingMethod.LEXICOGRAPHICAL,
+                extract_dir=None):
         self._zip_source = zipfile.ZipFile(source_path[0], mode='r')
-        self.extract_dir = source_path[1] if len(source_path) > 1 else None
+        self.extract_dir = extract_dir
         file_list = [f for f in self._zip_source.namelist() if files_to_ignore(f) and get_mime(f) == 'image']
         super().__init__(file_list,
                         step=step,
diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py
index 57de30ba..b364c338 100644
--- a/cvat/apps/engine/task.py
+++ b/cvat/apps/engine/task.py
@@ -325,17 +325,16 @@ def _create_thread(tid, data, isImport=False):
                     data['sorting_method'] in {models.SortingMethod.RANDOM, models.SortingMethod.PREDEFINED}:
                 raise Exception("It isn't supported to import the task that was created without cache but with random/predefined sorting")
 
-            if media_type in {'archive', 'zip'} and db_data.storage == models.StorageChoice.SHARE:
-                source_paths.append(db_data.get_upload_dirname())
-                upload_dir = db_data.get_upload_dirname()
-                db_data.storage = models.StorageChoice.LOCAL
-
             details = {
                 'source_path': source_paths,
                 'step': db_data.get_frame_step(),
                 'start': db_data.start_frame,
                 'stop': data['stop_frame'],
             }
+            if media_type in {'archive', 'zip', 'pdf'} and db_data.storage == models.StorageChoice.SHARE:
+                details['extract_dir'] = db_data.get_upload_dirname()
+                upload_dir = db_data.get_upload_dirname()
+                db_data.storage = models.StorageChoice.LOCAL
             if media_type != 'video':
                 details['sorting_method'] = data['sorting_method']
             extractor = MEDIA_TYPES[media_type]['extractor'](**details)
diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py
index f5499c62..20b2ee06 100644
--- a/cvat/apps/engine/tests/test_rest_api.py
+++ b/cvat/apps/engine/tests/test_rest_api.py
@@ -2767,6 +2767,13 @@ class TaskDataAPITestCase(APITestCase):
         shutil.rmtree(root_path)
         cls._image_sizes[filename] = image_sizes
 
+        file_name = 'test_1.pdf'
+        path = os.path.join(settings.SHARE_ROOT, file_name)
+        img_sizes, data = generate_pdf_file(file_name, page_count=5)
+        with open(path, "wb") as pdf_file:
+            pdf_file.write(data.read())
+        cls._image_sizes[file_name] = img_sizes
+
         generate_manifest_file(data_type='video', manifest_path=os.path.join(settings.SHARE_ROOT, 'videos', 'manifest.jsonl'),
             sources=[os.path.join(settings.SHARE_ROOT, 'videos', 'test_video_1.mp4')])
 
@@ -2804,6 +2811,9 @@ class TaskDataAPITestCase(APITestCase):
         path = os.path.join(settings.SHARE_ROOT, "manifest.jsonl")
         os.remove(path)
 
+        path = os.path.join(settings.SHARE_ROOT, "test_1.pdf")
+        os.remove(path)
+
     def _run_api_v1_tasks_id_data_post(self, tid, user, data):
         with ForceLogin(user, self.client):
             response = self.client.post('/api/v1/tasks/{}/data'.format(tid),
@@ -2886,10 +2896,12 @@ class TaskDataAPITestCase(APITestCase):
             db_data = Task.objects.get(pk=task_id).data
             self.assertEqual(expected_storage_method, db_data.storage_method)
             self.assertEqual(expected_uploaded_data_location, db_data.storage)
-            # check if used share without copying inside and files doesn`t exist in ../raw/
+            # if the share is used without copying, check that the file doesn't exist in ../raw/ but still exists in the share
             if expected_uploaded_data_location is StorageChoice.SHARE:
-                self.assertEqual(False,
-                    os.path.exists(os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))))
+                raw_file_path = os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))
+                share_file_path = os.path.join(settings.SHARE_ROOT, next(iter(data.values())))
+                self.assertEqual(False, os.path.exists(raw_file_path))
+                self.assertEqual(True, os.path.exists(share_file_path))
 
         # check preview
         response = self._get_preview(task_id, user)
@@ -2956,6 +2968,10 @@ class TaskDataAPITestCase(APITestCase):
                 for f in source_files:
                     if zipfile.is_zipfile(f):
                         source_images.extend(self._extract_zip_chunk(f, dimension=dimension))
+                    elif isinstance(f, str) and f.endswith('.pdf'):
+                        with open(f, 'rb') as pdf_file:
+                            source_images.extend(convert_from_bytes(pdf_file.read(),
+                                fmt='png'))
                     elif isinstance(f, io.BytesIO) and \
                             str(getattr(f, 'name', None)).endswith('.pdf'):
                         source_images.extend(convert_from_bytes(f.getvalue(),
@@ -3475,6 +3491,18 @@ class TaskDataAPITestCase(APITestCase):
         self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
             image_sizes, StorageMethodChoice.CACHE, StorageChoice.SHARE)
 
+        task_spec.update([('name', 'task pdf in the shared folder #30')])
+        task_data = {
+            "server_files[0]": "test_1.pdf",
+            "image_quality": 70,
+            "copy_data": False,
+            "use_cache": True,
+        }
+        image_sizes = self._image_sizes[task_data["server_files[0]"]]
+
+        self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
+            image_sizes, StorageMethodChoice.CACHE, StorageChoice.LOCAL)
+
     def test_api_v1_tasks_id_data_admin(self):
         self._test_api_v1_tasks_id_data(self.admin)