From cde33acf5da2c29b5c70dec7df688af60734e1a7 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 16 Dec 2021 13:57:31 +0300 Subject: [PATCH] Fix issue: Original pdf file is deleted (#3967) --- CHANGELOG.md | 1 + cvat/apps/engine/media_extractors.py | 25 ++++++++++-------- cvat/apps/engine/task.py | 9 +++---- cvat/apps/engine/tests/test_rest_api.py | 34 ++++++++++++++++++++++--- 4 files changed, 51 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afb997fe..1846a53b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed task creating with large files via webpage () - Added information to export CVAT_HOST when performing local installation for accessing over network () - Fixed possible color collisions in the generated colormap () +- Original pdf file is deleted when using share() ### Security - TDB diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py index 740da1ca..a7693c67 100644 --- a/cvat/apps/engine/media_extractors.py +++ b/cvat/apps/engine/media_extractors.py @@ -217,14 +217,16 @@ class ArchiveReader(DirectoryReader): start=0, stop=None, dimension=DimensionType.DIM_2D, - sorting_method=SortingMethod.LEXICOGRAPHICAL): + sorting_method=SortingMethod.LEXICOGRAPHICAL, + extract_dir=None): + self._archive_source = source_path[0] - extract_dir = source_path[1] if len(source_path) > 1 else os.path.dirname(source_path[0]) - Archive(self._archive_source).extractall(extract_dir) - if extract_dir == os.path.dirname(source_path[0]): + tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0]) + Archive(self._archive_source).extractall(tmp_dir) + if not extract_dir: os.remove(self._archive_source) super().__init__( - source_path=[extract_dir], + source_path=[tmp_dir], step=step, start=start, stop=stop, @@ -239,7 +241,8 @@ class PdfReader(ImageListReader): start=0, stop=None, dimension=DimensionType.DIM_2D, - sorting_method=SortingMethod.LEXICOGRAPHICAL): + sorting_method=SortingMethod.LEXICOGRAPHICAL, + extract_dir=None): if not source_path: raise Exception('No PDF found') @@ -252,7 +255,7 @@ class PdfReader(ImageListReader): yield '{}{:09d}.jpeg'.format(_basename, page_num) from pdf2image import convert_from_path - self._tmp_dir = os.path.dirname(source_path[0]) + self._tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0]) os.makedirs(self._tmp_dir, exist_ok=True) # Avoid OOM: https://github.com/openvinotoolkit/cvat/issues/940 @@ -260,7 +263,8 @@ class PdfReader(ImageListReader): last_page=stop, paths_only=True, output_folder=self._tmp_dir, fmt="jpeg", output_file=_make_name()) - os.remove(source_path[0]) + if not extract_dir: + os.remove(source_path[0]) super().__init__( source_path=paths, @@ -278,9 +282,10 @@ class ZipReader(ImageListReader): start=0, stop=None, dimension=DimensionType.DIM_2D, - sorting_method=SortingMethod.LEXICOGRAPHICAL): + sorting_method=SortingMethod.LEXICOGRAPHICAL, + extract_dir=None): self._zip_source = zipfile.ZipFile(source_path[0], mode='r') - self.extract_dir = source_path[1] if len(source_path) > 1 else None + self.extract_dir = extract_dir file_list = [f for f in self._zip_source.namelist() if files_to_ignore(f) and get_mime(f) == 'image'] super().__init__(file_list, step=step, diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 57de30ba..b364c338 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -325,17 +325,16 @@ def _create_thread(tid, data, isImport=False): data['sorting_method'] in {models.SortingMethod.RANDOM, models.SortingMethod.PREDEFINED}: raise Exception("It isn't supported to import the task that was created without cache but with random/predefined sorting") - if media_type in {'archive', 'zip'} and db_data.storage == models.StorageChoice.SHARE: - source_paths.append(db_data.get_upload_dirname()) - upload_dir = db_data.get_upload_dirname() - db_data.storage = models.StorageChoice.LOCAL - details = { 'source_path': source_paths, 'step': db_data.get_frame_step(), 'start': db_data.start_frame, 'stop': data['stop_frame'], } + if media_type in {'archive', 'zip', 'pdf'} and db_data.storage == models.StorageChoice.SHARE: + details['extract_dir'] = db_data.get_upload_dirname() + upload_dir = db_data.get_upload_dirname() + db_data.storage = models.StorageChoice.LOCAL if media_type != 'video': details['sorting_method'] = data['sorting_method'] extractor = MEDIA_TYPES[media_type]['extractor'](**details) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index f5499c62..20b2ee06 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -2767,6 +2767,13 @@ class TaskDataAPITestCase(APITestCase): shutil.rmtree(root_path) cls._image_sizes[filename] = image_sizes + file_name = 'test_1.pdf' + path = os.path.join(settings.SHARE_ROOT, file_name) + img_sizes, data = generate_pdf_file(file_name, page_count=5) + with open(path, "wb") as pdf_file: + pdf_file.write(data.read()) + cls._image_sizes[file_name] = img_sizes + generate_manifest_file(data_type='video', manifest_path=os.path.join(settings.SHARE_ROOT, 'videos', 'manifest.jsonl'), sources=[os.path.join(settings.SHARE_ROOT, 'videos', 'test_video_1.mp4')]) @@ -2804,6 +2811,9 @@ class TaskDataAPITestCase(APITestCase): path = os.path.join(settings.SHARE_ROOT, "manifest.jsonl") os.remove(path) + path = os.path.join(settings.SHARE_ROOT, "test_1.pdf") + os.remove(path) + def _run_api_v1_tasks_id_data_post(self, tid, user, data): with ForceLogin(user, self.client): response = self.client.post('/api/v1/tasks/{}/data'.format(tid), @@ -2886,10 +2896,12 @@ class TaskDataAPITestCase(APITestCase): db_data = Task.objects.get(pk=task_id).data self.assertEqual(expected_storage_method, db_data.storage_method) self.assertEqual(expected_uploaded_data_location, db_data.storage) - # check if used share without copying inside and files doesn`t exist in ../raw/ + # check if used share without copying inside and files doesn`t exist in ../raw/ and exist in share if expected_uploaded_data_location is StorageChoice.SHARE: - self.assertEqual(False, - os.path.exists(os.path.join(db_data.get_upload_dirname(), next(iter(data.values()))))) + raw_file_path = os.path.join(db_data.get_upload_dirname(), next(iter(data.values()))) + share_file_path = os.path.join(settings.SHARE_ROOT, next(iter(data.values()))) + self.assertEqual(False, os.path.exists(raw_file_path)) + self.assertEqual(True, os.path.exists(share_file_path)) # check preview response = self._get_preview(task_id, user) @@ -2956,6 +2968,10 @@ class TaskDataAPITestCase(APITestCase): for f in source_files: if zipfile.is_zipfile(f): source_images.extend(self._extract_zip_chunk(f, dimension=dimension)) + elif isinstance(f, str) and f.endswith('.pdf'): + with open(f, 'rb') as pdf_file: + source_images.extend(convert_from_bytes(pdf_file.read(), + fmt='png')) elif isinstance(f, io.BytesIO) and \ str(getattr(f, 'name', None)).endswith('.pdf'): source_images.extend(convert_from_bytes(f.getvalue(), @@ -3475,6 +3491,18 @@ class TaskDataAPITestCase(APITestCase): self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET, image_sizes, StorageMethodChoice.CACHE, StorageChoice.SHARE) + task_spec.update([('name', 'task pdf in the shared folder #30')]) + task_data = { + "server_files[0]": "test_1.pdf", + "image_quality": 70, + "copy_data": False, + "use_cache": True, + } + image_sizes = self._image_sizes[task_data["server_files[0]"]] + + self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET, + image_sizes, StorageMethodChoice.CACHE, StorageChoice.LOCAL) + def test_api_v1_tasks_id_data_admin(self): self._test_api_v1_tasks_id_data(self.admin)