Support of Google Cloud Storage for cloud storage (#3561)

* Support Google Cloud Storage for cloud provider

Implement GoogleCloudStorage and add KEY_FILE_PATH to
CredentialsTypeChoice, and key_file_path in Credentials

* Handle cloud storage keys that contain a path separator

* Update changelog

* Add the migration file for engine_cloud_provider table
main
Jason Wu 5 years ago committed by GitHub
parent 5446816708
commit 6a29b344c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -50,6 +50,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Explicit "Done" button when drawing any polyshapes (<https://github.com/openvinotoolkit/cvat/pull/3417>) - Explicit "Done" button when drawing any polyshapes (<https://github.com/openvinotoolkit/cvat/pull/3417>)
- Histogram equalization with OpenCV javascript (<https://github.com/openvinotoolkit/cvat/pull/3447>) - Histogram equalization with OpenCV javascript (<https://github.com/openvinotoolkit/cvat/pull/3447>)
- Client-side polyshapes approximation when using semi-automatic interactors & scissors (<https://github.com/openvinotoolkit/cvat/pull/3450>) - Client-side polyshapes approximation when using semi-automatic interactors & scissors (<https://github.com/openvinotoolkit/cvat/pull/3450>)
- Support of Google Cloud Storage for cloud storage (<https://github.com/openvinotoolkit/cvat/pull/3561>)
### Changed ### Changed

@ -87,7 +87,7 @@ class CacheInteraction:
name = f"{item['name']}{item['extension']}" name = f"{item['name']}{item['extension']}"
if name not in cloud_storage_instance: if name not in cloud_storage_instance:
raise Exception('{} file was not found on a {} storage'.format(name, cloud_storage_instance.name)) raise Exception('{} file was not found on a {} storage'.format(name, cloud_storage_instance.name))
with NamedTemporaryFile(mode='w+b', prefix='cvat', suffix=name, delete=False) as temp_file: with NamedTemporaryFile(mode='w+b', prefix='cvat', suffix=name.replace(os.path.sep, '#'), delete=False) as temp_file:
source_path = temp_file.name source_path = temp_file.name
buf = cloud_storage_instance.download_fileobj(name) buf = cloud_storage_instance.download_fileobj(name)
temp_file.write(buf.getvalue()) temp_file.write(buf.getvalue())

@ -1,6 +1,8 @@
#from dataclasses import dataclass #from dataclasses import dataclass
from abc import ABC, abstractmethod, abstractproperty from abc import ABC, abstractmethod, abstractproperty
from io import BytesIO from io import BytesIO
import os
import os.path
import boto3 import boto3
from boto3.s3.transfer import TransferConfig from boto3.s3.transfer import TransferConfig
@ -11,6 +13,8 @@ from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError from azure.core.exceptions import ResourceExistsError
from azure.storage.blob import PublicAccess from azure.storage.blob import PublicAccess
from google.cloud import storage
from cvat.apps.engine.log import slogger from cvat.apps.engine.log import slogger
from cvat.apps.engine.models import CredentialsTypeChoice, CloudProviderChoice from cvat.apps.engine.models import CredentialsTypeChoice, CloudProviderChoice
@ -42,6 +46,7 @@ class _CloudStorage(ABC):
def download_file(self, key, path): def download_file(self, key, path):
file_obj = self.download_fileobj(key) file_obj = self.download_fileobj(key)
if isinstance(file_obj, BytesIO): if isinstance(file_obj, BytesIO):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'wb') as f: with open(path, 'wb') as f:
f.write(file_obj.getvalue()) f.write(file_obj.getvalue())
else: else:
@ -77,6 +82,14 @@ def get_cloud_storage_instance(cloud_provider, resource, credentials, specific_a
account_name=credentials.account_name, account_name=credentials.account_name,
sas_token=credentials.session_token sas_token=credentials.session_token
) )
elif cloud_provider == CloudProviderChoice.GOOGLE_CLOUD_STORAGE:
instance = GoogleCloudStorage(
bucket_name=resource,
service_account_json=credentials.key_file_path,
prefix=specific_attributes.get('prefix'),
location=specific_attributes.get('location'),
project=specific_attributes.get('project')
)
else: else:
raise NotImplementedError() raise NotImplementedError()
return instance return instance
@ -256,14 +269,89 @@ class AzureBlobContainer(_CloudStorage):
class GOOGLE_DRIVE(_CloudStorage): class GOOGLE_DRIVE(_CloudStorage):
pass pass
class GoogleCloudStorage(_CloudStorage):
    """Cloud-storage adapter for a Google Cloud Storage bucket.

    Authenticates from a service-account JSON key file when one is given,
    otherwise falls back to application-default credentials.
    """

    def __init__(self, bucket_name, prefix=None, service_account_json=None, project=None, location=None):
        """
        Args:
            bucket_name: name of the GCS bucket to operate on.
            prefix: optional object-name prefix used to filter listings.
            service_account_json: optional path to a service-account key file.
            project: optional project billed for requester-pays access; only
                used when the bucket does not exist yet.
            location: optional region passed to create() for a new bucket.
        """
        super().__init__()
        if service_account_json:
            self._storage_client = storage.Client.from_service_account_json(service_account_json)
        else:
            self._storage_client = storage.Client()
        # lookup_bucket() returns None when the bucket is missing or not
        # visible; in that case keep an unsaved Bucket handle so create()
        # can later materialize it.
        bucket = self._storage_client.lookup_bucket(bucket_name)
        if bucket is None:
            bucket = self._storage_client.bucket(bucket_name, user_project=project)
        self._bucket = bucket
        self._bucket_location = location
        self._prefix = prefix

    @property
    def bucket(self):
        """The google.cloud.storage Bucket handle."""
        return self._bucket

    @property
    def name(self):
        """Name of the underlying bucket."""
        return self._bucket.name

    def exists(self):
        """Return True if the bucket exists and is visible to the client."""
        return self._storage_client.lookup_bucket(self.name) is not None

    def initialize_content(self):
        """Populate self._files with {'name': ...} entries for every blob
        under the configured prefix."""
        self._files = [
            {
                'name': blob.name
            }
            for blob in self._storage_client.list_blobs(
                self.bucket, prefix=self._prefix
            )
        ]

    def download_fileobj(self, key):
        """Download the blob named *key* and return it as a BytesIO,
        rewound to the start for immediate reading."""
        buf = BytesIO()
        blob = self.bucket.blob(key)
        self._storage_client.download_blob_to_file(blob, buf)
        buf.seek(0)
        return buf

    def is_object_exist(self, key):
        """Return True if a blob named *key* exists in the bucket."""
        return self.bucket.blob(key).exists()

    def upload_file(self, file_obj, file_name):
        """Upload *file_obj* (a readable file-like object) as *file_name*."""
        self.bucket.blob(file_name).upload_from_file(file_obj)

    def create(self):
        """Create the bucket in the configured location.

        Raises:
            Exception: on any provider error; the message is logged and the
                original exception is chained as the cause.
        """
        try:
            self._bucket = self._storage_client.create_bucket(
                self.bucket,
                location=self._bucket_location
            )
            slogger.glob.info(
                'Bucket {} has been created at {} region for {}'.format(
                    self.name,
                    self.bucket.location,
                    self.bucket.user_project,
                ))
        except Exception as ex:
            msg = str(ex)
            slogger.glob.info(msg)
            # Chain the original exception so its type and traceback are
            # preserved for debugging (previously they were discarded).
            raise Exception(msg) from ex

    def get_file_last_modified(self, key):
        """Return the last-modified timestamp of blob *key*.

        blob.reload() fetches current metadata from GCS before reading
        the 'updated' field.
        """
        blob = self.bucket.blob(key)
        blob.reload()
        return blob.updated
class Credentials: class Credentials:
__slots__ = ('key', 'secret_key', 'session_token', 'account_name', 'credentials_type') __slots__ = ('key', 'secret_key', 'session_token', 'account_name', 'key_file_path', 'credentials_type')
def __init__(self, **credentials): def __init__(self, **credentials):
self.key = credentials.get('key', '') self.key = credentials.get('key', '')
self.secret_key = credentials.get('secret_key', '') self.secret_key = credentials.get('secret_key', '')
self.session_token = credentials.get('session_token', '') self.session_token = credentials.get('session_token', '')
self.account_name = credentials.get('account_name', '') self.account_name = credentials.get('account_name', '')
self.key_file_path = credentials.get('key_file_path', '')
self.credentials_type = credentials.get('credentials_type', None) self.credentials_type = credentials.get('credentials_type', None)
def convert_to_db(self): def convert_to_db(self):
@ -271,6 +359,7 @@ class Credentials:
CredentialsTypeChoice.TEMP_KEY_SECRET_KEY_TOKEN_SET : \ CredentialsTypeChoice.TEMP_KEY_SECRET_KEY_TOKEN_SET : \
" ".join([self.key, self.secret_key, self.session_token]), " ".join([self.key, self.secret_key, self.session_token]),
CredentialsTypeChoice.ACCOUNT_NAME_TOKEN_PAIR : " ".join([self.account_name, self.session_token]), CredentialsTypeChoice.ACCOUNT_NAME_TOKEN_PAIR : " ".join([self.account_name, self.session_token]),
CredentialsTypeChoice.KEY_FILE_PATH: self.key_file_path,
CredentialsTypeChoice.ANONYMOUS_ACCESS: "", CredentialsTypeChoice.ANONYMOUS_ACCESS: "",
} }
return converted_credentials[self.credentials_type] return converted_credentials[self.credentials_type]
@ -281,6 +370,8 @@ class Credentials:
self.key, self.secret_key, self.session_token = credentials.get('value').split() self.key, self.secret_key, self.session_token = credentials.get('value').split()
elif self.credentials_type == CredentialsTypeChoice.ACCOUNT_NAME_TOKEN_PAIR: elif self.credentials_type == CredentialsTypeChoice.ACCOUNT_NAME_TOKEN_PAIR:
self.account_name, self.session_token = credentials.get('value').split() self.account_name, self.session_token = credentials.get('value').split()
elif self.credentials_type == CredentialsTypeChoice.KEY_FILE_PATH:
self.key_file_path = credentials.get('value')
else: else:
self.account_name, self.session_token, self.key, self.secret_key = ('', '', '', '') self.account_name, self.session_token, self.key, self.secret_key = ('', '', '', '')
self.credentials_type = None self.credentials_type = None
@ -291,6 +382,7 @@ class Credentials:
self.secret_key = credentials.get('secret_key', self.secret_key) self.secret_key = credentials.get('secret_key', self.secret_key)
self.session_token = credentials.get('session_token', self.session_token) self.session_token = credentials.get('session_token', self.session_token)
self.account_name = credentials.get('account_name', self.account_name) self.account_name = credentials.get('account_name', self.account_name)
self.key_file_path = credentials.get('key_file_path', self.key_file_path)
def values(self): def values(self):
return [self.key, self.secret_key, self.session_token, self.account_name] return [self.key, self.secret_key, self.session_token, self.account_name, self.key_file_path]

@ -0,0 +1,23 @@
# Generated by Django 3.1.13 on 2021-08-27 02:58
from django.db import migrations, models
class Migration(migrations.Migration):
    # Widens the cloudstorage choice fields to admit the new Google Cloud
    # Storage provider and its KEY_FILE_PATH credentials type.

    dependencies = [
        ('engine', '0040_cloud_storage'),
    ]

    operations = [
        # Add KEY_FILE_PATH to the allowed credentials types.
        migrations.AlterField(
            model_name='cloudstorage',
            name='credentials_type',
            field=models.CharField(choices=[('TEMP_KEY_SECRET_KEY_TOKEN_SET', 'TEMP_KEY_SECRET_KEY_TOKEN_SET'), ('ACCOUNT_NAME_TOKEN_PAIR', 'ACCOUNT_NAME_TOKEN_PAIR'), ('KEY_FILE_PATH', 'KEY_FILE_PATH'), ('ANONYMOUS_ACCESS', 'ANONYMOUS_ACCESS')], max_length=29),
        ),
        # Add GOOGLE_CLOUD_STORAGE to the allowed provider types.
        migrations.AlterField(
            model_name='cloudstorage',
            name='provider_type',
            field=models.CharField(choices=[('AWS_S3_BUCKET', 'AWS_S3'), ('AZURE_CONTAINER', 'AZURE_CONTAINER'), ('GOOGLE_DRIVE', 'GOOGLE_DRIVE'), ('GOOGLE_CLOUD_STORAGE', 'GOOGLE_CLOUD_STORAGE')], max_length=20),
        ),
    ]

@ -542,6 +542,7 @@ class CloudProviderChoice(str, Enum):
AWS_S3 = 'AWS_S3_BUCKET' AWS_S3 = 'AWS_S3_BUCKET'
AZURE_CONTAINER = 'AZURE_CONTAINER' AZURE_CONTAINER = 'AZURE_CONTAINER'
GOOGLE_DRIVE = 'GOOGLE_DRIVE' GOOGLE_DRIVE = 'GOOGLE_DRIVE'
GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
@classmethod @classmethod
def choices(cls): def choices(cls):
@ -558,6 +559,7 @@ class CredentialsTypeChoice(str, Enum):
# ignore bandit issues because false positives # ignore bandit issues because false positives
TEMP_KEY_SECRET_KEY_TOKEN_SET = 'TEMP_KEY_SECRET_KEY_TOKEN_SET' # nosec TEMP_KEY_SECRET_KEY_TOKEN_SET = 'TEMP_KEY_SECRET_KEY_TOKEN_SET' # nosec
ACCOUNT_NAME_TOKEN_PAIR = 'ACCOUNT_NAME_TOKEN_PAIR' # nosec ACCOUNT_NAME_TOKEN_PAIR = 'ACCOUNT_NAME_TOKEN_PAIR' # nosec
KEY_FILE_PATH = 'KEY_FILE_PATH'
ANONYMOUS_ACCESS = 'ANONYMOUS_ACCESS' ANONYMOUS_ACCESS = 'ANONYMOUS_ACCESS'
@classmethod @classmethod

@ -792,6 +792,7 @@ class CloudStorageSerializer(serializers.ModelSerializer):
session_token = serializers.CharField(max_length=440, allow_blank=True, required=False) session_token = serializers.CharField(max_length=440, allow_blank=True, required=False)
key = serializers.CharField(max_length=20, allow_blank=True, required=False) key = serializers.CharField(max_length=20, allow_blank=True, required=False)
secret_key = serializers.CharField(max_length=40, allow_blank=True, required=False) secret_key = serializers.CharField(max_length=40, allow_blank=True, required=False)
key_file_path = serializers.CharField(max_length=64, allow_blank=True, required=False)
account_name = serializers.CharField(max_length=24, allow_blank=True, required=False) account_name = serializers.CharField(max_length=24, allow_blank=True, required=False)
class Meta: class Meta:
@ -799,7 +800,7 @@ class CloudStorageSerializer(serializers.ModelSerializer):
fields = ( fields = (
'provider_type', 'resource', 'display_name', 'owner', 'credentials_type', 'provider_type', 'resource', 'display_name', 'owner', 'credentials_type',
'created_date', 'updated_date', 'session_token', 'account_name', 'key', 'created_date', 'updated_date', 'session_token', 'account_name', 'key',
'secret_key', 'specific_attributes', 'description' 'secret_key', 'key_file_path', 'specific_attributes', 'description'
) )
read_only_fields = ('created_date', 'updated_date', 'owner') read_only_fields = ('created_date', 'updated_date', 'owner')
@ -816,6 +817,9 @@ class CloudStorageSerializer(serializers.ModelSerializer):
if attrs.get('provider_type') == models.CloudProviderChoice.AZURE_CONTAINER: if attrs.get('provider_type') == models.CloudProviderChoice.AZURE_CONTAINER:
if not attrs.get('account_name', ''): if not attrs.get('account_name', ''):
raise serializers.ValidationError('Account name for Azure container was not specified') raise serializers.ValidationError('Account name for Azure container was not specified')
if attrs.get('provider_type') == models.CloudProviderChoice.GOOGLE_CLOUD_STORAGE:
if not attrs.get('key_file_path', ''):
raise serializers.ValidationError('Key file path for Google cloud storage was not specified')
return attrs return attrs
def create(self, validated_data): def create(self, validated_data):
@ -826,6 +830,7 @@ class CloudStorageSerializer(serializers.ModelSerializer):
key=validated_data.pop('key', ''), key=validated_data.pop('key', ''),
secret_key=validated_data.pop('secret_key', ''), secret_key=validated_data.pop('secret_key', ''),
session_token=validated_data.pop('session_token', ''), session_token=validated_data.pop('session_token', ''),
key_file_path=validated_data.pop('key_file_path', ''),
credentials_type = validated_data.get('credentials_type') credentials_type = validated_data.get('credentials_type')
) )
if should_be_created: if should_be_created:
@ -859,7 +864,7 @@ class CloudStorageSerializer(serializers.ModelSerializer):
'type': instance.credentials_type, 'type': instance.credentials_type,
'value': instance.credentials, 'value': instance.credentials,
}) })
tmp = {k:v for k,v in validated_data.items() if k in {'key', 'secret_key', 'account_name', 'session_token', 'credentials_type'}} tmp = {k:v for k,v in validated_data.items() if k in {'key', 'secret_key', 'account_name', 'session_token', 'key_file_path', 'credentials_type'}}
credentials.mapping_with_new_values(tmp) credentials.mapping_with_new_values(tmp)
instance.credentials = credentials.convert_to_db() instance.credentials = credentials.convert_to_db()
instance.credentials_type = validated_data.get('credentials_type', instance.credentials_type) instance.credentials_type = validated_data.get('credentials_type', instance.credentials_type)

@ -1262,7 +1262,8 @@ class CloudStorageViewSet(auth.CloudStorageGetQuerySetMixin, viewsets.ModelViewS
session_token=serializer.validated_data.get('session_token', ''), session_token=serializer.validated_data.get('session_token', ''),
account_name=serializer.validated_data.get('account_name', ''), account_name=serializer.validated_data.get('account_name', ''),
key=serializer.validated_data.get('key', ''), key=serializer.validated_data.get('key', ''),
secret_key=serializer.validated_data.get('secret_key', '') secret_key=serializer.validated_data.get('secret_key', ''),
key_file_path=serializer.validated_data.get('key_file_path', '')
) )
details = { details = {
'resource': serializer.validated_data.get('resource'), 'resource': serializer.validated_data.get('resource'),

@ -47,6 +47,7 @@ diskcache==5.0.2
open3d==0.11.2 open3d==0.11.2
boto3==1.17.61 boto3==1.17.61
azure-storage-blob==12.8.1 azure-storage-blob==12.8.1
google-cloud-storage==1.42.0
# --no-binary=datumaro: workaround for pip to install # --no-binary=datumaro: workaround for pip to install
# opencv-headless instead of regular opencv, to actually run setup script # opencv-headless instead of regular opencv, to actually run setup script
# --no-binary=pycocotools: workaround for binary incompatibility on numpy 1.20 # --no-binary=pycocotools: workaround for binary incompatibility on numpy 1.20

Loading…
Cancel
Save