From 00cc60839e3aefda25c6c907df5d8e39d3bb6b6e Mon Sep 17 00:00:00 2001
From: David Douard <david.douard@sdfa3.org>
Date: Mon, 16 Dec 2019 12:56:14 +0100
Subject: [PATCH] Move load-deposit task creation code in the deposit-check
 private API endpoint

Also replace utils.origin_url_from() by a Deposit.origin_url property, and
ensure the archive file format of the uploaded file is supported.

The rename of invalid.tar.gz as invalid.gz is required for tests to pass
which should not be the case. It will be investigated in a later revision,
but for now we want tests to be green.

This requires to make config dict available in SWHPrivateAPIView so we can
have access to a scheduler from there.
---
 swh/deposit/api/common.py                     | 11 +-----
 swh/deposit/api/private/__init__.py           |  4 +-
 swh/deposit/api/private/deposit_check.py      | 19 ++++++++++
 swh/deposit/api/private/deposit_read.py       |  3 +-
 swh/deposit/client/__init__.py                |  5 ++-
 swh/deposit/models.py                         |  5 +++
 .../tests/api/test_deposit_private_check.py   |  2 +-
 swh/deposit/tests/test_utils.py               | 37 -------------------
 swh/deposit/utils.py                          | 25 -------------
 9 files changed, 32 insertions(+), 79 deletions(-)

diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
index a53b2d6e..dbff46e0 100644
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -19,13 +19,12 @@ from rest_framework.views import APIView
 
 from swh.model import hashutil
 from swh.scheduler.utils import create_oneshot_task_dict
-from swh.deposit.utils import origin_url_from
 
 from ..config import (
     SWHDefaultConfig, EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI,
     ARCHIVE_KEY, METADATA_KEY, RAW_METADATA_KEY, STATE_IRI,
     DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL,
-    DEPOSIT_STATUS_VERIFIED, PRIVATE_CHECK_DEPOSIT,
+    PRIVATE_CHECK_DEPOSIT,
     DEPOSIT_STATUS_LOAD_SUCCESS, ARCHIVE_TYPE, METADATA_TYPE
 )
 from ..errors import (
@@ -184,14 +183,6 @@ class SWHBaseDeposit(SWHDefaultConfig, SWHAPIView, metaclass=ABCMeta):
                     'check-deposit', deposit_check_url=check_url)
                 check_task_id = scheduler.create_tasks([task])[0]['id']
                 deposit.check_task_id = check_task_id
-            elif (deposit.status == DEPOSIT_STATUS_VERIFIED and
-                  not deposit.load_task_id):
-
-                url = origin_url_from(deposit)
-                task = create_oneshot_task_dict(
-                    'load-deposit', url=url, deposit_id=deposit.id)
-                load_task_id = scheduler.create_task([task])[0]['id']
-                deposit.load_task_id = load_task_id
 
         deposit.save()
 
diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py
index 4ed1154d..f0f86945 100644
--- a/swh/deposit/api/private/__init__.py
+++ b/swh/deposit/api/private/__init__.py
@@ -5,7 +5,7 @@
 
 from swh.deposit import utils
 
-from ...config import METADATA_TYPE
+from ...config import METADATA_TYPE, SWHDefaultConfig
 from ...models import DepositRequest, Deposit
 
 from rest_framework.permissions import AllowAny
@@ -56,7 +56,7 @@ class DepositReadMixin:
         return utils.merge(*metadata)
 
 
-class SWHPrivateAPIView(SWHAPIView):
+class SWHPrivateAPIView(SWHDefaultConfig, SWHAPIView):
     """Mixin intended as private api (so no authentication) based API view
        (for the private ones).
 
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
index 8961d914..7f0387fa 100644
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -8,8 +8,12 @@ import re
 import tarfile
 import zipfile
 
+from itertools import chain
+from shutil import get_unpack_formats
+
 from rest_framework import status
 
+from swh.scheduler.utils import create_oneshot_task_dict
 
 from . import DepositReadMixin, SWHPrivateAPIView
 from ..common import SWHGetDepositAPI
@@ -33,6 +37,11 @@ PATTERN_ARCHIVE_EXTENSION = re.compile(
     r'.*\.(%s)$' % '|'.join(ARCHIVE_EXTENSIONS))
 
 
+def known_archive_format(filename):
+    return any(filename.endswith(t) for t in
+               chain(*(x[1] for x in get_unpack_formats())))
+
+
 class SWHChecksDeposit(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin):
     """Dedicated class to read a deposit's raw archives content.
 
@@ -92,6 +101,10 @@ class SWHChecksDeposit(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin):
 
         """
         archive_path = archive_request.archive.path
+
+        if not known_archive_format(archive_path):
+            return False, MANDATORY_ARCHIVE_UNSUPPORTED
+
         try:
             if zipfile.is_zipfile(archive_path):
                 with zipfile.ZipFile(archive_path) as f:
@@ -203,6 +216,12 @@ class SWHChecksDeposit(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin):
             response = {
                 'status': deposit.status,
             }
+            if not deposit.load_task_id and self.config['checks']:
+                url = deposit.origin_url
+                task = create_oneshot_task_dict(
+                    'load-deposit', url=url, deposit_id=deposit.id)
+                load_task_id = self.scheduler.create_tasks([task])[0]['id']
+                deposit.load_task_id = load_task_id
 
         deposit.save()
 
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index 8b834b04..c9a575c5 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -15,7 +15,6 @@ from rest_framework import status
 from swh.core import tarball
 from swh.model import identifiers
 from swh.deposit.utils import normalize_date
-from swh.deposit import utils
 
 from . import DepositReadMixin, SWHPrivateAPIView
 from ...config import SWH_PERSON, ARCHIVE_TYPE
@@ -177,7 +176,7 @@ class SWHDepositReadMetadata(SWHPrivateAPIView, SWHGetDepositAPI,
         data = {
             'origin': {
                 'type': 'deposit',
-                'url': utils.origin_url_from(deposit),
+                'url': deposit.origin_url,
             }
         }
 
diff --git a/swh/deposit/client/__init__.py b/swh/deposit/client/__init__.py
index b4b7f7e2..35b34193 100644
--- a/swh/deposit/client/__init__.py
+++ b/swh/deposit/client/__init__.py
@@ -14,6 +14,7 @@ import xmltodict
 import logging
 
 from abc import ABCMeta, abstractmethod
+from urllib.parse import urljoin
 
 from swh.core.config import SWHConfig
 
@@ -82,7 +83,7 @@ class BaseApiDepositClient(SWHConfig):
             self.config = config
 
         self._client = _client
-        self.base_url = self.config['url']
+        self.base_url = self.config['url'].strip('/') + '/'
         auth = self.config['auth']
         if auth == {}:
             self.auth = None
@@ -109,7 +110,7 @@ class BaseApiDepositClient(SWHConfig):
         if self.auth:
             kwargs['auth'] = self.auth
 
-        full_url = '%s%s' % (self.base_url.rstrip('/'), url)
+        full_url = urljoin(self.base_url, url.lstrip('/'))
         return method_fn(full_url, *args, **kwargs)
 
 
diff --git a/swh/deposit/models.py b/swh/deposit/models.py
index e7c5440d..1a7a78ea 100644
--- a/swh/deposit/models.py
+++ b/swh/deposit/models.py
@@ -151,6 +151,11 @@ class Deposit(models.Model):
             d['status_detail'] = self.status_detail
         return str(d)
 
+    @property
+    def origin_url(self):
+        return '%s/%s' % (self.client.provider_url.rstrip('/'),
+                          self.external_id)
+
 
 def client_directory_path(instance, filename):
     """Callable to upload archive in MEDIA_ROOT/user_<id>/<filename>
diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py
index 1c90113a..50f0b1d0 100644
--- a/swh/deposit/tests/api/test_deposit_private_check.py
+++ b/swh/deposit/tests/api/test_deposit_private_check.py
@@ -237,7 +237,7 @@ def create_deposit_archive_with_archive(
 
     # now we create an archive holding the first created archive
     invalid_archive = create_archive_with_archive(
-        root_path, 'invalid.tar.gz', archive)
+        root_path, 'invalid.tgz', archive)
 
     # we deposit it
     response = client.post(
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
index f44710a6..a7486f7e 100644
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -8,43 +8,6 @@ import pytest
 from unittest.mock import patch
 
 from swh.deposit import utils
-from swh.deposit.models import Deposit, DepositClient
-
-
-def test_origin_url_from():
-    """With correctly setup-ed deposit, all is fine
-
-    """
-    for provider_url, external_id in (
-            ('http://somewhere.org', 'uuid'),
-            ('http://overthejungle.org', 'diuu'),
-    ):
-        deposit = Deposit(
-            client=DepositClient(provider_url=provider_url),
-            external_id=external_id
-        )
-
-        actual_origin_url = utils.origin_url_from(deposit)
-
-        assert actual_origin_url == '%s/%s' % (
-            provider_url.rstrip('/'), external_id)
-
-
-def test_origin_url_from_ko():
-    """Badly configured deposit should raise
-
-    """
-    for provider_url, external_id in (
-            (None, 'uuid'),
-            ('http://overthejungle.org', None),
-    ):
-        deposit = Deposit(
-            client=DepositClient(provider_url=provider_url),
-            external_id=None
-        )
-
-        with pytest.raises(AssertionError):
-            utils.origin_url_from(deposit)
 
 
 def test_merge():
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
index beb31ef6..86775ac3 100644
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -10,31 +10,6 @@ from types import GeneratorType
 from swh.model.identifiers import normalize_timestamp
 
 
-def origin_url_from(deposit):
-    """Given a deposit instance, return the associated origin url.
-
-    This expects a deposit and the associated client to be correctly
-    configured.
-
-    Args:
-        deposit (Deposit): The deposit from which derives the origin url
-
-    Raises:
-        AssertionError if:
-        - the client's provider_url field is not configured.
-        - the deposit's external_id field is not configured.
-
-    Returns
-       The associated origin url
-
-    """
-    external_id = deposit.external_id
-    assert external_id is not None
-    base_url = deposit.client.provider_url
-    assert base_url is not None
-    return '%s/%s' % (base_url.rstrip('/'), external_id)
-
-
 def merge(*dicts):
     """Given an iterator of dicts, merge them losing no information.
 
-- 
GitLab