diff --git a/PKG-INFO b/PKG-INFO index 15e14c0468b58633340533ec41b0757b1728b197..b9fd7e41d64a7f7402b8a36b9865a61a35cbe405 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers diff --git a/debian/control b/debian/control index 307719da628c8c8948385212e2d4901a823f5d4f..856c25d808f975ef54fc0a716e7ac9675ff3fbea 100644 --- a/debian/control +++ b/debian/control @@ -25,7 +25,6 @@ Package: python3-swh.deposit Architecture: all Depends: python3-swh.core (>= 0.0.14~), python3-swh.loader.tar (>= 0.0.26~), - python3-swh.scheduler (>= 0.0.17~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Server diff --git a/requirements-swh.txt b/requirements-swh.txt index 36288b45213cb4ea3b5e20b38ef80b0e2523b230..faaa7a9f7117ab9e33c7564d4476ee3d30e8b283 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,2 @@ swh.core >= 0.0.14 swh.loader.tar >= 0.0.27 -swh.scheduler >= 0.0.17 diff --git a/requirements.txt b/requirements.txt index 3c648d6ab2fcc14ac56a869defe7ec4f1e0eaa07..29cf6c92439a9611d4914346cfa83bac2ebd6f3c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ vcversioner -retrying click Django djangorestframework diff --git a/setup.py b/setup.py index 93a6ae0a92203fc054db2f3ce24405b03035a8c3..53a0b67779a19b9427e1462120177857ad377e36 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ setup( install_requires=parse_requirements(), extras_require={ 'injection': ['swh.loader.core >= 0.0.19', + 'swh.scheduler >= 0.0.17', 'requests'], }, setup_requires=['vcversioner'], diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO index 15e14c0468b58633340533ec41b0757b1728b197..b9fd7e41d64a7f7402b8a36b9865a61a35cbe405 100644 --- a/swh.deposit.egg-info/PKG-INFO +++ b/swh.deposit.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers diff --git a/swh.deposit.egg-info/SOURCES.txt b/swh.deposit.egg-info/SOURCES.txt index 2b54c0f06b51e75a9e6afcdba818cc86f58bf50d..7d2a79ac362cefcfd37d39169ea690211d302a58 100644 --- a/swh.deposit.egg-info/SOURCES.txt +++ b/swh.deposit.egg-info/SOURCES.txt @@ -73,13 +73,12 @@ swh/deposit/fixtures/__init__.py swh/deposit/fixtures/deposit_data.yaml swh/deposit/injection/__init__.py swh/deposit/injection/loader.py +swh/deposit/injection/scheduler.py swh/deposit/injection/tasks.py swh/deposit/migrations/0001_initial.py swh/deposit/migrations/0002_depositrequest_archive.py swh/deposit/migrations/0003_temporaryarchive.py swh/deposit/migrations/__init__.py -swh/deposit/scheduler/__init__.py -swh/deposit/scheduler/cli.py swh/deposit/service/__init__.py swh/deposit/service/clean_temporary_directory.py swh/deposit/settings/__init__.py @@ -104,6 +103,7 @@ swh/deposit/tests/api/test_deposit_atom.py swh/deposit/tests/api/test_deposit_binary.py swh/deposit/tests/api/test_deposit_delete.py swh/deposit/tests/api/test_deposit_multipart.py +swh/deposit/tests/api/test_deposit_read.py swh/deposit/tests/api/test_deposit_status.py swh/deposit/tests/api/test_deposit_update.py swh/deposit/tests/api/test_deposit_update_status.py diff --git a/swh.deposit.egg-info/requires.txt b/swh.deposit.egg-info/requires.txt index e02d4f80d20a44e46dfcafed386dd56b9ba47576..248f72186719f86d57c4ca97513b867c4094ca5a 100644 --- a/swh.deposit.egg-info/requires.txt +++ b/swh.deposit.egg-info/requires.txt @@ -2,12 +2,11 @@ Django click djangorestframework djangorestframework-xml -retrying swh.core>=0.0.14 swh.loader.tar>=0.0.27 -swh.scheduler>=0.0.17 vcversioner [injection] requests swh.loader.core>=0.0.19 +swh.scheduler>=0.0.17 diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py index bf612630683604ed8efcd296959291cac89a6b9f..d99563b2243c03aa51d996bfbf1fae66ef71a20b 100644 --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -4,12 +4,10 @@ # See top-level LICENSE file for more information from rest_framework import status -from rest_framework.parsers import JSONParser from .common import SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI -from ..errors import make_error_response, make_error_dict, BAD_REQUEST -from ..models import Deposit, DEPOSIT_STATUS_DETAIL +from ..errors import make_error_response, BAD_REQUEST from ..parsers import SWHFileUploadParser, SWHAtomEntryParser from ..parsers import SWHMultiPartParser @@ -155,42 +153,3 @@ class SWHUpdateMetadataDeposit(SWHPostDepositAPI, SWHPutDepositAPI, #protocoloperations_deleteconteiner """ return self._delete_deposit(collection_name, deposit_id) - - -class SWHUpdateStatusDeposit(SWHPutDepositAPI): - """Deposit request class to update the deposit's status. - - HTTP verbs supported: PUT - - """ - parser_classes = (JSONParser, ) - - def restrict_access(self, req, deposit=None): - """Remove restriction modification to 'partial' deposit. - Update is possible regardless of the existing status. - - """ - return None - - def process_put(self, req, headers, collection_name, deposit_id): - """Update the deposit's status - - Returns: - 204 No content - - """ - status = req.data.get('status') - if not status: - msg = 'The status key is mandatory with possible values %s' % list( - DEPOSIT_STATUS_DETAIL.keys()) - return make_error_dict(BAD_REQUEST, msg) - - if status not in DEPOSIT_STATUS_DETAIL: - msg = 'Possible status in %s' % list(DEPOSIT_STATUS_DETAIL.keys()) - return make_error_dict(BAD_REQUEST, msg) - - deposit = Deposit.objects.get(pk=deposit_id) - deposit.status = status - deposit.save() - - return {} diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index a88f790f0830e9fa4a87e3d383991a897a70b5f7..348abb1d3dbdab4d2c4d41c4ab4de607ac6414d7 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import json import os import shutil import tempfile @@ -10,9 +11,11 @@ import tempfile from rest_framework import status from swh.loader.tar import tarball +from swh.model import hashutil, identifiers from ..common import SWHGetDepositAPI, SWHPrivateAPIView from ...models import Deposit, DepositRequest, TemporaryArchive +from ...models import previous_revision_id def aggregate_tarballs(extraction_dir, archive_paths): @@ -142,3 +145,91 @@ class SWHDepositReadArchives(SWHGetDepositAPI, SWHPrivateAPIView): self.cleanup(directory_to_cleanup) return status.HTTP_200_OK, stream, 'application/octet-stream' + + +class SWHDepositReadMetadata(SWHGetDepositAPI, SWHPrivateAPIView): + """Class in charge of aggregating metadata on a deposit. + + """ + def _aggregate_metadata(self, deposit, metadata_requests): + """Retrieve and aggregates metadata information. + + """ + metadata = {} + for req in metadata_requests: + metadata.update(req.metadata) + + return metadata + + def aggregate(self, deposit, requests): + """Aggregate multiple data on deposit into one unified data dictionary. + + Args: + deposit (Deposit): Deposit concerned by the data aggregation. + requests ([DepositRequest]): List of associated requests which + need aggregation. + + Returns: + Dictionary of data representing the deposit to inject in swh. + + """ + data = {} + metadata_requests = [] + + # Retrieve tarballs/metadata information + metadata = self._aggregate_metadata(deposit, metadata_requests) + + # Read information metadata + data['origin'] = { + 'type': deposit.collection.name, + 'url': deposit.external_id, + } + + # revision + + fullname = deposit.client.get_full_name() + author_committer = { + 'name': deposit.client.last_name, + 'fullname': fullname, + 'email': deposit.client.email, + } + + revision_type = 'tar' + revision_msg = '%s: Deposit %s in collection %s' % ( + fullname, deposit.id, deposit.collection.name) + complete_date = identifiers.normalize_timestamp(deposit.complete_date) + + data['revision'] = { + 'synthetic': True, + 'date': complete_date, + 'committer_date': complete_date, + 'author': author_committer, + 'committer': author_committer, + 'type': revision_type, + 'message': revision_msg, + 'metadata': metadata, + } + + parent_revision = previous_revision_id(deposit.swh_id) + if parent_revision: + data['revision'] = { + 'parents': [hashutil.hash_to_bytes(parent_revision)] + } + + data['occurrence'] = { + 'branch': 'master' + } + + return data + + def process_get(self, req, collection_name, deposit_id): + deposit = Deposit.objects.get(pk=deposit_id) + requests = DepositRequest.objects.filter( + deposit=deposit, type=self.deposit_request_types['metadata']) + + data = self.aggregate(deposit, requests) + d = {} + if data: + d = json.dumps(data) + + return status.HTTP_200_OK, d, 'application/json' diff --git a/swh/deposit/api/urls.py b/swh/deposit/api/urls.py index bebc5a41f56b5aeca2c401472db5d8d0ae504562..69fd612babdde0fde5589ace169c80c9d8c0e25a 100644 --- a/swh/deposit/api/urls.py +++ b/swh/deposit/api/urls.py @@ -22,7 +22,7 @@ from django.conf.urls import url from ..config import EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI from ..config import SD_IRI, COL_IRI, STATE_IRI, PRIVATE_GET_RAW_CONTENT -from ..config import PRIVATE_PUT_DEPOSIT +from ..config import PRIVATE_PUT_DEPOSIT, PRIVATE_GET_DEPOSIT_METADATA from .deposit import SWHDeposit from .deposit_status import SWHDepositStatus from .deposit_update import SWHUpdateMetadataDeposit @@ -30,6 +30,7 @@ from .deposit_update import SWHUpdateArchiveDeposit from .deposit_content import SWHDepositContent from .service_document import SWHServiceDocument from .private.deposit_read import SWHDepositReadArchives +from .private.deposit_read import SWHDepositReadMetadata from .private.deposit_update_status import SWHUpdateStatusDeposit @@ -77,7 +78,14 @@ urlpatterns = [ url(r'^(?P<collection_name>[^/]+)/(?P<deposit_id>[^/]+)/raw/$', SWHDepositReadArchives.as_view(), name=PRIVATE_GET_RAW_CONTENT), + # Update deposit's status + # -> PUT url(r'^(?P<collection_name>[^/]+)/(?P<deposit_id>[^/]+)/update/$', SWHUpdateStatusDeposit.as_view(), name=PRIVATE_PUT_DEPOSIT), + # Retrieve metadata information on a specific deposit + # -> GET + url(r'^(?P<collection_name>[^/]+)/(?P<deposit_id>[^/]+)/meta/$', + SWHDepositReadMetadata.as_view(), + name=PRIVATE_GET_DEPOSIT_METADATA), ] diff --git a/swh/deposit/config.py b/swh/deposit/config.py index c9f38e6ef2449f0b33a05d1745df591f21f42f82..1408d2671d5d49128d572f39e2b8f2f97c34f359 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -17,6 +17,7 @@ COL_IRI = 'upload' STATE_IRI = 'status' PRIVATE_GET_RAW_CONTENT = 'private-download' PRIVATE_PUT_DEPOSIT = 'private-update' +PRIVATE_GET_DEPOSIT_METADATA = 'private-read' ARCHIVE_KEY = 'archive' METADATA_KEY = 'metadata' diff --git a/swh/deposit/injection/loader.py b/swh/deposit/injection/loader.py index f4cc5cde9aa0a51ba04671be31634f50e28d92ad..6043c7fc6c2d7c5df04c0bef0c8c8340765ad64b 100644 --- a/swh/deposit/injection/loader.py +++ b/swh/deposit/injection/loader.py @@ -3,22 +3,22 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime import os import requests import tempfile - from swh.model import hashutil from swh.loader.tar import loader from swh.loader.core.loader import SWHLoader -def retrieve_archive_to(archive_url, archive_path): +def retrieve_archive_to(archive_update_url, archive_path, log=None): """Retrieve the archive from the deposit to a local directory. Args: - archive_url (str): The full deposit archive(s)'s raw content + archive_update_url (str): The full deposit archive(s)'s raw content to retrieve locally archive_path (str): the local archive's path where to store @@ -29,30 +29,61 @@ def retrieve_archive_to(archive_url, archive_path): Or None if any problem arose. """ - r = requests.get(archive_url, stream=True) + r = requests.get(archive_update_url, stream=True) if r.ok: with open(archive_path, 'wb') as f: for chunk in r.iter_content(): f.write(chunk) return archive_path - return None + + msg = 'Problem when retrieving deposit archive at %s' % ( + archive_update_url, ) + if log: + log.error(msg) + + raise ValueError(msg) + + +def retrieve_metadata(metadata_url, log=None): + """Retrieve the metadata information on a given deposit. + + Args: + + metadata_url (str): The full deposit metadata url to retrieve + locally + + Returns: + The dictionary of metadata for that deposit or None if any + problem arose. + + """ + r = requests.get(metadata_url) + if r.ok: + data = r.json() + + return data + + msg = 'Problem when retrieving metadata at %s' % metadata_url + if log: + log.error(msg) + + raise ValueError(msg) -def update_deposit_status(archive_url, status, revision_id=None): +def update_deposit_status(update_status_url, status, revision_id=None): """Update the deposit's status. Args: - archive_url (str): the full deposit's archive + update_status_url (str): the full deposit's archive status (str): The status to update the deposit with revision_id (str/None): the revision's identifier to update to """ - update_url = archive_url.replace('/raw/', '/update/') payload = {'status': status} if revision_id: payload['revision_id'] = revision_id - requests.put(update_url, json=payload) + requests.put(update_status_url, json=payload) class DepositLoader(loader.TarLoader): @@ -71,36 +102,36 @@ class DepositLoader(loader.TarLoader): - update the deposit's status accordingly """ - def load(self, *, deposit_archive_url, origin, visit_date, revision): - occurrence = {'branch': 'master'} - SWHLoader.load(self, - deposit_archive_url=deposit_archive_url, - origin=origin, - visit_date=visit_date, - revision=revision, - occurrences=[occurrence]) - - def prepare(self, *, deposit_archive_url, origin, visit_date, revision, - occurrences): + def load(self, *, archive_url, deposit_meta_url, deposit_update_url): + SWHLoader.load( + self, + archive_url=archive_url, + deposit_meta_url=deposit_meta_url, + deposit_update_url=deposit_update_url) + + def prepare(self, *, archive_url, deposit_meta_url, deposit_update_url): """Prepare the injection by first retrieving the deposit's raw archive content. """ - self.archive_url = deposit_archive_url + self.deposit_update_url = deposit_update_url temporary_directory = tempfile.TemporaryDirectory() self.temporary_directory = temporary_directory archive_path = os.path.join(temporary_directory.name, 'archive.zip') - archive = retrieve_archive_to(deposit_archive_url, archive_path) + archive = retrieve_archive_to(archive_url, archive_path, log=self.log) - if not archive: - raise ValueError('Failure to retrieve archive') + metadata = retrieve_metadata(deposit_meta_url, log=self.log) + origin = metadata['origin'] + visit_date = datetime.datetime.now(tz=datetime.timezone.utc) + revision = metadata['revision'] + occurrence = metadata['occurrence'] - update_deposit_status(self.archive_url, 'injecting') + update_deposit_status(deposit_update_url, 'injecting') super().prepare(tar_path=archive, origin=origin, visit_date=visit_date, revision=revision, - occurrences=occurrences) + occurrences=[occurrence]) def post_load(self, success=True): """Updating the deposit's status according to its loading status. @@ -112,7 +143,8 @@ class DepositLoader(loader.TarLoader): """ try: if not success: - update_deposit_status(self.archive_url, status='failure') + update_deposit_status(self.deposit_update_url, + status='failure') return # first retrieve the new revision occs = list(self.storage.occurrence_get(self.origin_id)) @@ -121,7 +153,7 @@ class DepositLoader(loader.TarLoader): revision_id = hashutil.hash_to_hex(occ['target']) # then update the deposit's status to success with its # revision-id - update_deposit_status(self.archive_url, + update_deposit_status(self.deposit_update_url, status='success', revision_id=revision_id) except: diff --git a/swh/deposit/injection/scheduler.py b/swh/deposit/injection/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..e1b5e788c24ec7e52254069007326204c08481ae --- /dev/null +++ b/swh/deposit/injection/scheduler.py @@ -0,0 +1,204 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""Module in charge of sending deposit injection as celery task or +scheduled one-shot tasks. + +""" + +import click +import logging + +from abc import ABCMeta, abstractmethod +from celery import group + +from swh.core import utils +from swh.core.config import SWHConfig +from swh.deposit.config import setup_django_for + + +class SWHScheduling(SWHConfig, metaclass=ABCMeta): + """Base swh scheduling class to aggregate the schedule deposit + injection. + + """ + CONFIG_BASE_FILENAME = 'deposit/server' + + DEFAULT_CONFIG = { + 'dry_run': ('bool', False), + } + + def __init__(self): + super().__init__() + self.config = self.parse_config_file( + additional_configs=[self.ADDITIONAL_CONFIG]) + self.log = logging.getLogger('swh.deposit.scheduling') + + @abstractmethod + def schedule(self, deposits): + """Schedule the new deposit injection. + + Args: + data (dict): Deposit aggregated data + + Returns: + None + + """ + pass + + +class SWHCeleryScheduling(SWHScheduling): + """Deposit injection as Celery task scheduling. + + """ + ADDITIONAL_CONFIG = { + 'task_name': ('str', 'swh.deposit.tasks.LoadDepositArchive'), + } + + def __init__(self, config=None): + super().__init__() + from swh.scheduler import utils + self.task_name = self.config['task_name'] + self.task = utils.get_task(self.task_name) + if config: + self.config.update(**config) + self.dry_run = self.config['dry_run'] + + def _convert(self, deposits): + """Convert tuple to celery task signature. + + """ + task = self.task + for archive_url, deposit_meta_url, deposit_update_url in deposits: + yield task.s(archive_url=archive_url, + deposit_meta_url=deposit_meta_url, + deposit_update_url=deposit_update_url) + + def schedule(self, deposits): + """Schedule the new deposit injection directly through celery. + + Args: + depositdata (dict): Deposit aggregated information. + + Returns: + None + + """ + if self.dry_run: + return + + return group(self._convert(deposits)).delay() + + +class SWHScheduling(SWHScheduling): + """Deposit injection through SWH's task scheduling interface. + + """ + ADDITIONAL_CONFIG = {} + + def __init__(self, config=None): + super().__init__() + from swh.scheduler.backend import SchedulerBackend + if config: + self.config.update(**config) + self.dry_run = self.config['dry_run'] + self.scheduler = SchedulerBackend(**self.config) + + def _convert(self, deposits): + """Convert tuple to one-shot scheduling tasks. + + """ + import datetime + for archive_url, deposit_meta_url, deposit_update_url in deposits: + yield { + 'policy': 'oneshot', + 'type': 'swh-deposit-archive-ingestion', + 'next_run': datetime.datetime.now(tz=datetime.timezone.utc), + 'arguments': { + 'args': [], + 'kwargs': { + 'archive_url': archive_url, + 'deposit_meta_url': deposit_meta_url, + 'deposit_update_url': deposit_update_url, + }, + } + } + + def schedule(self, deposits): + """Schedule the new deposit injection through swh.scheduler's api. + + Args: + deposits (dict): Deposit aggregated information. + + """ + if self.dry_run: + return + + self.scheduler.create_tasks(self._convert(deposits)) + + +def get_deposit_ready(): + """Retrieve deposit ready to be task executed. + + """ + from swh.deposit.models import Deposit + yield from Deposit.objects.filter(status='ready') + + +def prepare_task_arguments(server): + """Convert deposit to argument for task to be executed. + + """ + from swh.deposit.config import PRIVATE_GET_RAW_CONTENT + from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA + from swh.deposit.config import PRIVATE_PUT_DEPOSIT + from django.core.urlresolvers import reverse + + for deposit in get_deposit_ready(): + args = [deposit.collection.name, deposit.id] + archive_url = '%s%s' % (server, reverse( + PRIVATE_GET_RAW_CONTENT, args=args)) + deposit_meta_url = '%s%s' % (server, reverse( + PRIVATE_GET_DEPOSIT_METADATA, args=args)) + deposit_update_url = '%s%s' % (server, reverse( + PRIVATE_PUT_DEPOSIT, args=args)) + + yield archive_url, deposit_meta_url, deposit_update_url + + +@click.command( + help='Schedule one-shot deposit injections') +@click.option('--platform', default='development', + help='development or production platform') +@click.option('--scheduling-method', default='celery', + help='Scheduling method') +@click.option('--server', default='http://127.0.0.1:5006', + help='Deposit server') +@click.option('--batch-size', default=1000, type=click.INT, + help='Task batch size') +@click.option('--dry-run/--no-dry-run', is_flag=True, default=False, + help='Dry run') +def main(platform, scheduling_method, server, batch_size, dry_run): + setup_django_for(platform) + + override_config = {} + if dry_run: + override_config['dry_run'] = dry_run + + if scheduling_method == 'celery': + scheduling = SWHCeleryScheduling(override_config) + elif scheduling_method == 'swh-scheduler': + scheduling = SWHScheduling(override_config) + else: + raise ValueError( + 'Only `celery` or `swh-scheduler` values are accepted') + + for deposits in utils.grouper(prepare_task_arguments(server), batch_size): + scheduling.schedule(deposits) + + +if __name__ == '__main__': + main() diff --git a/swh/deposit/injection/tasks.py b/swh/deposit/injection/tasks.py index 4989c4df95929732d97d24ae91b82f7e1bd09487..bb9038982320a601993e012af01700b7c1bfd13f 100644 --- a/swh/deposit/injection/tasks.py +++ b/swh/deposit/injection/tasks.py @@ -20,8 +20,7 @@ class LoadDepositArchive(Task): """ task_queue = 'swh_deposit_archive' - def run_task(self, *, deposit_archive_url, origin, visit_date, - revision): + def run_task(self, *, archive_url, deposit_meta_url, deposit_update_url): """Import a deposit tarball into swh. Args: see :func:`DepositLoader.load`. @@ -29,7 +28,6 @@ class LoadDepositArchive(Task): """ loader = DepositLoader() loader.log = self.log - loader.load(deposit_archive_url=deposit_archive_url, - origin=origin, - visit_date=visit_date, - revision=revision) + loader.load(archive_url=archive_url, + deposit_meta_url=deposit_meta_url, + deposit_update_url=deposit_update_url) diff --git a/swh/deposit/models.py b/swh/deposit/models.py index ad316777ac3c9dc124c3c935290ba3aaddcd50ce..a454f649bd8fb273c40389920e03c9d0b2f8e193 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -89,6 +89,22 @@ def format_swh_id(collection_name, revision_id): return 'swh-%s-%s' % (collection_name, revision_id) +def previous_revision_id(swh_id): + """Compute the parent's revision id (if any) from the swh_id. + + Args: + swh_id (id): SWH Identifier from a previous deposit. + + Returns: + None if no parent revision is detected. + The revision id's hash if any. + + """ + if swh_id: + return swh_id.split('-')[2] + return None + + class Deposit(models.Model): """Deposit reception table diff --git a/swh/deposit/scheduler/__init__.py b/swh/deposit/scheduler/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/swh/deposit/scheduler/cli.py b/swh/deposit/scheduler/cli.py deleted file mode 100644 index a2dfee8bf86e70e4bfd2f30d96e557d453bd6849..0000000000000000000000000000000000000000 --- a/swh/deposit/scheduler/cli.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright (C) 2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -"""Module in charge of scheduling deposit injection one-shot task to swh. - -""" - -import click -import logging - -from abc import ABCMeta, abstractmethod - -from swh.core.config import SWHConfig -from swh.deposit.config import setup_django_for -from swh.model import hashutil, identifiers - - -def previous_revision_id(swh_id): - """Compute the parent's revision id (if any) from the swh_id. - - Args: - swh_id (id): SWH Identifier from a previous deposit. - - Returns: - None if no parent revision is detected. - The revision id's hash if any. - - """ - if swh_id: - return swh_id.split('-')[2] - return None - - -class SWHScheduling(SWHConfig, metaclass=ABCMeta): - """Base swh scheduling class to aggregate the schedule deposit - injection. - - """ - CONFIG_BASE_FILENAME = 'deposit/server' - - DEFAULT_CONFIG = { - 'dry_run': ('bool', False), - } - - def __init__(self): - super().__init__() - self.config = self.parse_config_file( - additional_configs=[self.ADDITIONAL_CONFIG]) - self.log = logging.getLogger('swh.deposit.scheduling') - - def _aggregate_metadata(self, deposit, metadata_requests): - """Retrieve and aggregates metadata information. - - """ - metadata = {} - for req in metadata_requests: - metadata.update(req.metadata) - - return metadata - - def aggregate(self, deposit, deposit_archive_url, requests): - """Aggregate multiple data on deposit into one unified data dictionary. - - Args: - deposit (Deposit): Deposit concerned by the data aggregation. - deposit_archive_url (str): Url to retrieve a tarball from - the deposit instance - requests ([DepositRequest]): List of associated requests which - need aggregation. - - Returns: - Dictionary of data representing the deposit to inject in swh. - - """ - data = {} - metadata_requests = [] - - # Retrieve tarballs/metadata information - metadata = self._aggregate_metadata(deposit, metadata_requests) - - data['deposit_archive_url'] = deposit_archive_url - - # Read information metadata - data['origin'] = { - 'type': deposit.collection.name, - 'url': deposit.external_id, - } - - # revision - - fullname = deposit.client.get_full_name() - author_committer = { - 'name': deposit.client.last_name, - 'fullname': fullname, - 'email': deposit.client.email, - } - - revision_type = 'tar' - revision_msg = '%s: Deposit %s in collection %s' % ( - fullname, deposit.id, deposit.collection.name) - complete_date = identifiers.normalize_timestamp(deposit.complete_date) - - data['revision'] = { - 'synthetic': True, - 'date': complete_date, - 'committer_date': complete_date, - 'author': author_committer, - 'committer': author_committer, - 'type': revision_type, - 'message': revision_msg, - 'metadata': metadata, - } - - parent_revision = previous_revision_id(deposit.swh_id) - if parent_revision: - data['revision'] = { - 'parents': [hashutil.hash_to_bytes(parent_revision)] - } - - return data - - @abstractmethod - def schedule(self, deposit, data): - """Schedule the new deposit injection. - - Args: - deposit (Deposit): Deposit concerned by the data aggregation. - data (dict): Deposit aggregated data - - Returns: - None - - """ - pass - - -class SWHCeleryScheduling(SWHScheduling): - """Deposit injection as Celery task scheduling. - - """ - ADDITIONAL_CONFIG = { - 'task_name': ('str', 'swh.deposit.tasks.LoadDepositArchive'), - } - - def __init__(self, config=None): - super().__init__() - from swh.scheduler import utils - self.task_name = self.config['task_name'] - self.task = utils.get_task(self.task_name) - if config: - self.config.update(**config) - self.dry_run = self.config['dry_run'] - - def schedule(self, deposit_data): - """Schedule the new deposit injection directly through celery. - - Args: - deposit_data (dict): Deposit aggregated information. - - Returns: - None - - """ - deposit_archive_url = deposit_data['deposit_archive_url'] - origin = deposit_data['origin'] - visit_date = None # default to Now - revision = deposit_data['revision'] - - self.log.debug('args: %s %s %s %s' % ( - deposit_archive_url, origin, visit_date, revision)) - - if self.dry_run: - return - - return self.task.delay( - deposit_archive_url=deposit_archive_url, - origin=origin, - visit_date=visit_date, - revision=revision) - - -class SWHScheduling(SWHScheduling): - """Deposit injection through SWH's task scheduling interface. - - """ - ADDITIONAL_CONFIG = {} - - def __init__(self, config=None): - super().__init__() - from swh.scheduler.backend import SchedulerBackend - if config: - self.config.update(**config) - self.dry_run = self.config['dry_run'] - self.scheduler = SchedulerBackend(**self.config) - - def schedule(self, deposit_data): - """Schedule the new deposit injection through swh.scheduler's api. - - Args: - deposit_data (dict): Deposit aggregated information. - - """ - deposit_archive_url = deposit_data['deposit_archive_url'] - origin = deposit_data['origin'] - visit_date = None # default to Now - revision = deposit_data['revision'] - - self.log.debug('args: %s %s %s %s' % ( - deposit_archive_url, origin, visit_date, revision)) - - if self.dry_run: - return - - import datetime - task = { - 'policy': 'oneshot', - 'type': 'swh-deposit-archive-ingestion', - 'next_run': datetime.datetime.now(tz=datetime.timezone.utc), - 'arguments': { - 'args': [], - 'kwargs': { - 'deposit_archive_url': deposit_archive_url, - 'origin': origin, - 'visit_date': visit_date, - 'revision': revision - }, - } - } - self.scheduler.create_tasks([task]) - - -@click.command( - help='Schedule one-shot deposit injections') -@click.option('--platform', default='development', - help='development or production platform') -@click.option('--scheduling-method', default='celery', - help='Scheduling method') -@click.option('--server', default='http://127.0.0.1:5006', - help='Deposit server') -@click.option('--dry-run/--no-dry-run', is_flag=True, default=False, - help='Dry run') -def main(platform, scheduling_method, server, dry_run): - setup_django_for(platform) - - from swh.deposit.models import Deposit, DepositRequest, DepositRequestType - - override_config = {} - if dry_run: - override_config['dry_run'] = dry_run - - if scheduling_method == 'celery': - scheduling = SWHCeleryScheduling(override_config) - elif scheduling_method == 'swh-scheduler': - scheduling = SWHScheduling(override_config) - else: - raise ValueError( - 'Only `celery` or `swh-scheduler` values are accepted') - - from swh.deposit.config import PRIVATE_GET_RAW_CONTENT - from django.core.urlresolvers import reverse - - _request_types = DepositRequestType.objects.all() - deposit_request_types = { - type.name: type for type in _request_types - } - - deposits = Deposit.objects.filter(status='ready') - for deposit in deposits: - deposit_archive_url = '%s%s' % (server, reverse( - PRIVATE_GET_RAW_CONTENT, - args=[deposit.collection.name, deposit.id])) - - requests = DepositRequest.objects.filter( - deposit=deposit, type=deposit_request_types['metadata']) - - deposit_data = scheduling.aggregate( - deposit, deposit_archive_url, requests) - - scheduling.schedule(deposit_data) - - -if __name__ == '__main__': - main() diff --git a/swh/deposit/tests/api/test_deposit_read.py b/swh/deposit/tests/api/test_deposit_read.py new file mode 100644 index 0000000000000000000000000000000000000000..c203fda448385d671ecd5e5a502caa1d02f642e7 --- /dev/null +++ b/swh/deposit/tests/api/test_deposit_read.py @@ -0,0 +1,72 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json + +from django.core.urlresolvers import reverse +from rest_framework import status +from rest_framework.test import APITestCase + +from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA + +from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine + + +class DepositReadMetadataTest(APITestCase, WithAuthTestCase, BasicTestCase, + CommonCreationRoutine): + """Deposit access to read metadata information on deposit. + + """ + def test_access_to_an_existing_deposit_returns_metadata(self): + deposit_id = self.create_deposit_partial() + + url = reverse(PRIVATE_GET_DEPOSIT_METADATA, + args=[self.collection.name, deposit_id]) + + response = self.client.get(url) + + self.assertEqual(response.status_code, + status.HTTP_200_OK) + self.assertEquals(response._headers['content-type'][1], + 'application/json') + data = json.loads(response.content.decode('utf-8')) + + expected_meta = { + 'origin': { + 'url': 'some-external-id', + 'type': 'hal' + }, + 'revision': { + 'synthetic': True, + 'committer_date': None, + 'message': ': Deposit %s in collection hal' % deposit_id, + 'author': { + 'fullname': '', 'email': '', 'name': '' + }, + 'committer': { + 'fullname': '', 'email': '', 'name': '' + }, + 'date': None, + 'metadata': {}, + 'type': 'tar' + }, + 'occurrence': { + 'branch': 'master' + } + } + + self.assertEquals(data, expected_meta) + + def test_access_to_unexisting_collection_returns_404_response(self): + """Read unknown deposit should return a 404 response + + """ + unknown_id = '999' + url = reverse(PRIVATE_GET_DEPOSIT_METADATA, + args=[self.collection.name, unknown_id]) + + response = self.client.get(url) + self.assertEqual(response.status_code, + status.HTTP_404_NOT_FOUND) diff --git a/version.txt b/version.txt index 0305ff4f3f011cd4a65e4c92e4f58a0d341cb006..b5da016c58b10cf3460f82c68bbdd5fb5e40603d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.19-0-g662215d \ No newline at end of file +v0.0.20-0-g0df83b9 \ No newline at end of file