Skip to content
Snippets Groups Projects
Commit 89f5ccc7 authored by Boris Baldassari's avatar Boris Baldassari
Browse files

loader: add new maven-jar loader

The Maven loader loads jar and zip files as Maven artefacts into the Software Heritage archive.

Note:
Supersedes D6158 and addresses the review done in that diff.

Related to T1724
parent 5d22455c
No related branches found
No related tags found
1 merge request: !418 "Implement maven jar source files loader"
Showing
with 1111 additions and 0 deletions
......@@ -22,4 +22,5 @@ def swh_scheduler_celery_includes(swh_scheduler_celery_includes):
"swh.loader.package.npm.tasks",
"swh.loader.package.pypi.tasks",
"swh.loader.package.nixguix.tasks",
"swh.loader.package.maven.tasks",
]
......@@ -56,6 +56,15 @@ Here is an overview of the fields (+ internal version name + branch name) used b
- original author
- ``<codemeta: dateCreated>`` from SWORD XML
- revisions had parents
* - maven-loader
- passed as arg
- HEAD
- ``release_name(version)``
- "Synthetic release for archive at {p_info.url}\n"
- true
- ""
- passed as arg
- Only one artefact per url (jar/zip src)
* - nixguix
- URL
- URL
......
......@@ -63,6 +63,7 @@ setup(
loader.npm=swh.loader.package.npm:register
loader.opam=swh.loader.package.opam:register
loader.pypi=swh.loader.package.pypi:register
loader.maven=swh.loader.package.maven:register
""",
classifiers=[
"Programming Language :: Python :: 3",
......
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Any, Mapping
def register() -> Mapping[str, Any]:
    """Describe this worker module for loader registration.

    Returns:
        A mapping with two entries: ``task_modules``, the list of task
        modules belonging to this package (``<this package>.tasks``), and
        ``loader``, the loader class handling this visit type.
    """
    # Imported inside the function, so the loader module is only loaded
    # when register() is actually called.
    from .loader import MavenLoader

    tasks_module = __name__ + ".tasks"
    return dict(task_modules=[tasks_module], loader=MavenLoader)
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import hashlib
import json
import logging
from os import path
import string
from typing import (
Any,
Dict,
Iterator,
List,
Mapping,
Optional,
OrderedDict,
Sequence,
Tuple,
)
from urllib.parse import urlparse
import attr
import iso8601
import requests
from swh.loader.package.loader import (
BasePackageInfo,
PackageLoader,
PartialExtID,
RawExtrinsicMetadataCore,
)
from swh.loader.package.utils import EMPTY_AUTHOR, release_name
from swh.model.model import (
MetadataAuthority,
MetadataAuthorityType,
ObjectType,
RawExtrinsicMetadata,
Release,
Sha1Git,
TimestampWithTimezone,
)
from swh.storage.interface import StorageInterface
logger = logging.getLogger(__name__)
@attr.s
class MavenPackageInfo(BasePackageInfo):
    """Package information for one Maven artifact.

    Adds the Maven coordinates (group id, artifact id, version) and the
    artifact's last-update time to the fields inherited from
    ``BasePackageInfo``.
    """

    time = attr.ib(type=datetime)
    """Timestamp of the last update of jar file on the server."""
    gid = attr.ib(type=str)
    """Group ID of the maven artifact"""
    aid = attr.ib(type=str)
    """Artifact ID of the maven artifact"""
    version = attr.ib(type=str)
    """Version of the maven artifact"""

    # default format for maven artifacts
    MANIFEST_FORMAT = string.Template("$gid $aid $version $url $time")

    def extid(self, manifest_format: Optional[string.Template] = None) -> PartialExtID:
        """Compute the identifier of this package info.

        The manifest template is filled with the coordinates, url and time
        of this artifact, and the resulting text is hashed with SHA-256.

        Args:
            manifest_format: template to use instead of the class-level
                ``MANIFEST_FORMAT``.

        Returns:
            The pair ``("maven-jar", <sha256 digest of the manifest>)``.
        """
        template = manifest_format
        if not template:
            template = self.MANIFEST_FORMAT
        manifest_text = template.substitute(
            gid=self.gid,
            aid=self.aid,
            version=self.version,
            url=self.url,
            time=str(self.time),
        )
        digest = hashlib.sha256(manifest_text.encode()).digest()
        return ("maven-jar", digest)

    @classmethod
    def from_metadata(cls, a_metadata: Dict[str, Any]) -> "MavenPackageInfo":
        """Build a package info from one artifact description.

        Args:
            a_metadata: artifact description; the keys ``url``, ``time``,
                ``gid``, ``aid`` and ``version`` are required, ``filename``
                is optional.

        Returns:
            A new instance whose ``time`` is converted to UTC and whose
            directory extrinsic metadata holds the JSON dump of
            ``a_metadata`` under the ``maven-json`` format.
        """
        artifact_url = a_metadata["url"]
        given_filename = a_metadata.get("filename")
        # Parse the date first, then normalise it to UTC.
        updated_at = iso8601.parse_date(a_metadata["time"]).astimezone(tz=timezone.utc)
        group_id = a_metadata["gid"]
        artifact_id = a_metadata["aid"]
        artifact_version = a_metadata["version"]
        return cls(
            url=artifact_url,
            # Without an explicit file name, use the last segment of the url.
            filename=(
                given_filename if given_filename else path.split(artifact_url)[-1]
            ),
            time=updated_at,
            gid=group_id,
            aid=artifact_id,
            version=artifact_version,
            directory_extrinsic_metadata=[
                RawExtrinsicMetadataCore(
                    format="maven-json",
                    metadata=json.dumps(a_metadata).encode(),
                ),
            ],
        )
class MavenLoader(PackageLoader[MavenPackageInfo]):
    """Load source code jar origin's artifact files into swh archive"""

    visit_type = "maven"

    # Seconds to wait for the POM download before giving up; without a
    # timeout, requests waits indefinitely on a stalled server.
    POM_REQUEST_TIMEOUT = 60

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        artifacts: Sequence[Dict[str, Any]],
        extid_manifest_format: Optional[str] = None,
        max_content_size: Optional[int] = None,
    ):
        """Loader constructor.

        For now, this is the lister's task output.
        There is one, and only one, artefact (jar or zip) per version, as
        guaranteed by the Maven coordinates system.

        Args:
            storage: storage instance, forwarded to the parent constructor
            url: Origin url
            artifacts: List of single artifact information with keys:

               - **time**: the time of the last update of jar file on the
                 server as an iso8601 date string
               - **url**: the artifact url to retrieve filename
               - **filename**: optionally, the file's name
               - **gid**: artifact's groupId
               - **aid**: artifact's artifactId
               - **version**: artifact's version

            extid_manifest_format: template string used to format a manifest,
                which is hashed to get the extid of a package.
                Defaults to ``MavenPackageInfo.MANIFEST_FORMAT``.
                NOTE(review): this argument is not referenced anywhere in
                this class as written -- confirm whether it should be
                forwarded to ``MavenPackageInfo.extid``.
            max_content_size: forwarded to the parent constructor
        """
        # The text above is a plain string literal on purpose: an f-string in
        # this position is not recorded as the method's __doc__.
        super().__init__(storage=storage, url=url, max_content_size=max_content_size)
        self.artifacts = artifacts  # assume order is enforced in the lister
        # Artifacts indexed by version string; entries with a falsy version
        # are left out.
        self.version_artifact: OrderedDict[str, Dict[str, Any]] = OrderedDict(
            {str(jar["version"]): jar for jar in artifacts if jar["version"]}
        )

    def get_versions(self) -> Sequence[str]:
        """Return the known version strings, in ``version_artifact`` order."""
        return list(self.version_artifact.keys())

    def get_default_version(self) -> str:
        """Return the version of the last artifact passed to the constructor.

        Raises:
            IndexError: if the loader was built with no artifacts.
        """
        # Default version is the last item
        return self.artifacts[-1]["version"]

    def get_metadata_authority(self):
        """Return a forge-type authority rooted at the origin url's host.

        The authority url is ``<scheme>://<netloc>/`` of the origin url.
        """
        p_url = urlparse(self.url)
        return MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url=f"{p_url.scheme}://{p_url.netloc}/",
            metadata={},
        )

    def build_extrinsic_directory_metadata(
        self,
        p_info: MavenPackageInfo,
        release_id: Sha1Git,
        directory_id: Sha1Git,
    ) -> List[RawExtrinsicMetadata]:
        """Build the directory metadata, adding the artifact's POM to it.

        The POM url is derived from the artifact url: same parent path,
        file name ``<aid>-<version>.pom``. The POM body is used when the
        server answers 200; any other status yields empty POM metadata.

        Args:
            p_info: package info of the artifact being loaded
            release_id: forwarded to the parent implementation
            directory_id: forwarded to the parent implementation

        Returns:
            The parent implementation's result for a copy of ``p_info``
            carrying the ``maven-pom`` entry followed by the first original
            metadata entry; an empty list when ``p_info`` has no directory
            extrinsic metadata.

        Raises:
            requests.exceptions.RequestException: if the POM request fails
                at the transport level or exceeds ``POM_REQUEST_TIMEOUT``.
        """
        if not p_info.directory_extrinsic_metadata:
            # If this package loader doesn't write metadata, no need to require
            # an implementation for get_metadata_authority.
            return []

        # Get artifacts
        dir_ext_metadata = p_info.directory_extrinsic_metadata[0]
        a_metadata = json.loads(dir_ext_metadata.metadata)
        aid = a_metadata["aid"]
        version = a_metadata["version"]

        # Rebuild POM URL.
        pom_url = f"{path.dirname(p_info.url)}/{aid}-{version}.pom"

        response = requests.get(
            pom_url, allow_redirects=True, timeout=self.POM_REQUEST_TIMEOUT
        )
        metadata_pom = response.content if response.status_code == 200 else b""

        return super().build_extrinsic_directory_metadata(
            attr.evolve(
                p_info,
                directory_extrinsic_metadata=[
                    RawExtrinsicMetadataCore(
                        format="maven-pom",
                        metadata=metadata_pom,
                    ),
                    dir_ext_metadata,
                ],
            ),
            release_id=release_id,
            directory_id=directory_id,
        )

    def get_package_info(self, version: str) -> Iterator[Tuple[str, MavenPackageInfo]]:
        """Yield the single ``(release_name(version), package info)`` pair.

        Raises:
            KeyError: if ``version`` is not a key of ``version_artifact``.
        """
        a_metadata = self.version_artifact[version]
        yield release_name(a_metadata["version"]), MavenPackageInfo.from_metadata(
            a_metadata
        )

    def build_release(
        self, p_info: MavenPackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Release]:
        """Build the synthetic release pointing at the loaded directory.

        Args:
            p_info: package info providing the url, version and time
            uncompressed_path: not used by this implementation
            directory: identifier of the directory the release targets

        Returns:
            A synthetic release named after the version, dated with the
            artifact's time and authored by ``EMPTY_AUTHOR``.
        """
        msg = f"Synthetic release for archive at {p_info.url}\n".encode("utf-8")
        # p_info.time is a datetime (parsed from an iso8601 date)
        normalized_time = TimestampWithTimezone.from_datetime(p_info.time)
        return Release(
            name=p_info.version.encode(),
            message=msg,
            date=normalized_time,
            author=EMPTY_AUTHOR,
            target=directory,
            target_type=ObjectType.DIRECTORY,
            synthetic=True,
        )

    def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
        """Return the branches of the last snapshot, or ``{}`` if there is none."""
        last_snapshot = self.last_snapshot()
        return last_snapshot.to_dict()["branches"] if last_snapshot else {}
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.loader.package.maven.loader import MavenLoader
@shared_task(name=f"{__name__}.LoadMaven")
def load_jar_file(*, url=None, artifacts=None):
    """Celery task: build a ``MavenLoader`` from the config file and run it.

    Args:
        url: origin url handed to the loader
        artifacts: artifact descriptions handed to the loader

    Returns:
        Whatever the loader's ``load()`` returns.
    """
    maven_loader = MavenLoader.from_configfile(url=url, artifacts=artifacts)
    outcome = maven_loader.load()
    return outcome
File added
<?xml version="1.0" encoding="UTF-8"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<groupId>al.aldi</groupId>
<artifactId>sprova4j</artifactId>
<version>0.1.0</version>
<name>sprova4j</name>
<description>Java client for Sprova Test Management</description>
<url>https://github.com/aldialimucaj/sprova4j</url>
<inceptionYear>2018</inceptionYear>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>
<developers>
<developer>
<id>aldi</id>
<name>Aldi Alimucaj</name>
<email>aldi.alimucaj@gmail.com</email>
</developer>
</developers>
<scm>
<connection>scm:git:git://github.com/aldialimucaj/sprova4j.git</connection>
<developerConnection>scm:git:git://github.com/aldialimucaj/sprova4j.git</developerConnection>
<url>https://github.com/aldialimucaj/sprova4j</url>
</scm>
<dependencies>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.3</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.10.0</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.squareup.okio</groupId>
<artifactId>okio</artifactId>
<version>1.0.0</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.glassfish</groupId>
<artifactId>javax.json</artifactId>
<version>1.1.2</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>javax.json</groupId>
<artifactId>javax.json-api</artifactId>
<version>1.1.2</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<version>2.0.1.Final</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>mockwebserver</artifactId>
<version>3.10.0</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
File added
<?xml version="1.0" encoding="UTF-8"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<groupId>al.aldi</groupId>
<artifactId>sprova4j</artifactId>
<version>0.1.1</version>
<name>sprova4j</name>
<description>Java client for Sprova Test Management</description>
<url>https://github.com/aldialimucaj/sprova4j</url>
<inceptionYear>2018</inceptionYear>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>
<developers>
<developer>
<id>aldi</id>
<name>Aldi Alimucaj</name>
<email>aldi.alimucaj@gmail.com</email>
</developer>
</developers>
<scm>
<connection>https://github.com/aldialimucaj/sprova4j.git</connection>
<developerConnection>https://github.com/aldialimucaj/sprova4j.git</developerConnection>
<url>https://github.com/aldialimucaj/sprova4j</url>
</scm>
<dependencies>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.10.0</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.squareup.okio</groupId>
<artifactId>okio</artifactId>
<version>1.14.1</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.glassfish</groupId>
<artifactId>javax.json</artifactId>
<version>1.1.2</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>javax.json</groupId>
<artifactId>javax.json-api</artifactId>
<version>1.1.2</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<version>2.0.1.Final</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>mockwebserver</artifactId>
<version>3.10.0</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
This diff is collapsed.
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# Sample artifact list as handed to the LoadMaven task.
MVN_ARTIFACTS = [
    {
        # ISO 8601 string, which is what the loader documents and parses;
        # it denotes the same instant as epoch milliseconds 1626109619335.
        "time": "2021-07-12T17:06:59.335+00:00",
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/"
        + "sprova4j-0.1.0.jar",
        "gid": "al.aldi",
        "aid": "sprova4j",
        "filename": "sprova4j-0.1.0.jar",
        "version": "0.1.0",
    },
]
def test_tasks_jar_loader(
    mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config
):
    """The LoadMaven task calls the loader's ``load`` and relays its result."""
    # load() is replaced by a mock, so no artifact is actually fetched.
    patched_load = mocker.patch(
        "swh.loader.package.maven.loader.MavenLoader.load",
        return_value={"status": "eventful"},
    )

    task_kwargs = {"url": MVN_ARTIFACTS[0]["url"], "artifacts": MVN_ARTIFACTS}
    res = swh_scheduler_celery_app.send_task(
        "swh.loader.package.maven.tasks.LoadMaven", kwargs=task_kwargs
    )
    assert res

    res.wait()
    assert res.successful()
    assert patched_load.called
    assert res.result == {"status": "eventful"}
def test_tasks_jar_loader_snapshot_append(
    mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config
):
    """The LoadMaven task also runs when given an empty artifact list."""
    # load() is replaced by a mock, so no artifact is actually fetched.
    patched_load = mocker.patch(
        "swh.loader.package.maven.loader.MavenLoader.load",
        return_value={"status": "eventful"},
    )

    task_kwargs = {"url": MVN_ARTIFACTS[0]["url"], "artifacts": []}
    res = swh_scheduler_celery_app.send_task(
        "swh.loader.package.maven.tasks.LoadMaven", kwargs=task_kwargs
    )
    assert res

    res.wait()
    assert res.successful()
    assert patched_load.called
    assert res.result == {"status": "eventful"}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment