From 986c672ed1acf34c3f7c0e4f2d6e959b8d012278 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 1 Jun 2022 17:42:22 +0200
Subject: [PATCH] Add support for indexing from head releases

Needed since package loaders now create release objects instead
of revision objects.
---
 swh/indexer/metadata.py                   | 76 ++++++++++++++++++-----
 swh/indexer/origin_head.py                |  2 +-
 swh/indexer/tests/test_origin_head.py     |  7 ++-
 swh/indexer/tests/test_origin_metadata.py | 42 ++++++++++++-
 swh/indexer/tests/utils.py                | 62 +++++++++++++++++-
 5 files changed, 168 insertions(+), 21 deletions(-)

diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 5163c4a1..ac0920b8 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -32,10 +32,13 @@ from swh.indexer.storage.model import (
     OriginIntrinsicMetadataRow,
 )
 from swh.model import hashutil
-from swh.model.model import Directory, Origin, Sha1Git
-from swh.model.swhids import ObjectType
+from swh.model.model import Directory
+from swh.model.model import ObjectType as ModelObjectType
+from swh.model.model import Origin, Sha1Git
+from swh.model.swhids import CoreSWHID, ObjectType
 
 REVISION_GET_BATCH_SIZE = 10
+RELEASE_GET_BATCH_SIZE = 10
 ORIGIN_GET_BATCH_SIZE = 10
 
 
@@ -329,7 +332,8 @@ class OriginMetadataIndexer(
         self, origins: List[Origin], check_origin_known: bool = True, **kwargs
     ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]:
         head_rev_ids = []
-        origins_with_head = []
+        head_rel_ids = []
+        origin_heads: Dict[Origin, CoreSWHID] = {}
 
         # Filter out origins not in the storage
         if check_origin_known:
@@ -348,25 +352,63 @@ class OriginMetadataIndexer(
                 continue
             head_swhid = get_head_swhid(self.storage, origin.url)
             if head_swhid:
-                # TODO: add support for releases
-                assert head_swhid.object_type == ObjectType.REVISION, head_swhid
-                origins_with_head.append(origin)
-                head_rev_ids.append(head_swhid.object_id)
-
-        head_revs = list(
-            call_with_batches(
-                self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
+                origin_heads[origin] = head_swhid
+                if head_swhid.object_type == ObjectType.REVISION:
+                    head_rev_ids.append(head_swhid.object_id)
+                elif head_swhid.object_type == ObjectType.RELEASE:
+                    head_rel_ids.append(head_swhid.object_id)
+                else:
+                    assert False, head_swhid
+
+        head_revs = dict(
+            zip(
+                head_rev_ids,
+                call_with_batches(
+                    self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
+                ),
+            )
+        )
+        head_rels = dict(
+            zip(
+                head_rel_ids,
+                call_with_batches(
+                    self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE
+                ),
             )
         )
-        assert len(head_revs) == len(head_rev_ids)
 
         results = []
-        for (origin, rev) in zip(origins_with_head, head_revs):
-            if not rev:
-                self.log.warning("Missing head revision of origin %r", origin.url)
-                continue
+        for (origin, head_swhid) in origin_heads.items():
+            if head_swhid.object_type == ObjectType.REVISION:
+                rev = head_revs[head_swhid.object_id]
+                if not rev:
+                    self.log.warning(
+                        "Missing head object %s of origin %r", head_swhid, origin.url
+                    )
+                    continue
+                directory_id = rev.directory
+            elif head_swhid.object_type == ObjectType.RELEASE:
+                rel = head_rels[head_swhid.object_id]
+                if not rel:
+                    self.log.warning(
+                        "Missing head object %s of origin %r", head_swhid, origin.url
+                    )
+                    continue
+                if rel.target_type != ModelObjectType.DIRECTORY:
+                    # TODO: add support for non-directory release targets
+                    self.log.warning(
+                        "Head release %s of %r has unexpected target type %s",
+                        head_swhid,
+                        origin.url,
+                        rel.target_type,
+                    )
+                    continue
+                assert rel.target, rel
+                directory_id = rel.target
+            else:
+                assert False, head_swhid
 
-            for dir_metadata in self.directory_metadata_indexer.index(rev.directory):
+            for dir_metadata in self.directory_metadata_indexer.index(directory_id):
                 # There is at most one dir_metadata
                 orig_metadata = OriginIntrinsicMetadataRow(
                     from_directory=dir_metadata.id,
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
index 6e79e1e9..2d9ff6da 100644
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -113,7 +113,7 @@ def _try_resolve_target(
         elif branch.target_type == TargetType.DIRECTORY:
             return None  # TODO
         elif branch.target_type == TargetType.RELEASE:
-            return None  # TODO
+            return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target)
         else:
             assert False, branch
     except KeyError:
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
index 21f8637e..999084bb 100644
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -139,11 +139,16 @@ def test_deposit_missing_snapshot(storage):
 
 
 def test_pypi(storage):
-    origin_url = "https://pypi.org/project/limnoria/"
+    origin_url = "https://old-pypi.example.org/project/limnoria/"
     assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
         "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
     )
 
+    origin_url = "https://pypi.org/project/limnoria/"
+    assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+        "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
+    )
+
 
 def test_svn(storage):
     origin_url = "http://0-512-md.googlecode.com/svn/"
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 529680f0..f5179c77 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -29,7 +29,47 @@ def swh_indexer_config(swh_indexer_config):
     return cfg
 
 
-def test_origin_metadata_indexer(
+def test_origin_metadata_indexer_release(
+    swh_indexer_config,
+    idx_storage: IndexerStorageInterface,
+    storage: StorageInterface,
+    obj_storage,
+) -> None:
+    indexer = OriginMetadataIndexer(config=swh_indexer_config)
+    origin = "https://npm.example.org/yarn-parser"
+    indexer.run([origin])
+
+    tool = swh_indexer_config["tools"]
+
+    dir_id = DIRECTORY2.id
+    dir_metadata = DirectoryIntrinsicMetadataRow(
+        id=dir_id,
+        tool=tool,
+        metadata=YARN_PARSER_METADATA,
+        mappings=["npm"],
+    )
+    origin_metadata = OriginIntrinsicMetadataRow(
+        id=origin,
+        tool=tool,
+        from_directory=dir_id,
+        metadata=YARN_PARSER_METADATA,
+        mappings=["npm"],
+    )
+
+    dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    for dir_result in dir_results:
+        assert dir_result.tool
+        del dir_result.tool["id"]
+    assert dir_results == [dir_metadata]
+
+    orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
+    for orig_result in orig_results:
+        assert orig_result.tool
+        del orig_result.tool["id"]
+    assert orig_results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_revision(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index 45f48a2d..5171bae2 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -19,10 +19,12 @@ from swh.model.model import (
     Content,
     Directory,
     DirectoryEntry,
+    ObjectType,
     Origin,
     OriginVisit,
     OriginVisitStatus,
     Person,
+    Release,
     Revision,
     RevisionType,
     Snapshot,
@@ -46,10 +48,15 @@ ORIGIN_VISITS = [
         "type": "deposit",
         "origin": "https://forge.softwareheritage.org/source/jesuisgpl/",
     },
-    {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"},
+    {
+        "type": "pypi",
+        "origin": "https://old-pypi.example.org/project/limnoria/",
+    },  # with rev head
+    {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"},  # with rel head
     {"type": "svn", "origin": "http://0-512-md.googlecode.com/svn/"},
     {"type": "git", "origin": "https://github.com/librariesio/yarn-parser"},
     {"type": "git", "origin": "https://github.com/librariesio/yarn-parser.git"},
+    {"type": "git", "origin": "https://npm.example.org/yarn-parser"},
 ]
 
 ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS]
@@ -120,7 +127,26 @@ REVISION = Revision(
 
 REVISIONS = [REVISION]
 
+RELEASE = Release(
+    name=b"v0.0.0",
+    message=None,
+    author=Person(
+        name=b"Andrew Nesbitt",
+        fullname=b"Andrew Nesbitt <andrewnez@gmail.com>",
+        email=b"andrewnez@gmail.com",
+    ),
+    synthetic=False,
+    date=TimestampWithTimezone.from_datetime(
+        datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2)
+    ),
+    target_type=ObjectType.DIRECTORY,
+    target=DIRECTORY2.id,
+)
+
+RELEASES = [RELEASE]
+
 SNAPSHOTS = [
+    # https://github.com/SoftwareHeritage/swh-storage
     Snapshot(
         id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"),
         branches={
@@ -141,6 +167,7 @@ SNAPSHOTS = [
             ),
         },
     ),
+    # rsync://ftp.gnu.org/gnu/3dldf
     Snapshot(
         id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"),
         branches={
@@ -166,6 +193,7 @@ SNAPSHOTS = [
             ),
         },
     ),
+    # https://forge.softwareheritage.org/source/jesuisgpl/
     Snapshot(
         id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"),
         branches={
@@ -175,6 +203,7 @@ SNAPSHOTS = [
             )
         },
     ),
+    # https://old-pypi.example.org/project/limnoria/
     Snapshot(
         id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"),
         branches={
@@ -191,6 +220,23 @@ SNAPSHOTS = [
             ),
         },
     ),
+    # https://pypi.org/project/limnoria/
+    Snapshot(
+        branches={
+            b"HEAD": SnapshotBranch(
+                target=b"releases/2018.09.09", target_type=TargetType.ALIAS
+            ),
+            b"releases/2018.09.01": SnapshotBranch(
+                target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf",
+                target_type=TargetType.RELEASE,
+            ),
+            b"releases/2018.09.09": SnapshotBranch(
+                target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t",  # noqa
+                target_type=TargetType.RELEASE,
+            ),
+        },
+    ),
+    # http://0-512-md.googlecode.com/svn/
     Snapshot(
         id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"),
         branches={
@@ -200,6 +246,7 @@ SNAPSHOTS = [
             )
         },
     ),
+    # https://github.com/librariesio/yarn-parser
     Snapshot(
         id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
         branches={
@@ -209,6 +256,7 @@ SNAPSHOTS = [
             )
         },
     ),
+    # https://github.com/librariesio/yarn-parser.git
     Snapshot(
         id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
         branches={
@@ -218,8 +266,19 @@ SNAPSHOTS = [
             )
         },
     ),
+    # https://npm.example.org/yarn-parser
+    Snapshot(
+        branches={
+            b"HEAD": SnapshotBranch(
+                target=RELEASE.id,
+                target_type=TargetType.RELEASE,
+            )
+        },
+    ),
 ]
 
+assert len(SNAPSHOTS) == len(ORIGIN_VISITS)
+
 
 SHA1_TO_LICENSES = {
     "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"],
@@ -562,6 +621,7 @@ def fill_storage(storage):
     storage.origin_add(ORIGINS)
     storage.directory_add([DIRECTORY, DIRECTORY2])
     storage.revision_add(REVISIONS)
+    storage.release_add(RELEASES)
     storage.snapshot_add(SNAPSHOTS)
 
     for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
-- 
GitLab