From 31b4429ced3d66d880fd8a48aa624115d8ac91b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Gom=C3=A8s?= <rgomes@octobus.net>
Date: Mon, 14 Feb 2022 14:52:55 +0100
Subject: [PATCH] sourceforge: fix support for listing bzr origins

Bazaar support was removed a long time ago and predates a lot of the new
mechanisms in place in the API. Unfortunately, it looks like a lot of
the URLs are offline now, but there are still a few projects that can be
listed, and supporting them is pretty low-effort.
---
 swh/lister/sourceforge/lister.py              | 27 ++++++++--
 .../sourceforge/tests/data/bzr-repo.json      | 53 +++++++++++++++++++
 .../sourceforge/tests/data/subsitemap-1.xml   |  5 ++
 swh/lister/sourceforge/tests/test_lister.py   | 25 ++++++---
 4 files changed, 100 insertions(+), 10 deletions(-)
 create mode 100644 swh/lister/sourceforge/tests/data/bzr-repo.json

diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
index 6a519c47..71ee6158 100644
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -84,6 +84,9 @@ PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}"
 
 # Predictable URL for cloning (in the broad sense) a VCS registered for the project.
 #
+# Warning: does not apply to bzr repos, and Mercurial repos are http only; see the
+# use of this constant below.
+#
 # `vcs`: VCS type, one of `VCS_NAMES`
 # `namespace`: Project namespace. Very often `p`, but can be something else like
 #              `adobe`.
@@ -170,13 +173,24 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
         url_match = re.compile(
             r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
         )
+        bzr_url_match = re.compile(
+            r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzrroot/([^/]+)"
+        )
+
         for origin in stream:
             url = origin.url
             match = url_match.match(url)
-            assert match is not None
-            matches = match.groupdict()
-            namespace = matches["namespace"]
-            project = matches["project"]
+            if match is None:
+                # Should be a bzr special endpoint
+                match = bzr_url_match.match(url)
+                assert match is not None
+                matches = match.groupdict()
+                project = matches["project"]
+                namespace = "p"  # no special namespacing for bzr projects
+            else:
+                matches = match.groupdict()
+                namespace = matches["namespace"]
+                project = matches["project"]
             # "Last modified" dates are the same across all VCS (tools, even)
             # within a project or subproject. An assertion here would be overkill.
             last_modified = origin.last_update
@@ -356,6 +370,11 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
                 # SourceForge does not yet support anonymous HTTPS cloning for Mercurial
                 # See https://sourceforge.net/p/forge/feature-requests/727/
                 url = url.replace("https://", "http://")
+            if tool_name == VcsNames.BAZAAR.value:
+                # SourceForge has removed support for bzr and only keeps legacy projects
+                # around at a separate (also not https) URL. Bzr projects are very rare
+                # and a lot of them are 404 now.
+                url = f"http://{project}.bzr.sourceforge.net/bzrroot/{project}"
             entry = SourceForgeListerEntry(
                 vcs=VcsNames(tool_name), url=url, last_modified=last_modified
             )
diff --git a/swh/lister/sourceforge/tests/data/bzr-repo.json b/swh/lister/sourceforge/tests/data/bzr-repo.json
new file mode 100644
index 00000000..380e8e6d
--- /dev/null
+++ b/swh/lister/sourceforge/tests/data/bzr-repo.json
@@ -0,0 +1,53 @@
+{
+    "shortname": "bzr-repo",
+    "name": "Bazaar repo",
+    "_id": "4bf3fc291be1ce2f10000052",
+    "url": "https://sourceforge.net/p/bzr-repo/",
+    "private": false,
+    "short_description": "This is an example bzr project",
+    "creation_date": "2009-10-10",
+    "summary": "",
+    "external_homepage": "",
+    "video_url": "",
+    "socialnetworks": [],
+    "status": "active",
+    "moved_to_url": "",
+    "preferred_support_tool": "",
+    "preferred_support_url": "",
+    "developers": [
+      {
+        "username": "Alphare",
+        "name": "Raphaël Gomès",
+        "url": "https://sourceforge.net/u/alphare/"
+      }
+    ],
+    "tools": [
+      {
+        "name": "bzr",
+        "mount_point": "bzr",
+        "url": "/p/bzr-repo/bazaar/",
+        "icons": {
+          "24": "images/code_24.png",
+          "32": "images/code_32.png",
+          "48": "images/code_48.png"
+        },
+        "installable": true,
+        "tool_label": "Bazaar",
+        "mount_label": "Bazaar"
+      }
+    ],
+    "labels": [],
+    "categories": {
+      "audience": [],
+      "developmentstatus": [],
+      "environment": [],
+      "language": [],
+      "license": [],
+      "translation": [],
+      "os": [],
+      "database": [],
+      "topic": []
+    },
+    "icon_url": null,
+    "screenshots": []
+  }
diff --git a/swh/lister/sourceforge/tests/data/subsitemap-1.xml b/swh/lister/sourceforge/tests/data/subsitemap-1.xml
index fcb468bb..290800b3 100644
--- a/swh/lister/sourceforge/tests/data/subsitemap-1.xml
+++ b/swh/lister/sourceforge/tests/data/subsitemap-1.xml
@@ -40,4 +40,9 @@
     <lastmod>2019-05-02</lastmod>
     <changefreq>daily</changefreq>
 </url>
+<url>
+    <loc>https://sourceforge.net/p/bzr-repo/</loc>
+    <lastmod>2021-01-27</lastmod>
+    <changefreq>daily</changefreq>
+</url>
 </urlset>
diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py
index 55b4669d..9bb9a7cf 100644
--- a/swh/lister/sourceforge/tests/test_lister.py
+++ b/swh/lister/sourceforge/tests/test_lister.py
@@ -29,6 +29,7 @@ TEST_PROJECTS = {
     "adobexmp": "adobe",
     "backapps": "p",
     "backapps/website": "p",
+    "bzr-repo": "p",
     "mojunk": "p",
     "mramm": "p",
     "os3dmodels": "p",
@@ -79,6 +80,7 @@ def _check_listed_origins(lister, swh_scheduler):
         "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
         "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
         "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
+        "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
     }
 
 
@@ -119,9 +121,10 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
     # - mojunk (3 repos),
     # - backapps/website (1 repo),
     # - random-mercurial (1 repo).
+    # - bzr-repo (1 repo).
     # adobe and backapps itself have no repos.
-    assert stats.pages == 5
-    assert stats.origins == 10
+    assert stats.pages == 6
+    assert stats.origins == 11
     expected_state = {
         "subsitemap_last_modified": {
             "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
@@ -239,6 +242,12 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
             url="http://hg.code.sf.net/p/random-mercurial/hg",
             last_update=iso8601.parse_date("2019-05-02"),
         ),
+        ListedOrigin(
+            lister_id=lister.lister_obj.id,
+            visit_type="bzr",
+            url="http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo",
+            last_update=iso8601.parse_date("2021-01-27"),
+        ),
     ]
     swh_scheduler.record_listed_origins(faked_listed_origins)
 
@@ -319,9 +328,10 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
     # - mojunk (3 repos),
     # - backapps/website (1 repo),
     # - random-mercurial (1 repo).
+    # - bzr-repo (1 repo).
     # adobe and backapps itself have no repos.
-    assert stats.pages == 5
-    assert stats.origins == 10
+    assert stats.pages == 6
+    assert stats.origins == 11
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
     assert {o.url: o.visit_type for o in scheduler_origins} == {
@@ -335,6 +345,7 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
         "https://git.code.sf.net/p/mojunk/git2": "git",
         "https://svn.code.sf.net/p/mojunk/svn": "svn",
         "http://hg.code.sf.net/p/random-mercurial/hg": "hg",
+        "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr",
     }
 
     # Test `time.sleep` is called with exponential retries
@@ -402,10 +413,11 @@ def test_sourceforge_lister_project_error(
     # - mojunk (3 repos),
     # - backapps/website (1 repo),
     # - random-mercurial (1 repo).
+    # - bzr-repo (1 repo).
     # adobe and backapps itself have no repos.
     # Did *not* list mramm
-    assert stats.pages == 4
-    assert stats.origins == 7
+    assert stats.pages == 5
+    assert stats.origins == 8
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
     res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
@@ -418,4 +430,5 @@ def test_sourceforge_lister_project_error(
         "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
         "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
         "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
+        "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
     }
-- 
GitLab