From 6a7479553e6be3a84790001c150bba39be44864d Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Tue, 15 Feb 2022 22:16:45 +0100
Subject: [PATCH] sourceforge: Fix origin URLs for CVS projects

CVS projects differ from other VCS ones: they use the rsync protocol,
a list of modules needs to be fetched from an info page, and multiple
origin URLs can be produced for the same project.

Related to T3789
---
 swh/lister/sourceforge/lister.py              |  30 +++
 swh/lister/sourceforge/tests/data/aaron.html  |  23 ++
 swh/lister/sourceforge/tests/data/aaron.json  | 236 ++++++++++++++++++
 .../sourceforge/tests/data/subsitemap-0.xml   |  15 ++
 swh/lister/sourceforge/tests/test_lister.py   |  56 +++--
 5 files changed, 340 insertions(+), 20 deletions(-)
 create mode 100644 swh/lister/sourceforge/tests/data/aaron.html
 create mode 100644 swh/lister/sourceforge/tests/data/aaron.json

diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
index 71ee6158..c0153c57 100644
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -10,6 +10,7 @@ import re
 from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
 from xml.etree import ElementTree
 
+from bs4 import BeautifulSoup
 import iso8601
 import requests
 from tenacity.before_sleep import before_sleep_log
@@ -360,6 +361,35 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
             tool_name = tool["name"]
             if tool_name not in VCS_NAMES:
                 continue
+            if tool_name == VcsNames.CVS.value:
+                # CVS projects differ from other VCS ones: they use the rsync
+                # protocol, a list of modules needs to be fetched from an info
+                # page, and multiple origin URLs can be produced per project.
+                cvs_info_url = f"http://{project}.cvs.sourceforge.net"
+                try:
+                    response = self.page_request(cvs_info_url, params={})
+                except requests.HTTPError:
+                    logger.warning(
+                        "CVS info page could not be fetched, skipping project '%s'",
+                        project,
+                    )
+                    continue
+                else:
+                    bs = BeautifulSoup(response.text, features="html.parser")
+                    cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
+                    for text in [b.text for b in bs.find_all("b")]:
+                        match = re.search(fr".*/cvsroot/{project} co -P (.+)", text)
+                        if match is not None:
+                            module = match.group(1)
+                            url = f"{cvs_base_url}/{project}/{module}"
+                            hits.append(
+                                SourceForgeListerEntry(
+                                    vcs=VcsNames(tool_name),
+                                    url=url,
+                                    last_modified=last_modified,
+                                )
+                            )
+                    continue
             url = CLONE_URL_FORMAT.format(
                 vcs=tool_name,
                 namespace=namespace,
diff --git a/swh/lister/sourceforge/tests/data/aaron.html b/swh/lister/sourceforge/tests/data/aaron.html
new file mode 100644
index 00000000..5b1c2266
--- /dev/null
+++ b/swh/lister/sourceforge/tests/data/aaron.html
@@ -0,0 +1,23 @@
+<html><head>
+  <meta name="generator" content="cvs-info" />
+  <meta name="description" content="The world's largest development and download repository of Open Source code and applications" />
+  <meta name="keywords" content="Open Source, Development, Developers, Projects, Downloads, OSTG, VA Software, SF.net, SourceForge" />
+
+  <title>CVS Info for project aaron</title>
+
+  <link rel="shortcut icon" href="https://sourceforge.net/favicon.ico" />
+
+</head>
+<body>
+
+<p> The aaron project's CVS data is in read-only mode, so the project may have switched over to another source-code-management system. To check, visit the <a href="https://sourceforge.net/projects/aaron">Project Summary Page for aaron</a> and see if the menubar lists a newer code repository, such as SVN or Git.
+
+<p>The CVS data can be accessed as follows.
+You can run a per-module CVS checkout via pserver protocol:
+<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P aaron</b></li>
+<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P www</b></li>
+<p>You can view a list of files or copy all the CVS repository data via rsync (the 1st command lists the files, the 2nd copies):
+<li><b>rsync -a a.cvs.sourceforge.net::cvsroot/aaron/</b></li>
+<li><b>rsync -ai a.cvs.sourceforge.net::cvsroot/aaron/ /my/local/dest/dir/</b></li>
+
+<p>If you are a project admin for aaron, you can request that this page redirect to another repo on your project by submitting a <a href="https://sourceforge.net/support">support request</a>.
diff --git a/swh/lister/sourceforge/tests/data/aaron.json b/swh/lister/sourceforge/tests/data/aaron.json
new file mode 100644
index 00000000..8eea8e9a
--- /dev/null
+++ b/swh/lister/sourceforge/tests/data/aaron.json
@@ -0,0 +1,236 @@
+{
+  "shortname": "aaron",
+  "name": "Aaron: the app, service, and net monitor",
+  "_id": "5139010d5fcbc97960fd66bb",
+  "url": "https://sourceforge.net/p/aaron/",
+  "private": false,
+  "short_description": "Aaron is an application, service, and network availability monitoring and alert daemon.  Notification of unavailable services, networks, etc., levels is sent to the appropriate roles.  Aaron is highly customizable enterprise class monitoring software.",
+  "creation_date": "2001-06-24",
+  "summary": "",
+  "external_homepage": "http://aaron.sourceforge.net",
+  "video_url": "",
+  "socialnetworks": [],
+  "status": "active",
+  "moved_to_url": "",
+  "preferred_support_tool": "",
+  "preferred_support_url": "",
+  "developers": [
+    {
+      "username": "kapelmeister",
+      "name": "Steve Nickels",
+      "url": "https://sourceforge.net/u/kapelmeister/"
+    },
+    {
+      "username": "thetitan",
+      "name": "Sean Chittenden",
+      "url": "https://sourceforge.net/u/thetitan/"
+    },
+    {
+      "username": "stwalker",
+      "name": "Scott Walker",
+      "url": "https://sourceforge.net/u/stwalker/"
+    }
+  ],
+  "tools": [
+    {
+      "name": "support",
+      "mount_point": "support",
+      "url": "/p/aaron/support/",
+      "icons": {
+        "24": "images/sftheme/24x24/blog_24.png",
+        "32": "images/sftheme/32x32/blog_32.png",
+        "48": "images/sftheme/48x48/blog_48.png"
+      },
+      "installable": false,
+      "tool_label": "Support",
+      "mount_label": "Support"
+    },
+    {
+      "name": "mailman",
+      "mount_point": "mailman",
+      "url": "/p/aaron/mailman/",
+      "icons": {
+        "24": "images/forums_24.png",
+        "32": "images/forums_32.png",
+        "48": "images/forums_48.png"
+      },
+      "installable": false,
+      "tool_label": "Mailing Lists",
+      "mount_label": "Mailing Lists"
+    },
+    {
+      "name": "reviews",
+      "mount_point": "reviews",
+      "url": "/p/aaron/reviews/",
+      "icons": {
+        "24": "images/sftheme/24x24/blog_24.png",
+        "32": "images/sftheme/32x32/blog_32.png",
+        "48": "images/sftheme/48x48/blog_48.png"
+      },
+      "installable": false,
+      "tool_label": "Reviews",
+      "mount_label": "Reviews"
+    },
+    {
+      "name": "wiki",
+      "mount_point": "wiki",
+      "url": "/p/aaron/wiki/",
+      "icons": {
+        "24": "images/wiki_24.png",
+        "32": "images/wiki_32.png",
+        "48": "images/wiki_48.png"
+      },
+      "installable": true,
+      "tool_label": "Wiki",
+      "mount_label": "Wiki"
+    },
+    {
+      "name": "summary",
+      "mount_point": "summary",
+      "url": "/p/aaron/summary/",
+      "icons": {
+        "24": "images/sftheme/24x24/blog_24.png",
+        "32": "images/sftheme/32x32/blog_32.png",
+        "48": "images/sftheme/48x48/blog_48.png"
+      },
+      "installable": false,
+      "tool_label": "Summary",
+      "mount_label": "Summary",
+      "sourceforge_group_id": 29993
+    },
+    {
+      "name": "files-sf",
+      "mount_point": "files",
+      "url": "/p/aaron/files/",
+      "icons": {
+        "24": "images/downloads_24.png",
+        "32": "images/downloads_32.png",
+        "48": "images/downloads_48.png"
+      },
+      "installable": false,
+      "tool_label": "Files",
+      "mount_label": "Files"
+    },
+    {
+      "name": "cvs",
+      "mount_point": "code",
+      "url": "/p/aaron/code/",
+      "icons": {
+        "24": "images/code_24.png",
+        "32": "images/code_32.png",
+        "48": "images/code_48.png"
+      },
+      "installable": false,
+      "tool_label": "CVS",
+      "mount_label": "Code"
+    },
+    {
+      "name": "activity",
+      "mount_point": "activity",
+      "url": "/p/aaron/activity/",
+      "icons": {
+        "24": "images/admin_24.png",
+        "32": "images/admin_32.png",
+        "48": "images/admin_48.png"
+      },
+      "installable": false,
+      "tool_label": "Tool",
+      "mount_label": "Activity"
+    },
+    {
+      "name": "discussion",
+      "mount_point": "discussion",
+      "url": "/p/aaron/discussion/",
+      "icons": {
+        "24": "images/forums_24.png",
+        "32": "images/forums_32.png",
+        "48": "images/forums_48.png"
+      },
+      "installable": true,
+      "tool_label": "Discussion",
+      "mount_label": "Discussion"
+    }
+  ],
+  "labels": [],
+  "categories": {
+    "audience": [
+      {
+        "id": 4,
+        "shortname": "sysadmins",
+        "fullname": "System Administrators",
+        "fullpath": "Intended Audience :: by End-User Class :: System Administrators"
+      }
+    ],
+    "developmentstatus": [
+      {
+        "id": 8,
+        "shortname": "prealpha",
+        "fullname": "2 - Pre-Alpha",
+        "fullpath": "Development Status :: 2 - Pre-Alpha"
+      },
+      {
+        "id": 7,
+        "shortname": "planning",
+        "fullname": "1 - Planning",
+        "fullpath": "Development Status :: 1 - Planning"
+      }
+    ],
+    "environment": [
+      {
+        "id": 238,
+        "shortname": "daemon",
+        "fullname": "Non-interactive (Daemon)",
+        "fullpath": "User Interface :: Non-interactive (Daemon)"
+      }
+    ],
+    "language": [
+      {
+        "id": 164,
+        "shortname": "c",
+        "fullname": "C",
+        "fullpath": "Programming Language :: C"
+      },
+      {
+        "id": 293,
+        "shortname": "ruby",
+        "fullname": "Ruby",
+        "fullpath": "Programming Language :: Ruby"
+      }
+    ],
+    "license": [
+      {
+        "id": 296,
+        "shortname": "apache",
+        "fullname": "Apache Software License",
+        "fullpath": "License :: OSI-Approved Open Source :: Apache Software License"
+      }
+    ],
+    "translation": [
+      {
+        "id": 275,
+        "shortname": "english",
+        "fullname": "English",
+        "fullpath": "Translations :: English"
+      }
+    ],
+    "os": [
+      {
+        "id": 235,
+        "shortname": "independent",
+        "fullname": "OS Independent (Written in an interpreted language)",
+        "fullpath": "Operating System :: Grouping and Descriptive Categories :: OS Independent (Written in an interpreted language)"
+      }
+    ],
+    "database": [],
+    "topic": [
+      {
+        "id": 152,
+        "shortname": "monitoring",
+        "fullname": "Monitoring",
+        "fullpath": "Topic :: System :: Networking :: Monitoring"
+      }
+    ]
+  },
+  "icon_url": null,
+  "screenshots": []
+}
\ No newline at end of file
diff --git a/swh/lister/sourceforge/tests/data/subsitemap-0.xml b/swh/lister/sourceforge/tests/data/subsitemap-0.xml
index 5f2cba85..451554a2 100644
--- a/swh/lister/sourceforge/tests/data/subsitemap-0.xml
+++ b/swh/lister/sourceforge/tests/data/subsitemap-0.xml
@@ -1,5 +1,20 @@
 <?xml version="1.0" encoding="utf-8"?>
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <url>
+        <loc>https://sourceforge.net/projects/aaron/files/</loc>
+        <lastmod>2013-03-07</lastmod>
+        <changefreq>daily</changefreq>
+    </url>
+    <url>
+        <loc>https://sourceforge.net/p/aaron/home/</loc>
+        <lastmod>2013-03-07</lastmod>
+        <changefreq>daily</changefreq>
+    </url>
+    <url>
+        <loc>https://sourceforge.net/p/aaron/tickets/</loc>
+        <lastmod>2013-03-07</lastmod>
+        <changefreq>daily</changefreq>
+    </url>
     <url>
         <loc>https://sourceforge.net/projects/os3dmodels/files/</loc>
         <lastmod>2017-03-31</lastmod>
diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py
index 9bb9a7cf..3dfa595b 100644
--- a/swh/lister/sourceforge/tests/test_lister.py
+++ b/swh/lister/sourceforge/tests/test_lister.py
@@ -26,6 +26,7 @@ from swh.lister.utils import WAIT_EXP_BASE
 from swh.scheduler.model import ListedOrigin
 
 TEST_PROJECTS = {
+    "aaron": "p",
     "adobexmp": "adobe",
     "backapps": "p",
     "backapps/website": "p",
@@ -62,6 +63,10 @@ def get_project_json(datadir, request, context):
     return json.loads(Path(datadir, f"{project}.json").read_text())
 
 
+def get_cvs_info_page(datadir):  # text content of the aaron.html fixture
+    return Path(datadir, "aaron.html").read_text()
+
+
 def _check_request_headers(request):
     return request.headers.get("User-Agent") == USER_AGENT
 
@@ -81,6 +86,8 @@ def _check_listed_origins(lister, swh_scheduler):
         "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
         "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
         "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
+        "rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"),
+        "rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"),
     }
 
 
@@ -114,6 +121,11 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
         json=functools.partial(get_project_json, datadir),
         additional_matcher=_check_request_headers,
     )
+    requests_mock.get(
+        re.compile("http://aaron.cvs.sourceforge.net/"),
+        text=get_cvs_info_page(datadir),
+        additional_matcher=_check_request_headers,
+    )
 
     stats = lister.run()
     # - os3dmodels (2 repos),
@@ -123,8 +135,8 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
     # - random-mercurial (1 repo).
     # - bzr-repo (1 repo).
     # adobe and backapps itself have no repos.
-    assert stats.pages == 6
-    assert stats.origins == 11
+    assert stats.pages == 7
+    assert stats.origins == 13
     expected_state = {
         "subsitemap_last_modified": {
             "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
@@ -178,6 +190,12 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
         additional_matcher=_check_request_headers,
     )
 
+    requests_mock.get(
+        re.compile("http://aaron.cvs.sourceforge.net/"),
+        text=get_cvs_info_page(datadir),
+        additional_matcher=_check_request_headers,
+    )
+
     faked_listed_origins = [
         # mramm: changed
         ListedOrigin(
@@ -272,8 +290,8 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
 
     stats = lister.run()
     # - mramm (3 repos),  # changed
-    assert stats.pages == 1
-    assert stats.origins == 3
+    assert stats.pages == 2
+    assert stats.origins == 5
     expected_state = {
         "subsitemap_last_modified": {
             "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
@@ -322,6 +340,12 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
         additional_matcher=_check_request_headers,
     )
 
+    requests_mock.get(
+        re.compile("http://aaron.cvs.sourceforge.net/"),
+        text=get_cvs_info_page(datadir),
+        additional_matcher=_check_request_headers,
+    )
+
     stats = lister.run()
     # - os3dmodels (2 repos),
     # - mramm (3 repos),
@@ -330,23 +354,10 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
     # - random-mercurial (1 repo).
     # - bzr-repo (1 repo).
     # adobe and backapps itself have no repos.
-    assert stats.pages == 6
-    assert stats.origins == 11
+    assert stats.pages == 7
+    assert stats.origins == 13
 
-    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
-    assert {o.url: o.visit_type for o in scheduler_origins} == {
-        "https://svn.code.sf.net/p/backapps/website/code": "svn",
-        "https://git.code.sf.net/p/os3dmodels/git": "git",
-        "https://svn.code.sf.net/p/os3dmodels/svn": "svn",
-        "https://git.code.sf.net/p/mramm/files": "git",
-        "https://git.code.sf.net/p/mramm/git": "git",
-        "https://svn.code.sf.net/p/mramm/svn": "svn",
-        "https://git.code.sf.net/p/mojunk/git": "git",
-        "https://git.code.sf.net/p/mojunk/git2": "git",
-        "https://svn.code.sf.net/p/mojunk/svn": "svn",
-        "http://hg.code.sf.net/p/random-mercurial/hg": "hg",
-        "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr",
-    }
+    _check_listed_origins(lister, swh_scheduler)
 
     # Test `time.sleep` is called with exponential retries
     assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1])
@@ -408,6 +419,11 @@ def test_sourceforge_lister_project_error(
         re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code
     )
 
+    # Make request to CVS info page fail
+    requests_mock.get(
+        re.compile("http://aaron.cvs.sourceforge.net/"), status_code=status_code
+    )
+
     stats = lister.run()
     # - os3dmodels (2 repos),
     # - mojunk (3 repos),
-- 
GitLab