Skip to content
Snippets Groups Projects
Commit 6a747955 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

sourceforge: Fix origin URLs for CVS projects

CVS projects are different from other VCS ones: they use the rsync
protocol, a list of modules needs to be fetched from an info page,
and multiple origin URLs can be produced for the same project.

Related to T3789
parent 4265e5dd
No related branches found
Tags v2.7.0
1 merge request!264sourceforge: Fix origin URLs for CVS projects
...@@ -10,6 +10,7 @@ import re ...@@ -10,6 +10,7 @@ import re
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from xml.etree import ElementTree from xml.etree import ElementTree
from bs4 import BeautifulSoup
import iso8601 import iso8601
import requests import requests
from tenacity.before_sleep import before_sleep_log from tenacity.before_sleep import before_sleep_log
...@@ -360,6 +361,35 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): ...@@ -360,6 +361,35 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
tool_name = tool["name"] tool_name = tool["name"]
if tool_name not in VCS_NAMES: if tool_name not in VCS_NAMES:
continue continue
if tool_name == VcsNames.CVS.value:
# CVS projects are different from other VCS ones: they use the rsync
# protocol, a list of modules needs to be fetched from an info page,
# and multiple origin URLs can be produced for the same project.
cvs_info_url = f"http://{project}.cvs.sourceforge.net"
try:
response = self.page_request(cvs_info_url, params={})
except requests.HTTPError:
logger.warning(
"CVS info page could not be fetched, skipping project '%s'",
project,
)
continue
else:
bs = BeautifulSoup(response.text, features="html.parser")
cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
for text in [b.text for b in bs.find_all("b")]:
match = re.search(fr".*/cvsroot/{project} co -P (.+)", text)
if match is not None:
module = match.group(1)
url = f"{cvs_base_url}/{project}/{module}"
hits.append(
SourceForgeListerEntry(
vcs=VcsNames(tool_name),
url=url,
last_modified=last_modified,
)
)
continue
url = CLONE_URL_FORMAT.format( url = CLONE_URL_FORMAT.format(
vcs=tool_name, vcs=tool_name,
namespace=namespace, namespace=namespace,
......
<html><head>
<meta name="generator" content="cvs-info" />
<meta name="description" content="The world's largest development and download repository of Open Source code and applications" />
<meta name="keywords" content="Open Source, Development, Developers, Projects, Downloads, OSTG, VA Software, SF.net, SourceForge" />
<title>CVS Info for project aaron</title>
<link rel="shortcut icon" href="https://sourceforge.net/favicon.ico" />
</head>
<body>
<p> The aaron project's CVS data is in read-only mode, so the project may have switched over to another source-code-management system. To check, visit the <a href="https://sourceforge.net/projects/aaron">Project Summary Page for aaron</a> and see if the menubar lists a newer code repository, such as SVN or Git.
<p>The CVS data can be accessed as follows.
You can run a per-module CVS checkout via pserver protocol:
<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P aaron</b></li>
<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P www</b></li>
<p>You can view a list of files or copy all the CVS repository data via rsync (the 1st command lists the files, the 2nd copies):
<li><b>rsync -a a.cvs.sourceforge.net::cvsroot/aaron/</b></li>
<li><b>rsync -ai a.cvs.sourceforge.net::cvsroot/aaron/ /my/local/dest/dir/</b></li>
<p>If you are a project admin for aaron, you can request that this page redirect to another repo on your project by submitting a <a href="https://sourceforge.net/support">support request</a>.
{
"shortname": "aaron",
"name": "Aaron: the app, service, and net monitor",
"_id": "5139010d5fcbc97960fd66bb",
"url": "https://sourceforge.net/p/aaron/",
"private": false,
"short_description": "Aaron is an application, service, and network availability monitoring and alert daemon. Notification of unavailable services, networks, etc., levels is sent to the appropriate roles. Aaron is highly customizable enterprise class monitoring software.",
"creation_date": "2001-06-24",
"summary": "",
"external_homepage": "http://aaron.sourceforge.net",
"video_url": "",
"socialnetworks": [],
"status": "active",
"moved_to_url": "",
"preferred_support_tool": "",
"preferred_support_url": "",
"developers": [
{
"username": "kapelmeister",
"name": "Steve Nickels",
"url": "https://sourceforge.net/u/kapelmeister/"
},
{
"username": "thetitan",
"name": "Sean Chittenden",
"url": "https://sourceforge.net/u/thetitan/"
},
{
"username": "stwalker",
"name": "Scott Walker",
"url": "https://sourceforge.net/u/stwalker/"
}
],
"tools": [
{
"name": "support",
"mount_point": "support",
"url": "/p/aaron/support/",
"icons": {
"24": "images/sftheme/24x24/blog_24.png",
"32": "images/sftheme/32x32/blog_32.png",
"48": "images/sftheme/48x48/blog_48.png"
},
"installable": false,
"tool_label": "Support",
"mount_label": "Support"
},
{
"name": "mailman",
"mount_point": "mailman",
"url": "/p/aaron/mailman/",
"icons": {
"24": "images/forums_24.png",
"32": "images/forums_32.png",
"48": "images/forums_48.png"
},
"installable": false,
"tool_label": "Mailing Lists",
"mount_label": "Mailing Lists"
},
{
"name": "reviews",
"mount_point": "reviews",
"url": "/p/aaron/reviews/",
"icons": {
"24": "images/sftheme/24x24/blog_24.png",
"32": "images/sftheme/32x32/blog_32.png",
"48": "images/sftheme/48x48/blog_48.png"
},
"installable": false,
"tool_label": "Reviews",
"mount_label": "Reviews"
},
{
"name": "wiki",
"mount_point": "wiki",
"url": "/p/aaron/wiki/",
"icons": {
"24": "images/wiki_24.png",
"32": "images/wiki_32.png",
"48": "images/wiki_48.png"
},
"installable": true,
"tool_label": "Wiki",
"mount_label": "Wiki"
},
{
"name": "summary",
"mount_point": "summary",
"url": "/p/aaron/summary/",
"icons": {
"24": "images/sftheme/24x24/blog_24.png",
"32": "images/sftheme/32x32/blog_32.png",
"48": "images/sftheme/48x48/blog_48.png"
},
"installable": false,
"tool_label": "Summary",
"mount_label": "Summary",
"sourceforge_group_id": 29993
},
{
"name": "files-sf",
"mount_point": "files",
"url": "/p/aaron/files/",
"icons": {
"24": "images/downloads_24.png",
"32": "images/downloads_32.png",
"48": "images/downloads_48.png"
},
"installable": false,
"tool_label": "Files",
"mount_label": "Files"
},
{
"name": "cvs",
"mount_point": "code",
"url": "/p/aaron/code/",
"icons": {
"24": "images/code_24.png",
"32": "images/code_32.png",
"48": "images/code_48.png"
},
"installable": false,
"tool_label": "CVS",
"mount_label": "Code"
},
{
"name": "activity",
"mount_point": "activity",
"url": "/p/aaron/activity/",
"icons": {
"24": "images/admin_24.png",
"32": "images/admin_32.png",
"48": "images/admin_48.png"
},
"installable": false,
"tool_label": "Tool",
"mount_label": "Activity"
},
{
"name": "discussion",
"mount_point": "discussion",
"url": "/p/aaron/discussion/",
"icons": {
"24": "images/forums_24.png",
"32": "images/forums_32.png",
"48": "images/forums_48.png"
},
"installable": true,
"tool_label": "Discussion",
"mount_label": "Discussion"
}
],
"labels": [],
"categories": {
"audience": [
{
"id": 4,
"shortname": "sysadmins",
"fullname": "System Administrators",
"fullpath": "Intended Audience :: by End-User Class :: System Administrators"
}
],
"developmentstatus": [
{
"id": 8,
"shortname": "prealpha",
"fullname": "2 - Pre-Alpha",
"fullpath": "Development Status :: 2 - Pre-Alpha"
},
{
"id": 7,
"shortname": "planning",
"fullname": "1 - Planning",
"fullpath": "Development Status :: 1 - Planning"
}
],
"environment": [
{
"id": 238,
"shortname": "daemon",
"fullname": "Non-interactive (Daemon)",
"fullpath": "User Interface :: Non-interactive (Daemon)"
}
],
"language": [
{
"id": 164,
"shortname": "c",
"fullname": "C",
"fullpath": "Programming Language :: C"
},
{
"id": 293,
"shortname": "ruby",
"fullname": "Ruby",
"fullpath": "Programming Language :: Ruby"
}
],
"license": [
{
"id": 296,
"shortname": "apache",
"fullname": "Apache Software License",
"fullpath": "License :: OSI-Approved Open Source :: Apache Software License"
}
],
"translation": [
{
"id": 275,
"shortname": "english",
"fullname": "English",
"fullpath": "Translations :: English"
}
],
"os": [
{
"id": 235,
"shortname": "independent",
"fullname": "OS Independent (Written in an interpreted language)",
"fullpath": "Operating System :: Grouping and Descriptive Categories :: OS Independent (Written in an interpreted language)"
}
],
"database": [],
"topic": [
{
"id": 152,
"shortname": "monitoring",
"fullname": "Monitoring",
"fullpath": "Topic :: System :: Networking :: Monitoring"
}
]
},
"icon_url": null,
"screenshots": []
}
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://sourceforge.net/projects/aaron/files/</loc>
<lastmod>2013-03-07</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/aaron/home/</loc>
<lastmod>2013-03-07</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/aaron/tickets/</loc>
<lastmod>2013-03-07</lastmod>
<changefreq>daily</changefreq>
</url>
<url> <url>
<loc>https://sourceforge.net/projects/os3dmodels/files/</loc> <loc>https://sourceforge.net/projects/os3dmodels/files/</loc>
<lastmod>2017-03-31</lastmod> <lastmod>2017-03-31</lastmod>
......
...@@ -26,6 +26,7 @@ from swh.lister.utils import WAIT_EXP_BASE ...@@ -26,6 +26,7 @@ from swh.lister.utils import WAIT_EXP_BASE
from swh.scheduler.model import ListedOrigin from swh.scheduler.model import ListedOrigin
TEST_PROJECTS = { TEST_PROJECTS = {
"aaron": "p",
"adobexmp": "adobe", "adobexmp": "adobe",
"backapps": "p", "backapps": "p",
"backapps/website": "p", "backapps/website": "p",
...@@ -62,6 +63,10 @@ def get_project_json(datadir, request, context): ...@@ -62,6 +63,10 @@ def get_project_json(datadir, request, context):
return json.loads(Path(datadir, f"{project}.json").read_text()) return json.loads(Path(datadir, f"{project}.json").read_text())
def get_cvs_info_page(datadir, project="aaron"):
    """Return the content of a saved CVS info page fixture.

    Args:
        datadir: directory holding the test data files.
        project: name of the project whose ``<project>.html`` page is read;
            defaults to ``"aaron"`` so existing single-argument calls keep
            returning the same file.

    Returns:
        The text of ``<datadir>/<project>.html``.

    Raises:
        FileNotFoundError: if no such fixture file exists.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale of the machine running the tests.
    return Path(datadir, f"{project}.html").read_text(encoding="utf-8")
def _check_request_headers(request): def _check_request_headers(request):
return request.headers.get("User-Agent") == USER_AGENT return request.headers.get("User-Agent") == USER_AGENT
...@@ -81,6 +86,8 @@ def _check_listed_origins(lister, swh_scheduler): ...@@ -81,6 +86,8 @@ def _check_listed_origins(lister, swh_scheduler):
"https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
"http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
"rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"),
"rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"),
} }
...@@ -114,6 +121,11 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): ...@@ -114,6 +121,11 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
json=functools.partial(get_project_json, datadir), json=functools.partial(get_project_json, datadir),
additional_matcher=_check_request_headers, additional_matcher=_check_request_headers,
) )
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"),
text=get_cvs_info_page(datadir),
additional_matcher=_check_request_headers,
)
stats = lister.run() stats = lister.run()
# - os3dmodels (2 repos), # - os3dmodels (2 repos),
...@@ -123,8 +135,8 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): ...@@ -123,8 +135,8 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
# - random-mercurial (1 repo). # - random-mercurial (1 repo).
# - bzr-repo (1 repo). # - bzr-repo (1 repo).
# adobe and backapps itself have no repos. # adobe and backapps itself have no repos.
assert stats.pages == 6 assert stats.pages == 7
assert stats.origins == 11 assert stats.origins == 13
expected_state = { expected_state = {
"subsitemap_last_modified": { "subsitemap_last_modified": {
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
...@@ -178,6 +190,12 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m ...@@ -178,6 +190,12 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
additional_matcher=_check_request_headers, additional_matcher=_check_request_headers,
) )
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"),
text=get_cvs_info_page(datadir),
additional_matcher=_check_request_headers,
)
faked_listed_origins = [ faked_listed_origins = [
# mramm: changed # mramm: changed
ListedOrigin( ListedOrigin(
...@@ -272,8 +290,8 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m ...@@ -272,8 +290,8 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
stats = lister.run() stats = lister.run()
# - mramm (3 repos), # changed # - mramm (3 repos), # changed
assert stats.pages == 1 assert stats.pages == 2
assert stats.origins == 3 assert stats.origins == 5
expected_state = { expected_state = {
"subsitemap_last_modified": { "subsitemap_last_modified": {
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
...@@ -322,6 +340,12 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir) ...@@ -322,6 +340,12 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
additional_matcher=_check_request_headers, additional_matcher=_check_request_headers,
) )
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"),
text=get_cvs_info_page(datadir),
additional_matcher=_check_request_headers,
)
stats = lister.run() stats = lister.run()
# - os3dmodels (2 repos), # - os3dmodels (2 repos),
# - mramm (3 repos), # - mramm (3 repos),
...@@ -330,23 +354,10 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir) ...@@ -330,23 +354,10 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
# - random-mercurial (1 repo). # - random-mercurial (1 repo).
# - bzr-repo (1 repo). # - bzr-repo (1 repo).
# adobe and backapps itself have no repos. # adobe and backapps itself have no repos.
assert stats.pages == 6 assert stats.pages == 7
assert stats.origins == 11 assert stats.origins == 13
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results _check_listed_origins(lister, swh_scheduler)
assert {o.url: o.visit_type for o in scheduler_origins} == {
"https://svn.code.sf.net/p/backapps/website/code": "svn",
"https://git.code.sf.net/p/os3dmodels/git": "git",
"https://svn.code.sf.net/p/os3dmodels/svn": "svn",
"https://git.code.sf.net/p/mramm/files": "git",
"https://git.code.sf.net/p/mramm/git": "git",
"https://svn.code.sf.net/p/mramm/svn": "svn",
"https://git.code.sf.net/p/mojunk/git": "git",
"https://git.code.sf.net/p/mojunk/git2": "git",
"https://svn.code.sf.net/p/mojunk/svn": "svn",
"http://hg.code.sf.net/p/random-mercurial/hg": "hg",
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr",
}
# Test `time.sleep` is called with exponential retries # Test `time.sleep` is called with exponential retries
assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1]) assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1])
...@@ -408,6 +419,11 @@ def test_sourceforge_lister_project_error( ...@@ -408,6 +419,11 @@ def test_sourceforge_lister_project_error(
re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code
) )
# Make request to CVS info page fail
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"), status_code=status_code
)
stats = lister.run() stats = lister.run()
# - os3dmodels (2 repos), # - os3dmodels (2 repos),
# - mojunk (3 repos), # - mojunk (3 repos),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment