From 37484125b53a9551b7ce3706160d263d9beb9d33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Gom=C3=A8s?= <rgomes@octobus.net> Date: Wed, 17 Mar 2021 17:39:41 +0100 Subject: [PATCH] WIP Add a limited (and untested) sourceforge lister --- swh/lister/sourceforge/__init__.py | 19 +++ swh/lister/sourceforge/lister.py | 211 +++++++++++++++++++++++++++++ 2 files changed, 230 insertions(+) create mode 100644 swh/lister/sourceforge/__init__.py create mode 100644 swh/lister/sourceforge/lister.py diff --git a/swh/lister/sourceforge/__init__.py b/swh/lister/sourceforge/__init__.py new file mode 100644 index 00000000..fb94037a --- /dev/null +++ b/swh/lister/sourceforge/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2021 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import SourceForgeLister + + return { + "lister": SourceForgeLister, + "task_modules": ["%s.tasks" % __name__], + "task_types": { + "list-sourceforge-full": { + "default_interval": "7 days", + "min_interval": "7 days", + "max_interval": "7 days", + }, + }, + } diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py new file mode 100644 index 00000000..9e12bd4f --- /dev/null +++ b/swh/lister/sourceforge/lister.py @@ -0,0 +1,211 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import dataclass +from enum import Enum +import logging +import re +from typing import Iterator, List, Set +from xml.etree import ElementTree + +import requests +from tenacity.before_sleep import before_sleep_log + +from swh.lister.utils import throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. import USER_AGENT +from ..pattern import CredentialsType, StatelessLister + +logger = logging.getLogger(__name__) + + +class VcsNames(Enum): + """Used to filter SourceForge tool names for valid VCS types""" + + CVS = "cvs" + GIT = "git" + SUBVERSION = "svn" + MERCURIAL = "hg" + BAZAAR = "bzr" + + +VCS_NAMES = set(VcsNames.__members__.values()) + + +@dataclass +class SourceForgeListerEntry: + vcs: VcsNames + url: str + + +SourceForgeListerPage = List[SourceForgeListerEntry] + +MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" + +# `namespace`: Project namespace. Very often `p`, but can be something else like +# `adobe` +# `project`: Project name, e.g. `seedai` +PROJECT_REST_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}" + +# `vcs`: VCS type, one of `VCS_NAMES` +# `namespace`: Project namespace. Very often `p`, but can be something else like +# `adobe` +# `project`: Project name, e.g. `seedai` +# `mount_point`: url path used by the repo. For example, the Code::Blocks project +# has `git` (https://git.code.sf.net/p/codeblocks/git) +CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" + +PROJ_URL_RE = re.compile( + r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)" +) + + +class SourceForgeLister(StatelessLister[SourceForgeListerPage]): + """List origins from the "SourceForge" forge. + + """ + + # Part of the lister API, that identifies this lister + LISTER_NAME = "sourceforge" + + def __init__( + self, + # Required + scheduler: SchedulerInterface, + # Instance URL, required for multi-instances listers (e.g gitlab, ...) + url: str, + # Instance name (free form) required for multi-instance listers, + # or computed from `url` + instance: str, + # Required whether lister supports authentication or not + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, credentials=credentials, url=url, instance=instance, + ) + + self.session = requests.Session() + # Declare the USER_AGENT is more sysadm-friendly for the forge we list + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) + + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def page_request(self, url, params) -> requests.Response: + # Log listed URL to ease debugging + logger.debug("Fetching URL %s with params %s", url, params) + response = self.session.get(url, params=params) + + if response.status_code != 200: + # Log response content to ease debugging + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + # The lister must fail on blocking errors + response.raise_for_status() + + return response + + def get_pages(self) -> Iterator[SourceForgeListerPage]: + """ + SourceForge has a main XML sitemap that lists its sharded sitemaps for all + projects. + Each XML sub-sitemap lists project pages, which are not unique per project: a + project can have a wiki, a home, a git, an svn, etc. + For each unique project page, we query a REST endpoint that lists (among + other things) the tools associated with said project, some of which are + the VCS used. + Lastly we use the information of which VCS are used to build the predictable + clone URL for any given VCS. + """ + sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text + tree = ElementTree.fromstring(sitemap_contents) + + # FIXME build an iterator to hide ugly XML manipulation + for subsitemap in tree.iter("{*}sitemap"): + # TODO use when adding incremental support + # last_modified = sub_sitemap.find("{*}lastmod") + location = subsitemap.find("{*}loc") + assert location is not None + sub_url = location.text + subsitemap_contents = self.page_request(sub_url, {}).text + subtree = ElementTree.fromstring(subsitemap_contents) + + yield from self._get_pages_from_subsitemap(subtree) + + def get_origins_from_page( + self, page: SourceForgeListerPage + ) -> Iterator[ListedOrigin]: + assert self.lister_obj.id is not None + + for hit in page: + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=hit.vcs.value, + url=hit.url, + last_update=None, + ) + + def _get_pages_from_subsitemap(self, subtree: ElementTree.Element): + projects: Set[str] = set() + for project_block in subtree.iter("{*}url"): + # TODO use when adding incremental support + # last_modified = project_block.find("{*}lastmod") + location = project_block.find("{*}loc") + assert location is not None + project_url = location.text + assert project_url is not None + + match = PROJ_URL_RE.match(project_url) + if match: + namespace = match.groupdict()["namespace"] + if namespace == "projects": + # These have a `p`-namespaced counterpart, use that instead + continue + + project = match.groupdict()["project"] + prev_len = len(projects) + projects.add(project) + + if prev_len == len(project): + # Already seen + continue + + yield self._get_pages_for_project(namespace, project) + else: + # Should always match, let's log it + msg = "Project URL '%s' does not match expected pattern" + logger.warning(msg, project_url) + + def _get_pages_for_project(self, namespace, project) -> SourceForgeListerPage: + endpoint = PROJECT_REST_URL_FORMAT.format(namespace=namespace, project=project) + res = self.page_request(endpoint, {}).json() + + tools = res.get("tools") + if tools is None: + # This probably never happens + logger.warning("Project '%s' does not have any tools", endpoint) + return [] + + hits = [] + for tool in tools: + tool_name = tool["name"] + if tool_name not in VCS_NAMES: + continue + url = CLONE_URL_FORMAT.format( + vcs=tool_name, + namespace=namespace, + project=project, + mount_point=tool["mount_point"], + ) + entry = SourceForgeListerEntry(vcs=VcsNames[tool_name], url=url) + hits.append(entry) + + return hits -- GitLab