Commit 41407e0e authored by Antoine Lambert

Use beautifulsoup4 CSS selectors to simplify code and type checking

As the types-beautifulsoup4 package gets installed in the swh
virtualenv (it is a swh-scanner test dependency), mypy reported errors
related to beautifulsoup4 typing.

As the return type of the bs4 find method is the union
Tag | NavigableString | None, isinstance calls must be used to narrow
the type before use, which is not great.
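
For instance (an illustrative sketch, not code from this changeset):

    from bs4 import BeautifulSoup, Tag

    soup = BeautifulSoup("<a href='/repo'>repo</a>", "html.parser")

    # find() returns Tag | NavigableString | None, so mypy requires
    # isinstance narrowing before any Tag attribute can be accessed:
    link = soup.find("a")
    if isinstance(link, Tag):
        print(link.attrs["href"])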

So prefer the select_one method instead, which returns Optional[Tag]:
a simple None check is then enough to ensure correct typing. In a
similar manner, replace uses of the find_all method with the select
method.
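
With a CSS selector, the same lookup only needs a None check (same
illustrative snippet as above; select() similarly returns a list of
Tag elements):

    # select_one() returns Optional[Tag], so a plain None check
    # is enough to narrow the type:
    link = soup.select_one("a[href]")
    if link is not None:
        print(link.attrs["href"])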

This also has the advantage of simplifying the code.
parent e6a35c55
1 merge request: !524 Use beautifulsoup4 CSS selectors to simplify code and type checking
Pipeline #8352 passed
@@ -3,6 +3,7 @@ pytest >= 8.1
pytest-mock
requests_mock
swh-scheduler[testing]
types-beautifulsoup4
types-click
types-pyyaml
types-requests
......
# Copyright (C) 2022-2023 The Software Heritage developers
# Copyright (C) 2022-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
            )
            response = self.http_request(url)
            soup = BeautifulSoup(response.text, "html.parser")
            links = soup.find_all("a", href=True)
            links = soup.select("a[href]")
            # drop the first line (used to go to up directory)
            if links[0].attrs["href"] == "../":
            if links and links[0].attrs["href"] == "../":
                links.pop(0)
            versions = []
@@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
                arch = m.group("arch")
                version = m.group("version")
                # Extract last_modified and an approximate file size
                # Extract last_modified date
                last_modified = None
                raw_text = link.next_sibling
                raw_text_rex = re.compile(
                    r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                )
                s = raw_text_rex.search(raw_text.strip())
                if s is None:
                    logger.error(
                        "Can not find a match for 'last_modified' in '%(raw_text)s'",
                        dict(raw_text=raw_text),
                if raw_text:
                    raw_text_rex = re.compile(
                        r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                    )
                else:
                    values = s.groups()
                    assert values and len(values) == 1
                    last_modified_str = values[0]
                    # format as expected
                    last_modified = datetime.datetime.strptime(
                        last_modified_str, "%d-%b-%Y %H:%M"
                    ).isoformat()
                    s = raw_text_rex.search(raw_text.text.strip())
                    if s is None:
                        logger.error(
                            "Can not find a match for 'last_modified' in '%(raw_text)s'",
                            dict(raw_text=raw_text),
                        )
                    else:
                        values = s.groups()
                        assert values and len(values) == 1
                        last_modified_str = values[0]
                        # format as expected
                        last_modified = datetime.datetime.strptime(
                            last_modified_str, "%d-%b-%Y %H:%M"
                        ).isoformat()
                # link url is relative, format a canonical one
                url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
......
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2023-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -146,10 +146,11 @@ class BioconductorLister(Lister[BioconductorListerState, BioconductorListerPage]
            f"{self.BIOCONDUCTOR_HOMEPAGE}/about/release-announcements"
        ).text
        bs = BeautifulSoup(html, "html.parser")
        return [
            tr.find_all("td")[0].text
            for tr in reversed(bs.find("table").find("tbody").find_all("tr"))
            if tr.find_all("td")[2].find("a")
            tr.select("td")[0].text
            for tr in reversed(bs.select("table tbody tr"))
            if tr.select("td")[2].select("a")
        ]

    def parse_packages(self, text: str) -> Dict[str, Any]:
......
# Copyright (C) 2019-2023 The Software Heritage developers
# Copyright (C) 2019-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
@@ -95,14 +94,16 @@ class CGitLister(StatelessLister[Repositories]):
        page_results = []
        for tr in bs_idx.find("div", {"class": "content"}).find_all(
            "tr", {"class": ""}
        ):
            repository_link = tr.find("a")["href"]
        for tr in bs_idx.select("div.content tr:not([class])"):
            repository_link = tr.select_one("a")
            if repository_link is None:
                continue
            repo_url = None
            git_url = None
            base_url = urljoin(self.url, repository_link).strip("/")
            base_url = urljoin(self.url, repository_link.attrs["href"]).strip("/")
            if self.base_git_url:  # mapping provided
                # computing git url
                git_url = base_url.replace(self.url, self.base_git_url)
@@ -111,7 +112,7 @@ class CGitLister(StatelessLister[Repositories]):
            # the git url (cf. self.get_origins_from_page)
            repo_url = base_url
            span = tr.find("span", {"class": re.compile("age-")})
            span = tr.select_one('span[class^="age-"]')
            last_updated_date = span.get("title") if span else None
            page_results.append(
@@ -125,12 +126,14 @@ class CGitLister(StatelessLister[Repositories]):
        yield page_results
        try:
            pager = bs_idx.find("ul", {"class": "pager"})
            current_page = pager.find("a", {"class": "current"})
            if current_page:
                next_page = current_page.parent.next_sibling.a["href"]
                next_page = urljoin(self.url, next_page)
            next_page_li = bs_idx.select_one(
                "ul.pager li:has(> a.current) + li:has(> a)"
            )
            next_page = (
                urljoin(self.url, next_page_li.select("a")[0].attrs["href"])
                if next_page_li
                else None
            )
        except (AttributeError, KeyError):
            # no pager, or no next page
            next_page = None
@@ -169,27 +172,26 @@ class CGitLister(StatelessLister[Repositories]):
            return None
        # check if we are on the summary tab, if not, go to this tab
        tab = bs.find("table", {"class": "tabs"})
        if tab:
            summary_a = tab.find("a", string="summary")
            if summary_a:
                summary_url = urljoin(repository_url, summary_a["href"]).strip("/")
                if summary_url != repository_url:
                    logger.debug(
                        "%s : Active tab is not the summary, trying to load the summary page",
                        repository_url,
                    )
                    return self._get_origin_from_repository_url(summary_url)
            else:
                logger.debug("No summary tab found on %s", repository_url)
        summary_a = bs.select_one('table.tabs a:-soup-contains("summary")')
        if summary_a:
            summary_path = summary_a.attrs["href"]
            summary_url = urljoin(repository_url, summary_path).strip("/")
            if summary_url != repository_url:
                logger.debug(
                    "%s : Active tab is not the summary, trying to load the summary page",
                    repository_url,
                )
                return self._get_origin_from_repository_url(summary_url)
        else:
            logger.debug("No summary tab found on %s", repository_url)
        # origin urls are listed on the repository page
        # TODO check if forcing https is better or not ?
        # <link rel='vcs-git' href='git://...' title='...'/>
        # <link rel='vcs-git' href='http://...' title='...'/>
        # <link rel='vcs-git' href='https://...' title='...'/>
        urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
        urls = [x.attrs["href"] for x in bs.select('a[rel="vcs-git"]')]
        if not urls:
            logger.debug("No git urls found on %s", repository_url)
......
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2023-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import parse_qs, urljoin, urlparse
@@ -80,14 +79,12 @@ class GitwebLister(StatelessLister[Repositories]):
        page_results = []
        for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
            "tr"
        ):
            link = tr.find("a")
        for tr in bs_idx.select("table.project_list tr"):
            link = tr.select_one("a")
            if not link:
                continue
            repo_url = urljoin(self.url, link["href"]).strip("/")
            repo_url = urljoin(self.url, link.attrs["href"]).strip("/")
            # Skip this description page which is listed but won't yield any origins to list
            if repo_url.endswith("?o=descr"):
@@ -95,7 +92,7 @@ class GitwebLister(StatelessLister[Repositories]):
            # This retrieves the date interval in natural language (e.g. '9 years ago')
            # to actual python datetime interval so we can derive last update
            span = tr.find("td", {"class": re.compile("age.*")})
            span = tr.select_one('td[class^="age"]')
            page_results.append(
                {"url": repo_url, "last_update_interval": span.text if span else None}
            )
@@ -134,8 +131,8 @@ class GitwebLister(StatelessLister[Repositories]):
            return None
        urls = []
        for row in bs.find_all("tr", {"class": "metadata_url"}):
            url = row.contents[-1].string.strip()
        for row in bs.select("tr.metadata_url"):
            url = row.select("td")[-1].text.strip()
            for scheme in ("http", "https", "git"):
                # remove any string prefix before origin
                pos = url.find(f"{scheme}://")
......
# Copyright (C) 2021-2022 The Software Heritage developers
# Copyright (C) 2021-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -226,32 +226,22 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
        logger.info("Found %s poms.", len(out_pom))
        # Now fetch pom files and scan them for scm info.
        logger.info("Fetching poms..")
        logger.info("Fetching poms ...")
        for pom_url in out_pom:
            try:
                response = self.http_request(pom_url)
                parsed_pom = BeautifulSoup(response.content, "xml")
                project = parsed_pom.find("project")
                if project is None:
                    continue
                scm = project.find("scm")
                if scm is not None:
                    connection = scm.find("connection")
                    if connection is not None:
                        artifact_metadata_d = {
                            "type": "scm",
                            "doc": out_pom[pom_url],
                            "url": connection.text,
                        }
                        logger.debug(
                            "* Yielding pom %s: %s", pom_url, artifact_metadata_d
                        )
                        yield artifact_metadata_d
                    else:
                        logger.debug("No scm.connection in pom %s", pom_url)
                connection = parsed_pom.select_one("project scm connection")
                if connection is not None:
                    artifact_metadata_d = {
                        "type": "scm",
                        "doc": out_pom[pom_url],
                        "url": connection.text,
                    }
                    logger.debug("* Yielding pom %s: %s", pom_url, artifact_metadata_d)
                    yield artifact_metadata_d
                else:
                    logger.debug("No scm in pom %s", pom_url)
                    logger.debug("No project.scm.connection in pom %s", pom_url)
            except requests.HTTPError:
                logger.warning(
                    "POM info page could not be fetched, skipping project '%s'",
......
@@ -146,7 +146,7 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]):
                )
                continue
            xml = BeautifulSoup(res_metadata.content, "xml")
            repo = xml.find("repository")
            repo = xml.select_one("repository")
            if repo and "url" in repo.attrs and "type" in repo.attrs:
                vcs_url = repo.attrs["url"]
                vcs_type = repo.attrs["type"]
......
# Copyright (C) 2022-2023 The Software Heritage developers
# Copyright (C) 2022-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -82,8 +82,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
    def get_latest_dump_file(self) -> str:
        response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
        xml = BeautifulSoup(response.content, "xml")
        contents = xml.find_all("Contents")
        return contents[-1].find("Key").text
        contents = xml.select("Contents")
        return contents[-1].select("Key")[0].text

    def create_rubygems_db(
        self, postgresql: Postgresql
......
# Copyright (C) 2021-2023 The Software Heritage developers
# Copyright (C) 2021-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -367,7 +367,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
        else:
            bs = BeautifulSoup(response.text, features="html.parser")
            cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
            for text in [b.text for b in bs.find_all("b")]:
            for text in [b.text for b in bs.select("b")]:
                match = re.search(rf".*/cvsroot/{project} co -P (.+)", text)
                if match is not None:
                    module = match.group(1)
......
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2023-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse
@@ -73,15 +72,19 @@ class StagitLister(StatelessLister[Repositories]):
        page_results = []
        for tr in bs_idx.find("table", {"id": re.compile("index")}).find_all("tr"):
            link = tr.find("a")
        index_table = bs_idx.select_one("table#index")
        if index_table is None:
            return
        for tr in index_table.select("tr"):
            link = tr.select_one("a")
            if not link:
                continue
            repo_description_url = self.url + "/" + link["href"]
            repo_description_url = self.url + "/" + link.attrs["href"]
            # This retrieves the date in format "%Y-%m-%d %H:%M"
            tds = tr.find_all("td")
            tds = tr.select("td")
            last_update = tds[-1].text if tds and tds[-1] else None
            page_results.append(
@@ -122,10 +125,14 @@ class StagitLister(StatelessLister[Repositories]):
            return None
        urls = [
            td.find("a")["href"]
            for row in bs.find_all("tr", {"class": "url"})
            for td in row.find_all("td")
            if td.text.startswith("git clone")
            a.attrs["href"]
            for a in [
                td.select_one("a")
                for row in bs.select("tr.url")
                for td in row.select("td")
                if td.text.startswith("git clone")
            ]
            if a is not None
        ]
        if not urls:
......