From 41407e0eff50969435daf4e08573dd578543c1b7 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Mon, 15 Apr 2024 16:58:46 +0200
Subject: [PATCH] Use beautifulsoup4 CSS selectors to simplify code and type
 checking

As the types-beautifulsoup4 package gets installed in the swh virtualenv
as it is a swh-scanner test dependency, some mypy errors were reported
related to beautifulsoup4 typing.

As the return type of the bs4 find method is the union
Tag | NavigableString | None, isinstance calls must be used to narrow
types properly, which is not great.

So prefer the select_one method instead, which returns Optional[Tag]:
a simple None check is then enough to ensure typing is correct.
In a similar manner, replace uses of the find_all method with the select method.

It also has the advantage of simplifying the code.
---
 requirements-test.txt             |  1 +
 swh/lister/arch/lister.py         | 44 ++++++++++++-----------
 swh/lister/bioconductor/lister.py |  9 ++---
 swh/lister/cgit/lister.py         | 60 ++++++++++++++++---------------
 swh/lister/gitweb/lister.py       | 17 ++++-----
 swh/lister/maven/lister.py        | 34 +++++++-----------
 swh/lister/nuget/lister.py        |  2 +-
 swh/lister/rubygems/lister.py     |  6 ++--
 swh/lister/sourceforge/lister.py  |  4 +--
 swh/lister/stagit/lister.py       | 27 ++++++++------
 10 files changed, 102 insertions(+), 102 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 442fd85e..5bb91bd9 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,6 +3,7 @@ pytest >= 8.1
 pytest-mock
 requests_mock
 swh-scheduler[testing]
+types-beautifulsoup4
 types-click
 types-pyyaml
 types-requests
diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py
index 66b9ce86..6537cc44 100644
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023  The Software Heritage developers
+# Copyright (C) 2022-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
         )
         response = self.http_request(url)
         soup = BeautifulSoup(response.text, "html.parser")
-        links = soup.find_all("a", href=True)
+        links = soup.select("a[href]")
 
         # drop the first line (used to go to up directory)
-        if links[0].attrs["href"] == "../":
+        if links and links[0].attrs["href"] == "../":
             links.pop(0)
 
         versions = []
@@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
                     arch = m.group("arch")
                     version = m.group("version")
 
-                # Extract last_modified and an approximate file size
+                # Extract last_modified date
+                last_modified = None
                 raw_text = link.next_sibling
-                raw_text_rex = re.compile(
-                    r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
-                )
-                s = raw_text_rex.search(raw_text.strip())
-                if s is None:
-                    logger.error(
-                        "Can not find a match for 'last_modified' in '%(raw_text)s'",
-                        dict(raw_text=raw_text),
+                if raw_text:
+                    raw_text_rex = re.compile(
+                        r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                     )
-                else:
-                    values = s.groups()
-                    assert values and len(values) == 1
-                    last_modified_str = values[0]
-
-                # format as expected
-                last_modified = datetime.datetime.strptime(
-                    last_modified_str, "%d-%b-%Y %H:%M"
-                ).isoformat()
+                    s = raw_text_rex.search(raw_text.text.strip())
+                    if s is None:
+                        logger.error(
+                            "Can not find a match for 'last_modified' in '%(raw_text)s'",
+                            dict(raw_text=raw_text),
+                        )
+                    else:
+                        values = s.groups()
+                        assert values and len(values) == 1
+                        last_modified_str = values[0]
+
+                        # format as expected
+                        last_modified = datetime.datetime.strptime(
+                            last_modified_str, "%d-%b-%Y %H:%M"
+                        ).isoformat()
 
                 # link url is relative, format a canonical one
                 url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
diff --git a/swh/lister/bioconductor/lister.py b/swh/lister/bioconductor/lister.py
index e050f76c..3c505569 100644
--- a/swh/lister/bioconductor/lister.py
+++ b/swh/lister/bioconductor/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023  The Software Heritage developers
+# Copyright (C) 2023-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -146,10 +146,11 @@ class BioconductorLister(Lister[BioconductorListerState, BioconductorListerPage]
             f"{self.BIOCONDUCTOR_HOMEPAGE}/about/release-announcements"
         ).text
         bs = BeautifulSoup(html, "html.parser")
+
         return [
-            tr.find_all("td")[0].text
-            for tr in reversed(bs.find("table").find("tbody").find_all("tr"))
-            if tr.find_all("td")[2].find("a")
+            tr.select("td")[0].text
+            for tr in reversed(bs.select("table tbody tr"))
+            if tr.select("td")[2].select("a")
         ]
 
     def parse_packages(self, text: str) -> Dict[str, Any]:
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
index af3d7036..c04b38fa 100644
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -1,10 +1,9 @@
-# Copyright (C) 2019-2023 The Software Heritage developers
+# Copyright (C) 2019-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin, urlparse
 
@@ -95,14 +94,16 @@ class CGitLister(StatelessLister[Repositories]):
 
             page_results = []
 
-            for tr in bs_idx.find("div", {"class": "content"}).find_all(
-                "tr", {"class": ""}
-            ):
-                repository_link = tr.find("a")["href"]
+            for tr in bs_idx.select("div.content tr:not([class])"):
+                repository_link = tr.select_one("a")
+
+                if repository_link is None:
+                    continue
+
                 repo_url = None
                 git_url = None
 
-                base_url = urljoin(self.url, repository_link).strip("/")
+                base_url = urljoin(self.url, repository_link.attrs["href"]).strip("/")
                 if self.base_git_url:  # mapping provided
                     # computing git url
                     git_url = base_url.replace(self.url, self.base_git_url)
@@ -111,7 +112,7 @@ class CGitLister(StatelessLister[Repositories]):
                     # the git url (cf. self.get_origins_from_page)
                     repo_url = base_url
 
-                span = tr.find("span", {"class": re.compile("age-")})
+                span = tr.select_one('span[class^="age-"]')
                 last_updated_date = span.get("title") if span else None
 
                 page_results.append(
@@ -125,12 +126,14 @@ class CGitLister(StatelessLister[Repositories]):
             yield page_results
 
             try:
-                pager = bs_idx.find("ul", {"class": "pager"})
-
-                current_page = pager.find("a", {"class": "current"})
-                if current_page:
-                    next_page = current_page.parent.next_sibling.a["href"]
-                    next_page = urljoin(self.url, next_page)
+                next_page_li = bs_idx.select_one(
+                    "ul.pager li:has(> a.current) + li:has(> a)"
+                )
+                next_page = (
+                    urljoin(self.url, next_page_li.select("a")[0].attrs["href"])
+                    if next_page_li
+                    else None
+                )
             except (AttributeError, KeyError):
                 # no pager, or no next page
                 next_page = None
@@ -169,27 +172,26 @@ class CGitLister(StatelessLister[Repositories]):
             return None
 
         # check if we are on the summary tab, if not, go to this tab
-        tab = bs.find("table", {"class": "tabs"})
-        if tab:
-            summary_a = tab.find("a", string="summary")
-            if summary_a:
-                summary_url = urljoin(repository_url, summary_a["href"]).strip("/")
-
-                if summary_url != repository_url:
-                    logger.debug(
-                        "%s : Active tab is not the summary, trying to load the summary page",
-                        repository_url,
-                    )
-                    return self._get_origin_from_repository_url(summary_url)
-            else:
-                logger.debug("No summary tab found on %s", repository_url)
+        summary_a = bs.select_one('table.tabs a:-soup-contains("summary")')
+        if summary_a:
+            summary_path = summary_a.attrs["href"]
+            summary_url = urljoin(repository_url, summary_path).strip("/")
+
+            if summary_url != repository_url:
+                logger.debug(
+                    "%s : Active tab is not the summary, trying to load the summary page",
+                    repository_url,
+                )
+                return self._get_origin_from_repository_url(summary_url)
+        else:
+            logger.debug("No summary tab found on %s", repository_url)
 
         # origin urls are listed on the repository page
         # TODO check if forcing https is better or not ?
         # <link rel='vcs-git' href='git://...' title='...'/>
         # <link rel='vcs-git' href='http://...' title='...'/>
         # <link rel='vcs-git' href='https://...' title='...'/>
-        urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
+        urls = [x.attrs["href"] for x in bs.select('a[rel="vcs-git"]')]
 
         if not urls:
             logger.debug("No git urls found on %s", repository_url)
diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py
index 26521141..7ac71b08 100644
--- a/swh/lister/gitweb/lister.py
+++ b/swh/lister/gitweb/lister.py
@@ -1,10 +1,9 @@
-# Copyright (C) 2023 The Software Heritage developers
+# Copyright (C) 2023-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import parse_qs, urljoin, urlparse
 
@@ -80,14 +79,12 @@ class GitwebLister(StatelessLister[Repositories]):
 
         page_results = []
 
-        for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
-            "tr"
-        ):
-            link = tr.find("a")
+        for tr in bs_idx.select("table.project_list tr"):
+            link = tr.select_one("a")
             if not link:
                 continue
 
-            repo_url = urljoin(self.url, link["href"]).strip("/")
+            repo_url = urljoin(self.url, link.attrs["href"]).strip("/")
 
             # Skip this description page which is listed but won't yield any origins to list
             if repo_url.endswith("?o=descr"):
@@ -95,7 +92,7 @@ class GitwebLister(StatelessLister[Repositories]):
 
             # This retrieves the date interval in natural language (e.g. '9 years ago')
             # to actual python datetime interval so we can derive last update
-            span = tr.find("td", {"class": re.compile("age.*")})
+            span = tr.select_one('td[class^="age"]')
             page_results.append(
                 {"url": repo_url, "last_update_interval": span.text if span else None}
             )
@@ -134,8 +131,8 @@ class GitwebLister(StatelessLister[Repositories]):
             return None
 
         urls = []
-        for row in bs.find_all("tr", {"class": "metadata_url"}):
-            url = row.contents[-1].string.strip()
+        for row in bs.select("tr.metadata_url"):
+            url = row.select("td")[-1].text.strip()
             for scheme in ("http", "https", "git"):
                 # remove any string prefix before origin
                 pos = url.find(f"{scheme}://")
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
index 56fafe16..0441552c 100644
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2022 The Software Heritage developers
+# Copyright (C) 2021-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -226,32 +226,22 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
         logger.info("Found %s poms.", len(out_pom))
 
         # Now fetch pom files and scan them for scm info.
-
-        logger.info("Fetching poms..")
+        logger.info("Fetching poms ...")
         for pom_url in out_pom:
             try:
                 response = self.http_request(pom_url)
                 parsed_pom = BeautifulSoup(response.content, "xml")
-                project = parsed_pom.find("project")
-                if project is None:
-                    continue
-                scm = project.find("scm")
-                if scm is not None:
-                    connection = scm.find("connection")
-                    if connection is not None:
-                        artifact_metadata_d = {
-                            "type": "scm",
-                            "doc": out_pom[pom_url],
-                            "url": connection.text,
-                        }
-                        logger.debug(
-                            "* Yielding pom %s: %s", pom_url, artifact_metadata_d
-                        )
-                        yield artifact_metadata_d
-                    else:
-                        logger.debug("No scm.connection in pom %s", pom_url)
+                connection = parsed_pom.select_one("project scm connection")
+                if connection is not None:
+                    artifact_metadata_d = {
+                        "type": "scm",
+                        "doc": out_pom[pom_url],
+                        "url": connection.text,
+                    }
+                    logger.debug("* Yielding pom %s: %s", pom_url, artifact_metadata_d)
+                    yield artifact_metadata_d
                 else:
-                    logger.debug("No scm in pom %s", pom_url)
+                    logger.debug("No project.scm.connection in pom %s", pom_url)
             except requests.HTTPError:
                 logger.warning(
                     "POM info page could not be fetched, skipping project '%s'",
diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py
index 7604e078..0c4997ed 100644
--- a/swh/lister/nuget/lister.py
+++ b/swh/lister/nuget/lister.py
@@ -146,7 +146,7 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]):
                 )
                 continue
             xml = BeautifulSoup(res_metadata.content, "xml")
-            repo = xml.find("repository")
+            repo = xml.select_one("repository")
             if repo and "url" in repo.attrs and "type" in repo.attrs:
                 vcs_url = repo.attrs["url"]
                 vcs_type = repo.attrs["type"]
diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py
index 4e59b901..400a992c 100644
--- a/swh/lister/rubygems/lister.py
+++ b/swh/lister/rubygems/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023  The Software Heritage developers
+# Copyright (C) 2022-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -82,8 +82,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
     def get_latest_dump_file(self) -> str:
         response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
         xml = BeautifulSoup(response.content, "xml")
-        contents = xml.find_all("Contents")
-        return contents[-1].find("Key").text
+        contents = xml.select("Contents")
+        return contents[-1].select("Key")[0].text
 
     def create_rubygems_db(
         self, postgresql: Postgresql
diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
index 518a7ece..3d19894c 100644
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023  The Software Heritage developers
+# Copyright (C) 2021-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -367,7 +367,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
                 else:
                     bs = BeautifulSoup(response.text, features="html.parser")
                     cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
-                    for text in [b.text for b in bs.find_all("b")]:
+                    for text in [b.text for b in bs.select("b")]:
                         match = re.search(rf".*/cvsroot/{project} co -P (.+)", text)
                         if match is not None:
                             module = match.group(1)
diff --git a/swh/lister/stagit/lister.py b/swh/lister/stagit/lister.py
index 7a2187f7..bc82499a 100644
--- a/swh/lister/stagit/lister.py
+++ b/swh/lister/stagit/lister.py
@@ -1,10 +1,9 @@
-# Copyright (C) 2023 The Software Heritage developers
+# Copyright (C) 2023-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urlparse
 
@@ -73,15 +72,19 @@ class StagitLister(StatelessLister[Repositories]):
 
         page_results = []
 
-        for tr in bs_idx.find("table", {"id": re.compile("index")}).find_all("tr"):
-            link = tr.find("a")
+        index_table = bs_idx.select_one("table#index")
+        if index_table is None:
+            return
+
+        for tr in index_table.select("tr"):
+            link = tr.select_one("a")
             if not link:
                 continue
 
-            repo_description_url = self.url + "/" + link["href"]
+            repo_description_url = self.url + "/" + link.attrs["href"]
 
             # This retrieves the date in format "%Y-%m-%d %H:%M"
-            tds = tr.find_all("td")
+            tds = tr.select("td")
             last_update = tds[-1].text if tds and tds[-1] else None
 
             page_results.append(
@@ -122,10 +125,14 @@ class StagitLister(StatelessLister[Repositories]):
             return None
 
         urls = [
-            td.find("a")["href"]
-            for row in bs.find_all("tr", {"class": "url"})
-            for td in row.find_all("td")
-            if td.text.startswith("git clone")
+            a.attrs["href"]
+            for a in [
+                td.select_one("a")
+                for row in bs.select("tr.url")
+                for td in row.select("td")
+                if td.text.startswith("git clone")
+            ]
+            if a is not None
         ]
 
         if not urls:
-- 
GitLab