diff --git a/requirements-test.txt b/requirements-test.txt
index 442fd85e1d501e60c6e65b89e4561998bfb76633..5bb91bd9005f3a92676a89ea941ed9bde30ffac3 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,6 +3,7 @@ pytest >= 8.1
 pytest-mock
 requests_mock
 swh-scheduler[testing]
+types-beautifulsoup4
 types-click
 types-pyyaml
 types-requests
diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py
index 66b9ce86a1595e90c6218639f9db95486ed4be67..6537cc44f396d011b2599a9703a4e74af1d629ae 100644
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023 The Software Heritage developers
+# Copyright (C) 2022-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
         )
         response = self.http_request(url)
         soup = BeautifulSoup(response.text, "html.parser")
-        links = soup.find_all("a", href=True)
+        links = soup.select("a[href]")

         # drop the first line (used to go to up directory)
-        if links[0].attrs["href"] == "../":
+        if links and links[0].attrs["href"] == "../":
             links.pop(0)

         versions = []
@@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
             arch = m.group("arch")
             version = m.group("version")

-            # Extract last_modified and an approximate file size
+            # Extract last_modified date
+            last_modified = None
             raw_text = link.next_sibling
-            raw_text_rex = re.compile(
-                r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
-            )
-            s = raw_text_rex.search(raw_text.strip())
-            if s is None:
-                logger.error(
-                    "Can not find a match for 'last_modified' in '%(raw_text)s'",
-                    dict(raw_text=raw_text),
+            if raw_text:
+                raw_text_rex = re.compile(
+                    r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                 )
-            else:
-                values = s.groups()
-                assert values and len(values) == 1
-                last_modified_str = values[0]
-
-                # format as expected
-                last_modified = datetime.datetime.strptime(
-                    last_modified_str, "%d-%b-%Y %H:%M"
-                ).isoformat()
+                s = raw_text_rex.search(raw_text.text.strip())
+                if s is None:
+                    logger.error(
+                        "Can not find a match for 'last_modified' in '%(raw_text)s'",
+                        dict(raw_text=raw_text),
+                    )
+                else:
+                    values = s.groups()
+                    assert values and len(values) == 1
+                    last_modified_str = values[0]
+
+                    # format as expected
+                    last_modified = datetime.datetime.strptime(
+                        last_modified_str, "%d-%b-%Y %H:%M"
+                    ).isoformat()

             # link url is relative, format a canonical one
             url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
diff --git a/swh/lister/bioconductor/lister.py b/swh/lister/bioconductor/lister.py
index e050f76c87b549df7af548ec670a212b5fa46b30..3c505569b2aeb8ab3254431f7ae0350a5aaa8394 100644
--- a/swh/lister/bioconductor/lister.py
+++ b/swh/lister/bioconductor/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 The Software Heritage developers
+# Copyright (C) 2023-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -146,10 +146,11 @@ class BioconductorLister(Lister[BioconductorListerState, BioconductorListerPage]
             f"{self.BIOCONDUCTOR_HOMEPAGE}/about/release-announcements"
         ).text
         bs = BeautifulSoup(html, "html.parser")
+
         return [
-            tr.find_all("td")[0].text
-            for tr in reversed(bs.find("table").find("tbody").find_all("tr"))
-            if tr.find_all("td")[2].find("a")
+            tr.select("td")[0].text
+            for tr in reversed(bs.select("table tbody tr"))
+            if tr.select("td")[2].select("a")
         ]

     def parse_packages(self, text: str) -> Dict[str, Any]:
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
index af3d70362c58fe20d6aab1ca3eec4d0110504ac7..c04b38fa43944bcda40d76172d2182bf063ad4a0 100644
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -1,10 +1,9 @@
-# Copyright (C) 2019-2023 The Software Heritage developers
+# Copyright (C) 2019-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin, urlparse

@@ -95,14 +94,16 @@ class CGitLister(StatelessLister[Repositories]):

         page_results = []

-        for tr in bs_idx.find("div", {"class": "content"}).find_all(
-            "tr", {"class": ""}
-        ):
-            repository_link = tr.find("a")["href"]
+        for tr in bs_idx.select("div.content tr:not([class])"):
+            repository_link = tr.select_one("a")
+
+            if repository_link is None:
+                continue
+
             repo_url = None
             git_url = None

-            base_url = urljoin(self.url, repository_link).strip("/")
+            base_url = urljoin(self.url, repository_link.attrs["href"]).strip("/")
             if self.base_git_url:  # mapping provided
                 # computing git url
                 git_url = base_url.replace(self.url, self.base_git_url)
@@ -111,7 +112,7 @@
                 # the git url (cf. self.get_origins_from_page)
                 repo_url = base_url

-            span = tr.find("span", {"class": re.compile("age-")})
+            span = tr.select_one('span[class^="age-"]')
             last_updated_date = span.get("title") if span else None

             page_results.append(
@@ -125,12 +126,14 @@
         yield page_results

         try:
-            pager = bs_idx.find("ul", {"class": "pager"})
-
-            current_page = pager.find("a", {"class": "current"})
-            if current_page:
-                next_page = current_page.parent.next_sibling.a["href"]
-                next_page = urljoin(self.url, next_page)
+            next_page_li = bs_idx.select_one(
+                "ul.pager li:has(> a.current) + li:has(> a)"
+            )
+            next_page = (
+                urljoin(self.url, next_page_li.select("a")[0].attrs["href"])
+                if next_page_li
+                else None
+            )
         except (AttributeError, KeyError):
             # no pager, or no next page
             next_page = None
@@ -169,27 +172,26 @@
             return None

         # check if we are on the summary tab, if not, go to this tab
-        tab = bs.find("table", {"class": "tabs"})
-        if tab:
-            summary_a = tab.find("a", string="summary")
-            if summary_a:
-                summary_url = urljoin(repository_url, summary_a["href"]).strip("/")
-
-                if summary_url != repository_url:
-                    logger.debug(
-                        "%s : Active tab is not the summary, trying to load the summary page",
-                        repository_url,
-                    )
-                    return self._get_origin_from_repository_url(summary_url)
-            else:
-                logger.debug("No summary tab found on %s", repository_url)
+        summary_a = bs.select_one('table.tabs a:-soup-contains("summary")')
+        if summary_a:
+            summary_path = summary_a.attrs["href"]
+            summary_url = urljoin(repository_url, summary_path).strip("/")
+
+            if summary_url != repository_url:
+                logger.debug(
+                    "%s : Active tab is not the summary, trying to load the summary page",
+                    repository_url,
+                )
+                return self._get_origin_from_repository_url(summary_url)
+        else:
+            logger.debug("No summary tab found on %s", repository_url)

         # origin urls are listed on the repository page
         # TODO check if forcing https is better or not ?
         # <link rel='vcs-git' href='git://...' title='...'/>
         # <link rel='vcs-git' href='http://...' title='...'/>
         # <link rel='vcs-git' href='https://...' title='...'/>
-        urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
+        urls = [x.attrs["href"] for x in bs.select('a[rel="vcs-git"]')]

         if not urls:
             logger.debug("No git urls found on %s", repository_url)
diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py
index 26521141fb21b9554db6b9aab0e66f0ccdf7586a..7ac71b08e36e361ea5768dc36aae7acb6a76724a 100644
--- a/swh/lister/gitweb/lister.py
+++ b/swh/lister/gitweb/lister.py
@@ -1,10 +1,9 @@
-# Copyright (C) 2023 The Software Heritage developers
+# Copyright (C) 2023-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import parse_qs, urljoin, urlparse

@@ -80,14 +79,12 @@ class GitwebLister(StatelessLister[Repositories]):

         page_results = []

-        for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
-            "tr"
-        ):
-            link = tr.find("a")
+        for tr in bs_idx.select("table.project_list tr"):
+            link = tr.select_one("a")
             if not link:
                 continue

-            repo_url = urljoin(self.url, link["href"]).strip("/")
+            repo_url = urljoin(self.url, link.attrs["href"]).strip("/")

             # Skip this description page which is listed but won't yield any origins to list
             if repo_url.endswith("?o=descr"):
@@ -95,7 +92,7 @@

         # This retrieves the date interval in natural language (e.g. '9 years ago')
         # to actual python datetime interval so we can derive last update
-        span = tr.find("td", {"class": re.compile("age.*")})
+        span = tr.select_one('td[class^="age"]')
         page_results.append(
             {"url": repo_url, "last_update_interval": span.text if span else None}
         )
@@ -134,8 +131,8 @@
         return None

         urls = []
-        for row in bs.find_all("tr", {"class": "metadata_url"}):
-            url = row.contents[-1].string.strip()
+        for row in bs.select("tr.metadata_url"):
+            url = row.select("td")[-1].text.strip()
             for scheme in ("http", "https", "git"):
                 # remove any string prefix before origin
                 pos = url.find(f"{scheme}://")
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
index 56fafe16f2f5c8a172ac7aea4d65d5d36188385e..0441552cdb999612c3c7c658efd0df8c8fe1adc8 100644
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2022 The Software Heritage developers
+# Copyright (C) 2021-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -226,32 +226,22 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
         logger.info("Found %s poms.", len(out_pom))

         # Now fetch pom files and scan them for scm info.
- - logger.info("Fetching poms..") + logger.info("Fetching poms ...") for pom_url in out_pom: try: response = self.http_request(pom_url) parsed_pom = BeautifulSoup(response.content, "xml") - project = parsed_pom.find("project") - if project is None: - continue - scm = project.find("scm") - if scm is not None: - connection = scm.find("connection") - if connection is not None: - artifact_metadata_d = { - "type": "scm", - "doc": out_pom[pom_url], - "url": connection.text, - } - logger.debug( - "* Yielding pom %s: %s", pom_url, artifact_metadata_d - ) - yield artifact_metadata_d - else: - logger.debug("No scm.connection in pom %s", pom_url) + connection = parsed_pom.select_one("project scm connection") + if connection is not None: + artifact_metadata_d = { + "type": "scm", + "doc": out_pom[pom_url], + "url": connection.text, + } + logger.debug("* Yielding pom %s: %s", pom_url, artifact_metadata_d) + yield artifact_metadata_d else: - logger.debug("No scm in pom %s", pom_url) + logger.debug("No project.scm.connection in pom %s", pom_url) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py index 7604e078a6ed459c60ac449a73dd000e10a8cc9a..0c4997edc4eee522d2da77082f166876b973b61f 100644 --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -146,7 +146,7 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]): ) continue xml = BeautifulSoup(res_metadata.content, "xml") - repo = xml.find("repository") + repo = xml.select_one("repository") if repo and "url" in repo.attrs and "type" in repo.attrs: vcs_url = repo.attrs["url"] vcs_type = repo.attrs["type"] diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index 4e59b901cf8457600445407eaaa2e9cbc926dbca..400a992c6e39393ccdf407a117ce11d47e833ba7 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022-2023 The Software Heritage developers +# Copyright (C) 2022-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -82,8 +82,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): def get_latest_dump_file(self) -> str: response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL) xml = BeautifulSoup(response.content, "xml") - contents = xml.find_all("Contents") - return contents[-1].find("Key").text + contents = xml.select("Contents") + return contents[-1].select("Key")[0].text def create_rubygems_db( self, postgresql: Postgresql diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index 518a7ece9b5d9f2696d29d4aaf11093db76be718..3d19894c7074a30213c48fd1ed267d4a6ed7c828 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021-2023 The Software Heritage developers +# Copyright (C) 2021-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -367,7 +367,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): else: bs = BeautifulSoup(response.text, features="html.parser") cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot" - for text in [b.text for b in 
bs.find_all("b")]: + for text in [b.text for b in bs.select("b")]: match = re.search(rf".*/cvsroot/{project} co -P (.+)", text) if match is not None: module = match.group(1) diff --git a/swh/lister/stagit/lister.py b/swh/lister/stagit/lister.py index 7a2187f7d0229ac0c84e4e336dd123f8a964d875..bc82499afb77ba9a08c382c551acb371b273c510 100644 --- a/swh/lister/stagit/lister.py +++ b/swh/lister/stagit/lister.py @@ -1,10 +1,9 @@ -# Copyright (C) 2023 The Software Heritage developers +# Copyright (C) 2023-2024 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import logging -import re from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse @@ -73,15 +72,19 @@ class StagitLister(StatelessLister[Repositories]): page_results = [] - for tr in bs_idx.find("table", {"id": re.compile("index")}).find_all("tr"): - link = tr.find("a") + index_table = bs_idx.select_one("table#index") + if index_table is None: + return + + for tr in index_table.select("tr"): + link = tr.select_one("a") if not link: continue - repo_description_url = self.url + "/" + link["href"] + repo_description_url = self.url + "/" + link.attrs["href"] # This retrieves the date in format "%Y-%m-%d %H:%M" - tds = tr.find_all("td") + tds = tr.select("td") last_update = tds[-1].text if tds and tds[-1] else None page_results.append( @@ -122,10 +125,14 @@ class StagitLister(StatelessLister[Repositories]): return None urls = [ - td.find("a")["href"] - for row in bs.find_all("tr", {"class": "url"}) - for td in row.find_all("td") - if td.text.startswith("git clone") + a.attrs["href"] + for a in [ + td.select_one("a") + for row in bs.select("tr.url") + for td in row.select("td") + if td.text.startswith("git clone") + ] + if a is not None ] if not urls: