Commit 41407e0e authored by Antoine Lambert

Use beautifulsoup4 CSS selectors to simplify code and type checking

As the types-beautifulsoup4 package gets installed in the swh
virtualenv (it is a swh-scanner test dependency), mypy reported errors
related to beautifulsoup4 typing.

As the return type of the bs4 find method is the union
Tag | NavigableString | None, isinstance calls must be used to narrow
the type before use, which is not great.
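
For instance (an illustrative sketch, not code from this changeset):

    from bs4 import BeautifulSoup, Tag

    soup = BeautifulSoup("<a href='/repo'>repo</a>", "html.parser")

    # find() returns Tag | NavigableString | None, so mypy requires
    # isinstance narrowing before any Tag attribute can be accessed:
    link = soup.find("a")
    if isinstance(link, Tag):
        print(link.attrs["href"])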

So prefer the select_one method instead, which returns Optional[Tag]:
a simple None check is then enough to ensure correct typing. In a
similar manner, replace uses of the find_all method with the select
method.
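
With a CSS selector, the same lookup only needs a None check (same
illustrative snippet as above; select() similarly returns a list of
Tag elements):

    # select_one() returns Optional[Tag], so a plain None check
    # is enough to narrow the type:
    link = soup.select_one("a[href]")
    if link is not None:
        print(link.attrs["href"])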

This also has the advantage of simplifying the code.
parent e6a35c55
1 merge request: !524 Use beautifulsoup4 CSS selectors to simplify code and type checking
Pipeline #8352 passed
@@ -3,6 +3,7 @@ pytest >= 8.1
pytest-mock
requests_mock
swh-scheduler[testing]
types-beautifulsoup4
types-click
types-pyyaml
types-requests
......
# Copyright (C) 2022-2023 The Software Heritage developers
# Copyright (C) 2022-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
            )
            response = self.http_request(url)
            soup = BeautifulSoup(response.text, "html.parser")
            links = soup.find_all("a", href=True)
            links = soup.select("a[href]")
            # drop the first line (used to go to up directory)
            if links[0].attrs["href"] == "../":
            if links and links[0].attrs["href"] == "../":
                links.pop(0)
            versions = []
@@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
                arch = m.group("arch")
                version = m.group("version")
                # Extract last_modified and an approximate file size
                # Extract last_modified date
                last_modified = None
                raw_text = link.next_sibling
                raw_text_rex = re.compile(
                    r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                )
                s = raw_text_rex.search(raw_text.strip())
                if s is None:
                    logger.error(
                        "Can not find a match for 'last_modified' in '%(raw_text)s'",
                        dict(raw_text=raw_text),
                if raw_text:
                    raw_text_rex = re.compile(
                        r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                    )
                else:
                    values = s.groups()
                    assert values and len(values) == 1
                    last_modified_str = values[0]
                    # format as expected
                    last_modified = datetime.datetime.strptime(
                        last_modified_str, "%d-%b-%Y %H:%M"
                    ).isoformat()
                    s = raw_text_rex.search(raw_text.text.strip())
                    if s is None:
                        logger.error(
                            "Can not find a match for 'last_modified' in '%(raw_text)s'",
                            dict(raw_text=raw_text),
                        )
                    else:
                        values = s.groups()
                        assert values and len(values) == 1
                        last_modified_str = values[0]
                        # format as expected
                        last_modified = datetime.datetime.strptime(
                            last_modified_str, "%d-%b-%Y %H:%M"
                        ).isoformat()
                # link url is relative, format a canonical one
                url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
......
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2023-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -146,10 +146,11 @@ class BioconductorLister(Lister[BioconductorListerState, BioconductorListerPage]
            f"{self.BIOCONDUCTOR_HOMEPAGE}/about/release-announcements"
        ).text
        bs = BeautifulSoup(html, "html.parser")
        return [
            tr.find_all("td")[0].text
            for tr in reversed(bs.find("table").find("tbody").find_all("tr"))
            if tr.find_all("td")[2].find("a")
            tr.select("td")[0].text
            for tr in reversed(bs.select("table tbody tr"))
            if tr.select("td")[2].select("a")
        ]

    def parse_packages(self, text: str) -> Dict[str, Any]:
......
# Copyright (C) 2019-2023 The Software Heritage developers
# Copyright (C) 2019-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
@@ -95,14 +94,16 @@ class CGitLister(StatelessLister[Repositories]):
        page_results = []
        for tr in bs_idx.find("div", {"class": "content"}).find_all(
            "tr", {"class": ""}
        ):
            repository_link = tr.find("a")["href"]
        for tr in bs_idx.select("div.content tr:not([class])"):
            repository_link = tr.select_one("a")
            if repository_link is None:
                continue
            repo_url = None
            git_url = None
            base_url = urljoin(self.url, repository_link).strip("/")
            base_url = urljoin(self.url, repository_link.attrs["href"]).strip("/")
            if self.base_git_url:  # mapping provided
                # computing git url
                git_url = base_url.replace(self.url, self.base_git_url)
@@ -111,7 +112,7 @@ class CGitLister(StatelessLister[Repositories]):
            # the git url (cf. self.get_origins_from_page)
            repo_url = base_url
            span = tr.find("span", {"class": re.compile("age-")})
            span = tr.select_one('span[class^="age-"]')
            last_updated_date = span.get("title") if span else None
            page_results.append(
@@ -125,12 +126,14 @@ class CGitLister(StatelessLister[Repositories]):
        yield page_results
        try:
            pager = bs_idx.find("ul", {"class": "pager"})
            current_page = pager.find("a", {"class": "current"})
            if current_page:
                next_page = current_page.parent.next_sibling.a["href"]
                next_page = urljoin(self.url, next_page)
            next_page_li = bs_idx.select_one(
                "ul.pager li:has(> a.current) + li:has(> a)"
            )
            next_page = (
                urljoin(self.url, next_page_li.select("a")[0].attrs["href"])
                if next_page_li
                else None
            )
        except (AttributeError, KeyError):
            # no pager, or no next page
            next_page = None
@@ -169,27 +172,26 @@ class CGitLister(StatelessLister[Repositories]):
            return None
        # check if we are on the summary tab, if not, go to this tab
        tab = bs.find("table", {"class": "tabs"})
        if tab:
            summary_a = tab.find("a", string="summary")
            if summary_a:
                summary_url = urljoin(repository_url, summary_a["href"]).strip("/")
                if summary_url != repository_url:
                    logger.debug(
                        "%s : Active tab is not the summary, trying to load the summary page",
                        repository_url,
                    )
                    return self._get_origin_from_repository_url(summary_url)
            else:
                logger.debug("No summary tab found on %s", repository_url)
        summary_a = bs.select_one('table.tabs a:-soup-contains("summary")')
        if summary_a:
            summary_path = summary_a.attrs["href"]
            summary_url = urljoin(repository_url, summary_path).strip("/")
            if summary_url != repository_url:
                logger.debug(
                    "%s : Active tab is not the summary, trying to load the summary page",
                    repository_url,
                )
                return self._get_origin_from_repository_url(summary_url)
        else:
            logger.debug("No summary tab found on %s", repository_url)
        # origin urls are listed on the repository page
        # TODO check if forcing https is better or not ?
        # <link rel='vcs-git' href='git://...' title='...'/>
        # <link rel='vcs-git' href='http://...' title='...'/>
        # <link rel='vcs-git' href='https://...' title='...'/>
        urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
        urls = [x.attrs["href"] for x in bs.select('a[rel="vcs-git"]')]
        if not urls:
            logger.debug("No git urls found on %s", repository_url)
......
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2023-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import parse_qs, urljoin, urlparse
@@ -80,14 +79,12 @@ class GitwebLister(StatelessLister[Repositories]):
        page_results = []
        for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
            "tr"
        ):
            link = tr.find("a")
        for tr in bs_idx.select("table.project_list tr"):
            link = tr.select_one("a")
            if not link:
                continue
            repo_url = urljoin(self.url, link["href"]).strip("/")
            repo_url = urljoin(self.url, link.attrs["href"]).strip("/")
            # Skip this description page which is listed but won't yield any origins to list
            if repo_url.endswith("?o=descr"):
@@ -95,7 +92,7 @@ class GitwebLister(StatelessLister[Repositories]):
            # This retrieves the date interval in natural language (e.g. '9 years ago')
            # to actual python datetime interval so we can derive last update
            span = tr.find("td", {"class": re.compile("age.*")})
            span = tr.select_one('td[class^="age"]')
            page_results.append(
                {"url": repo_url, "last_update_interval": span.text if span else None}
            )
@@ -134,8 +131,8 @@ class GitwebLister(StatelessLister[Repositories]):
            return None
        urls = []
        for row in bs.find_all("tr", {"class": "metadata_url"}):
            url = row.contents[-1].string.strip()
        for row in bs.select("tr.metadata_url"):
            url = row.select("td")[-1].text.strip()
            for scheme in ("http", "https", "git"):
                # remove any string prefix before origin
                pos = url.find(f"{scheme}://")
......
# Copyright (C) 2021-2022 The Software Heritage developers
# Copyright (C) 2021-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -226,32 +226,22 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
        logger.info("Found %s poms.", len(out_pom))
        # Now fetch pom files and scan them for scm info.
        logger.info("Fetching poms..")
        logger.info("Fetching poms ...")
        for pom_url in out_pom:
            try:
                response = self.http_request(pom_url)
                parsed_pom = BeautifulSoup(response.content, "xml")
                project = parsed_pom.find("project")
                if project is None:
                    continue
                scm = project.find("scm")
                if scm is not None:
                    connection = scm.find("connection")
                    if connection is not None:
                        artifact_metadata_d = {
                            "type": "scm",
                            "doc": out_pom[pom_url],
                            "url": connection.text,
                        }
                        logger.debug(
                            "* Yielding pom %s: %s", pom_url, artifact_metadata_d
                        )
                        yield artifact_metadata_d
                    else:
                        logger.debug("No scm.connection in pom %s", pom_url)
                connection = parsed_pom.select_one("project scm connection")
                if connection is not None:
                    artifact_metadata_d = {
                        "type": "scm",
                        "doc": out_pom[pom_url],
                        "url": connection.text,
                    }
                    logger.debug("* Yielding pom %s: %s", pom_url, artifact_metadata_d)
                    yield artifact_metadata_d
                else:
                    logger.debug("No scm in pom %s", pom_url)
                    logger.debug("No project.scm.connection in pom %s", pom_url)
            except requests.HTTPError:
                logger.warning(
                    "POM info page could not be fetched, skipping project '%s'",
......
@@ -146,7 +146,7 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]):
                )
                continue
            xml = BeautifulSoup(res_metadata.content, "xml")
            repo = xml.find("repository")
            repo = xml.select_one("repository")
            if repo and "url" in repo.attrs and "type" in repo.attrs:
                vcs_url = repo.attrs["url"]
                vcs_type = repo.attrs["type"]
......
# Copyright (C) 2022-2023 The Software Heritage developers
# Copyright (C) 2022-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -82,8 +82,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
    def get_latest_dump_file(self) -> str:
        response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
        xml = BeautifulSoup(response.content, "xml")
        contents = xml.find_all("Contents")
        return contents[-1].find("Key").text
        contents = xml.select("Contents")
        return contents[-1].select("Key")[0].text

    def create_rubygems_db(
        self, postgresql: Postgresql
......
# Copyright (C) 2021-2023 The Software Heritage developers
# Copyright (C) 2021-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -367,7 +367,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
        else:
            bs = BeautifulSoup(response.text, features="html.parser")
            cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
            for text in [b.text for b in bs.find_all("b")]:
            for text in [b.text for b in bs.select("b")]:
                match = re.search(rf".*/cvsroot/{project} co -P (.+)", text)
                if match is not None:
                    module = match.group(1)
......
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2023-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse
@@ -73,15 +72,19 @@ class StagitLister(StatelessLister[Repositories]):
        page_results = []
        for tr in bs_idx.find("table", {"id": re.compile("index")}).find_all("tr"):
            link = tr.find("a")
        index_table = bs_idx.select_one("table#index")
        if index_table is None:
            return
        for tr in index_table.select("tr"):
            link = tr.select_one("a")
            if not link:
                continue
            repo_description_url = self.url + "/" + link["href"]
            repo_description_url = self.url + "/" + link.attrs["href"]
            # This retrieves the date in format "%Y-%m-%d %H:%M"
            tds = tr.find_all("td")
            tds = tr.select("td")
            last_update = tds[-1].text if tds and tds[-1] else None
            page_results.append(
@@ -122,10 +125,14 @@ class StagitLister(StatelessLister[Repositories]):
            return None
        urls = [
            td.find("a")["href"]
            for row in bs.find_all("tr", {"class": "url"})
            for td in row.find_all("td")
            if td.text.startswith("git clone")
            a.attrs["href"]
            for a in [
                td.select_one("a")
                for row in bs.select("tr.url")
                for td in row.select("td")
                if td.text.startswith("git clone")
            ]
            if a is not None
        ]
        if not urls:
......