Skip to content
Snippets Groups Projects
Verified commit 00585f72, authored by Antoine R. Dumont
Browse files

gitweb: Parse the last update interval as a last update

Refs. swh/devel/swh-lister#1800
parent 11d59a03
No related branches found
No related tags found
No related merge requests found
......@@ -43,6 +43,9 @@ ignore_missing_imports = True
[mypy-dulwich.*]
ignore_missing_imports = True
[mypy-dateparser.*]
ignore_missing_imports = True
[mypy-testing.postgresql.*]
ignore_missing_imports = True
......
......@@ -6,6 +6,7 @@ beautifulsoup4
launchpadlib
tenacity >= 6.2
lxml
dateparser
dulwich
testing.postgresql
psycopg2
......
......@@ -8,9 +8,11 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from dateparser import parse
from requests.exceptions import HTTPError
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.lister.utils import now
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
......@@ -60,6 +62,7 @@ class GitwebLister(StatelessLister[Repositories]):
)
self.session.headers.update({"Accept": "application/html"})
self.listing_date = now()
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
......@@ -87,13 +90,12 @@ class GitwebLister(StatelessLister[Repositories]):
if repo_url.endswith("?o=descr"):
continue
# FIXME: Add parsing step from date interval like '9 years ago' to
# This retrieves the date interval in natural language (e.g. '9 years ago') to
# actual python datetime interval so we can derive last update
# span = tr.find("td", {"class": re.compile("age.*")})
# last_updated_date = span.get("title") if span else None
# last_updated_date = None
page_results.append({"url": repo_url})
span = tr.find("td", {"class": re.compile("age.*")})
page_results.append(
{"url": repo_url, "last_update_interval": span.text if span else None}
)
yield page_results
......@@ -108,10 +110,14 @@ class GitwebLister(StatelessLister[Repositories]):
if origin_url is None:
continue
last_update_interval = repo.get("last_update_interval")
last_update = parse(last_update_interval) if last_update_interval else None
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="git",
last_update=last_update,
)
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
......
......@@ -70,8 +70,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
for listed_origin in scheduler_origins:
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith(url)
# Not parsed
assert listed_origin.last_update is None
assert listed_origin.last_update is not None
# test user agent content
for request in requests_mock_datadir.request_history:
......
......@@ -2,6 +2,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
from typing import Iterator, Optional, Tuple
import urllib.parse
......@@ -75,3 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
return False
return True
def now() -> datetime:
    """Return the current date and time as a timezone-aware UTC datetime."""
    current_utc = datetime.now(timezone.utc)
    return current_utc
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment