diff --git a/mypy.ini b/mypy.ini index 7f9436b9e15362bb1dd2d547b07226cac61618cb..76468c2ad0ed728c250ea209d637c76eeb3856ee 100644 --- a/mypy.ini +++ b/mypy.ini @@ -43,6 +43,9 @@ ignore_missing_imports = True [mypy-dulwich.*] ignore_missing_imports = True +[mypy-dateparser.*] +ignore_missing_imports = True + [mypy-testing.postgresql.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 2614f0a5f054dae719751ac80acfcd8772efb585..0e5880631c67c989be670429645599ad3e30f4f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ beautifulsoup4 launchpadlib tenacity >= 6.2 lxml +dateparser dulwich testing.postgresql psycopg2 diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 57810a9e95bd63bbee63615cf3081490592f5e86..3c902e4f7d019771476f6b71cc3bf60df6cbd807 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -8,9 +8,11 @@ from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup +from dateparser import parse from requests.exceptions import HTTPError from swh.lister.pattern import CredentialsType, StatelessLister +from swh.lister.utils import now from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -60,6 +62,7 @@ class GitwebLister(StatelessLister[Repositories]): ) self.session.headers.update({"Accept": "application/html"}) + self.listing_date = now() def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" @@ -87,13 +90,12 @@ class GitwebLister(StatelessLister[Repositories]): if repo_url.endswith("?o=descr"): continue - # FIXME: Add parsing step from date interval like '9 years ago' to + # This retrieves the date interval in natural language (e.g. '9 years ago') to # actual python datetime interval so we can derive last update - # span = tr.find("td", {"class": re.compile("age.*")}) - # last_updated_date = span.get("title") if span else None - # last_updated_date = None - - page_results.append({"url": repo_url}) + span = tr.find("td", {"class": re.compile("age.*")}) + page_results.append( + {"url": repo_url, "last_update_interval": span.text if span else None} + ) yield page_results @@ -108,10 +110,14 @@ class GitwebLister(StatelessLister[Repositories]): if origin_url is None: continue + last_update_interval = repo.get("last_update_interval") + last_update = parse(last_update_interval) if last_update_interval else None + yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="git", + last_update=last_update, ) def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py index 1f784b1c90a8895235fc06dd5dc59a0f494cd6c9..e52546d7e4614998986251bfbd791777e6b42d4d 100644 --- a/swh/lister/gitweb/tests/test_lister.py +++ b/swh/lister/gitweb/tests/test_lister.py @@ -70,8 +70,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler): for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(url) - # Not parsed - assert listed_origin.last_update is None + assert listed_origin.last_update is not None # test user agent content for request in requests_mock_datadir.request_history: diff --git a/swh/lister/utils.py b/swh/lister/utils.py index 60cfc933831579ba38c83cf86894599b159d961c..5d7a6e75902acfca8aea71a5f6f11355387049f0 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -2,6 +2,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import datetime, timezone from typing import Iterator, Optional, Tuple import urllib.parse @@ -75,3 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool: return False return True + + +def now() -> datetime: + return datetime.now(tz=timezone.utc)