From ab684134e6ec6ba2b05b593d08f4559069e064e4 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org> Date: Mon, 10 Jul 2023 10:04:10 +0200 Subject: [PATCH] gitweb: Parse the last update interval listed as last update Refs. swh/devel/swh-lister#1800 --- mypy.ini | 3 +++ requirements.txt | 1 + swh/lister/gitweb/lister.py | 22 ++++++++++++++++------ swh/lister/gitweb/tests/test_lister.py | 3 +-- swh/lister/utils.py | 5 +++++ 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/mypy.ini b/mypy.ini index 7f9436b9..76468c2a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -43,6 +43,9 @@ ignore_missing_imports = True [mypy-dulwich.*] ignore_missing_imports = True +[mypy-dateparser.*] +ignore_missing_imports = True + [mypy-testing.postgresql.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 2614f0a5..0e588063 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ beautifulsoup4 launchpadlib tenacity >= 6.2 lxml +dateparser dulwich testing.postgresql psycopg2 diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 57810a9e..62f4f80f 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -2,15 +2,18 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import timezone import logging import re from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup +from dateparser import parse from requests.exceptions import HTTPError from swh.lister.pattern import CredentialsType, StatelessLister +from swh.lister.utils import now from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -60,6 +63,7 @@ class GitwebLister(StatelessLister[Repositories]): ) self.session.headers.update({"Accept": "application/html"}) + self.listing_date = now() def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" @@ -87,13 +91,12 @@ class GitwebLister(StatelessLister[Repositories]): if repo_url.endswith("?o=descr"): continue - # FIXME: Add parsing step from date interval like '9 years ago' to + # This retrieves the date interval in natural language (e.g. '9 years ago') to # actual python datetime interval so we can derive last update - # span = tr.find("td", {"class": re.compile("age.*")}) - # last_updated_date = span.get("title") if span else None - # last_updated_date = None - - page_results.append({"url": repo_url}) + span = tr.find("td", {"class": re.compile("age.*")}) + page_results.append( + {"url": repo_url, "last_update_interval": span.text if span else None} + ) yield page_results @@ -108,10 +111,17 @@ class GitwebLister(StatelessLister[Repositories]): if origin_url is None: continue + last_update_interval = repo.get("last_update_interval") + if last_update_interval is not None: + last_update = parse(last_update_interval).replace(tzinfo=timezone.utc) + else: + last_update = None + yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="git", + last_update=last_update, ) def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py index 1f784b1c..e52546d7 100644 --- a/swh/lister/gitweb/tests/test_lister.py +++ b/swh/lister/gitweb/tests/test_lister.py @@ -70,8 +70,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler): for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(url) - # Not parsed - assert listed_origin.last_update is None + assert listed_origin.last_update is not None # test user agent content for request in requests_mock_datadir.request_history: diff --git a/swh/lister/utils.py b/swh/lister/utils.py index 60cfc933..5d7a6e75 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -2,6 +2,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import datetime, timezone from typing import Iterator, Optional, Tuple import urllib.parse @@ -75,3 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool: return False return True + + +def now() -> datetime: + return datetime.now(tz=timezone.utc) -- GitLab