diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py
index 57810a9e95bd63bbee63615cf3081490592f5e86..a81449cc8d002ce64a5f86470248738b5ef4f0ff 100644
--- a/swh/lister/gitweb/lister.py
+++ b/swh/lister/gitweb/lister.py
@@ -2,6 +2,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import datetime
 import logging
 import re
 from typing import Any, Dict, Iterator, List, Optional
@@ -11,6 +12,7 @@ from bs4 import BeautifulSoup
 from requests.exceptions import HTTPError
 
 from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.lister.utils import now
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -60,6 +62,7 @@ class GitwebLister(StatelessLister[Repositories]):
         )
 
         self.session.headers.update({"Accept": "application/html"})
+        self.listing_date = now()
 
     def _get_and_parse(self, url: str) -> BeautifulSoup:
         """Get the given url and parse the retrieved HTML using BeautifulSoup"""
@@ -87,13 +90,12 @@ class GitwebLister(StatelessLister[Repositories]):
             if repo_url.endswith("?o=descr"):
                 continue
 
-            # FIXME: Add parsing step from date interval like '9 years ago' to
-            # actual python datetime interval so we can derive last update
-            # span = tr.find("td", {"class": re.compile("age.*")})
-            # last_updated_date = span.get("title") if span else None
-            # last_updated_date = None
-
-            page_results.append({"url": repo_url})
+            # Keep the last update age as displayed (e.g. '9 years ago'); it is
+            # converted to an actual date when origins are generated.
+            span = tr.find("td", {"class": re.compile("age.*")})
+            page_results.append(
+                {"url": repo_url, "last_update_interval": span.text if span else None}
+            )
 
         yield page_results
 
@@ -108,10 +110,23 @@ class GitwebLister(StatelessLister[Repositories]):
             if origin_url is None:
                 continue
 
+            # Best effort: gitweb may display ages that cannot be parsed (e.g.
+            # 'No commits'); such origins are listed without a last update date.
+            last_update = None
+            last_update_interval = repo["last_update_interval"]
+            if last_update_interval:
+                try:
+                    last_update = self.listing_date - parse_last_update_interval(
+                        last_update_interval
+                    )
+                except ValueError:
+                    pass
+
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 url=origin_url,
                 visit_type="git",
+                last_update=last_update,
             )
 
     def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
@@ -160,3 +175,47 @@ class GitwebLister(StatelessLister[Repositories]):
                 # otherwise, choose the first one
                 origin_url = urls[0]
         return origin_url
+
+
+# Converters from a number of units to an (approximate) timedelta. "sec" and "min"
+# are the abbreviations gitweb itself displays; a month is approximated as 4 weeks
+# and a year as 52 weeks.
+MAPPING_UNIT = {
+    "sec": lambda n: datetime.timedelta(seconds=n),
+    "second": lambda n: datetime.timedelta(seconds=n),
+    "min": lambda n: datetime.timedelta(minutes=n),
+    "minute": lambda n: datetime.timedelta(minutes=n),
+    "hour": lambda n: datetime.timedelta(hours=n),
+    "day": lambda n: datetime.timedelta(days=n),
+    "week": lambda n: datetime.timedelta(weeks=n),
+    "month": lambda n: datetime.timedelta(weeks=4 * n),
+    "year": lambda n: datetime.timedelta(weeks=52 * n),
+}
+
+
+def parse_last_update_interval(last_update_interval: str) -> datetime.timedelta:
+    """Parse a natural language age (e.g. '9 months ago') into an approximate
+    timedelta.
+
+    Args:
+        last_update_interval: text such as '3 days ago' or 'right now'; surrounding
+          and repeated whitespace is ignored and the unit may be singular or plural.
+
+    Returns:
+        The elapsed time (months count as 4 weeks, years as 52 weeks).
+
+    Raises:
+        ValueError: if the text is neither 'right now' nor of the form
+          '<non-negative integer> <known unit> ago'.
+
+    """
+    parts = last_update_interval.split()
+    if parts == ["right", "now"]:
+        return datetime.timedelta(0)
+
+    if len(parts) == 3 and parts[0].isdecimal() and parts[2] == "ago":
+        to_timedelta = MAPPING_UNIT.get(parts[1].rstrip("s"))
+        if to_timedelta is not None:
+            return to_timedelta(int(parts[0]))
+
+    raise ValueError(f"Unsupported last update interval: {last_update_interval!r}")
diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py
index 1f784b1c90a8895235fc06dd5dc59a0f494cd6c9..3eff33f37af1b3b13302a165aa047a952332337e 100644
--- a/swh/lister/gitweb/tests/test_lister.py
+++ b/swh/lister/gitweb/tests/test_lister.py
@@ -2,13 +2,14 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import datetime
 import os
 from typing import List
 
 import pytest
 
 from swh.lister import __version__
-from swh.lister.gitweb.lister import GitwebLister
+from swh.lister.gitweb.lister import GitwebLister, parse_last_update_interval
 from swh.lister.pattern import ListerStats
 
 MAIN_INSTANCE = "git.distorted.org.uk"
@@ -70,8 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
     for listed_origin in scheduler_origins:
         assert listed_origin.visit_type == "git"
         assert listed_origin.url.startswith(url)
-        # Not parsed
-        assert listed_origin.last_update is None
+        assert listed_origin.last_update is not None
 
     # test user agent content
     for request in requests_mock_datadir.request_history:
@@ -118,3 +118,36 @@ def test_lister_gitweb_get_origin_from_repo_failing(
     # so they are filtered out, only the 7 we know are thus listed
     expected_nb_origins = 7
     assert stats == ListerStats(pages=1, origins=expected_nb_origins)
+
+
+@pytest.mark.parametrize(
+    "interval, expected_result",
+    [
+        ("2 second ago", datetime.timedelta(seconds=2)),
+        (" 2 seconds ago", datetime.timedelta(seconds=2)),
+        (" 3 minute ago", datetime.timedelta(minutes=3)),
+        ("3 minutes ago", datetime.timedelta(minutes=3)),
+        (" 3 day ago ", datetime.timedelta(days=3)),
+        ("4 days ago", datetime.timedelta(days=4)),
+        (" 6 month ago ", datetime.timedelta(weeks=4 * 6)),
+        ("2 months ago", datetime.timedelta(weeks=4 * 2)),
+        ("2 year ago", datetime.timedelta(weeks=52 * 2)),
+        (" 3 years ago ", datetime.timedelta(weeks=52 * 3)),
+        ("5 min ago", datetime.timedelta(minutes=5)),
+        ("30 sec ago", datetime.timedelta(seconds=30)),
+        ("2 hours ago", datetime.timedelta(hours=2)),
+        ("3 weeks ago", datetime.timedelta(weeks=3)),
+        ("0 days ago", datetime.timedelta(0)),
+        (" right now", datetime.timedelta(0)),
+    ],
+)
+def test_parse_last_update_interval(interval, expected_result):
+    assert parse_last_update_interval(interval) == expected_result
+
+
+@pytest.mark.parametrize(
+    "interval", ["", "No commits", "3 days", "3 eons ago", "x days ago", "-1 day ago"]
+)
+def test_parse_last_update_interval_unsupported(interval):
+    with pytest.raises(ValueError):
+        parse_last_update_interval(interval)
diff --git a/swh/lister/utils.py b/swh/lister/utils.py
index 60cfc933831579ba38c83cf86894599b159d961c..5d7a6e75902acfca8aea71a5f6f11355387049f0 100644
--- a/swh/lister/utils.py
+++ b/swh/lister/utils.py
@@ -2,6 +2,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from datetime import datetime, timezone
 from typing import Iterator, Optional, Tuple
 import urllib.parse
 
@@ -75,3 +76,8 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
         return False
 
     return True
+
+
+def now() -> datetime:
+    """Return the current date and time as a timezone-aware UTC datetime."""
+    return datetime.now(tz=timezone.utc)