Skip to content
Snippets Groups Projects
Verified Commit 8b69cd12 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

gitweb: Parse the last update interval as a last update

Refs. swh/devel/swh-lister#1800
parent 11d59a03
No related branches found
No related tags found
No related merge requests found
Pipeline #3496 passed
......@@ -2,6 +2,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
......@@ -11,6 +12,7 @@ from bs4 import BeautifulSoup
from requests.exceptions import HTTPError
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.lister.utils import now
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
......@@ -60,6 +62,7 @@ class GitwebLister(StatelessLister[Repositories]):
)
self.session.headers.update({"Accept": "application/html"})
self.listing_date = now()
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
......@@ -87,13 +90,12 @@ class GitwebLister(StatelessLister[Repositories]):
if repo_url.endswith("?o=descr"):
continue
# FIXME: Add parsing step from date interval like '9 years ago' to
# This retrieves the date interval in natural language (e.g. '9 years ago') to
# actual python datetime interval so we can derive last update
# span = tr.find("td", {"class": re.compile("age.*")})
# last_updated_date = span.get("title") if span else None
# last_updated_date = None
page_results.append({"url": repo_url})
span = tr.find("td", {"class": re.compile("age.*")})
page_results.append(
{"url": repo_url, "last_update_interval": span.text if span else None}
)
yield page_results
......@@ -108,10 +110,22 @@ class GitwebLister(StatelessLister[Repositories]):
if origin_url is None:
continue
last_update_timedelta = repo["last_update_interval"]
if last_update_timedelta:
last_update = self.listing_date - parse_last_update_interval(
last_update_timedelta
)
else:
last_update = None
print("#################### last_update_timedelta", last_update_timedelta)
print("#################### last_update", last_update)
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="git",
last_update=last_update,
)
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
......@@ -160,3 +174,33 @@ class GitwebLister(StatelessLister[Repositories]):
# otherwise, choose the first one
origin_url = urls[0]
return origin_url
# Maps a singular time unit name to a factory building the matching timedelta for
# ``n`` units. Months and years are approximated as 4 and 52 weeks respectively,
# so the resulting intervals are only estimates.
MAPPING_UNIT = {
    "second": lambda n: datetime.timedelta(seconds=n),
    "minute": lambda n: datetime.timedelta(minutes=n),
    "hour": lambda n: datetime.timedelta(hours=n),
    "day": lambda n: datetime.timedelta(days=n),
    "week": lambda n: datetime.timedelta(weeks=n),
    "month": lambda n: datetime.timedelta(weeks=4 * n),
    "year": lambda n: datetime.timedelta(weeks=52 * n),
}


def parse_last_update_interval(last_update_interval: str) -> datetime.timedelta:
    """Parse natural language interval period (e.g. '9 months ago') into an
    approximate timedelta datetime object.

    The expected shape is ``"<count> <unit> ago"`` where ``<unit>`` is one of the
    keys of ``MAPPING_UNIT``, optionally pluralized with a single trailing "s".
    Any amount of surrounding or separating whitespace is tolerated.

    Args:
        last_update_interval: the human readable interval to parse

    Returns:
        The approximate interval; a zero count yields an empty timedelta.

    Raises:
        ValueError: when the text does not have exactly three tokens, does not end
            with "ago", has a non-integer or negative count, or uses an unknown
            unit.

    """
    # split() without argument also copes with tabs/newlines and drops the empty
    # tokens produced by repeated whitespace
    tokens = last_update_interval.split()
    if len(tokens) != 3:
        raise ValueError(
            f"Unexpected interval format (expected '<n> <unit> ago'): "
            f"{last_update_interval!r}"
        )
    number, period, ago = tokens

    # Explicit checks instead of assert statements, which are stripped when
    # python runs with -O
    if ago != "ago":
        raise ValueError(
            f"Interval must end with 'ago', got: {last_update_interval!r}"
        )

    n = int(number)  # raises ValueError itself on a non-integer count
    if n < 0:
        raise ValueError(f"Negative interval count: {last_update_interval!r}")

    # Drop one plural marker only ("hours" -> "hour"), never several characters
    unit = period[:-1] if period.endswith("s") else period
    try:
        to_timedelta = MAPPING_UNIT[unit]
    except KeyError:
        raise ValueError(
            f"Unknown interval unit {period!r} in {last_update_interval!r}"
        ) from None

    return to_timedelta(n)
......@@ -2,13 +2,14 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import os
from typing import List
import pytest
from swh.lister import __version__
from swh.lister.gitweb.lister import GitwebLister
from swh.lister.gitweb.lister import GitwebLister, parse_last_update_interval
from swh.lister.pattern import ListerStats
MAIN_INSTANCE = "git.distorted.org.uk"
......@@ -70,8 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
for listed_origin in scheduler_origins:
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith(url)
# Not parsed
assert listed_origin.last_update is None
assert listed_origin.last_update is not None
# test user agent content
for request in requests_mock_datadir.request_history:
......@@ -118,3 +118,22 @@ def test_lister_gitweb_get_origin_from_repo_failing(
# so they are filtered out, only the 7 we know are thus listed
expected_nb_origins = 7
assert stats == ListerStats(pages=1, origins=expected_nb_origins)
@pytest.mark.parametrize(
    "interval, expected_result",
    [
        ("2 second ago", datetime.timedelta(seconds=2)),
        (" 2 seconds ago", datetime.timedelta(seconds=2)),
        (" 3 minute ago", datetime.timedelta(minutes=3)),
        ("3 minutes ago", datetime.timedelta(minutes=3)),
        # hour and week units are supported by the parser as well
        ("1 hour ago", datetime.timedelta(hours=1)),
        (" 5 hours ago ", datetime.timedelta(hours=5)),
        (" 3 day ago ", datetime.timedelta(days=3)),
        ("4 days ago", datetime.timedelta(days=4)),
        ("1 week ago", datetime.timedelta(weeks=1)),
        (" 2 weeks ago", datetime.timedelta(weeks=2)),
        (" 6 month ago ", datetime.timedelta(weeks=4 * 6)),
        ("2 months ago", datetime.timedelta(weeks=4 * 2)),
        ("2 year ago", datetime.timedelta(weeks=52 * 2)),
        (" 3 years ago ", datetime.timedelta(weeks=52 * 3)),
    ],
)
def test_parse_last_update_interval(interval, expected_result):
    """Check singular/plural units and stray whitespace parse to the expected
    approximate timedelta."""
    assert parse_last_update_interval(interval) == expected_result
......@@ -2,6 +2,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
from typing import Iterator, Optional, Tuple
import urllib.parse
......@@ -75,3 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
return False
return True
def now() -> datetime:
    """Return the current date and time as a timezone-aware datetime in UTC."""
    utc = timezone.utc
    current = datetime.now(tz=utc)
    return current
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment