From 00585f72278274d04b6979536861ad901d7112d4 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Mon, 10 Jul 2023 10:04:10 +0200
Subject: [PATCH] gitweb: Parse the last update interval as a last update

Refs. swh/devel/swh-lister#1800
---
 mypy.ini                               |  3 +++
 requirements.txt                       |  1 +
 swh/lister/gitweb/lister.py            | 18 ++++++++++++------
 swh/lister/gitweb/tests/test_lister.py |  3 +--
 swh/lister/utils.py                    |  5 +++++
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/mypy.ini b/mypy.ini
index 7f9436b9..76468c2a 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -43,6 +43,9 @@ ignore_missing_imports = True
 [mypy-dulwich.*]
 ignore_missing_imports = True
 
+[mypy-dateparser.*]
+ignore_missing_imports = True
+
 [mypy-testing.postgresql.*]
 ignore_missing_imports = True
 
diff --git a/requirements.txt b/requirements.txt
index 2614f0a5..0e588063 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ beautifulsoup4
 launchpadlib
 tenacity >= 6.2
 lxml
+dateparser
 dulwich
 testing.postgresql
 psycopg2
diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py
index 57810a9e..3c902e4f 100644
--- a/swh/lister/gitweb/lister.py
+++ b/swh/lister/gitweb/lister.py
@@ -8,9 +8,11 @@ from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin, urlparse
 
 from bs4 import BeautifulSoup
+from dateparser import parse
 from requests.exceptions import HTTPError
 
 from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.lister.utils import now
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -60,6 +62,7 @@ class GitwebLister(StatelessLister[Repositories]):
         )
 
         self.session.headers.update({"Accept": "application/html"})
+        self.listing_date = now()
 
     def _get_and_parse(self, url: str) -> BeautifulSoup:
         """Get the given url and parse the retrieved HTML using BeautifulSoup"""
@@ -87,13 +90,12 @@ class GitwebLister(StatelessLister[Repositories]):
             if repo_url.endswith("?o=descr"):
                 continue
 
-            # FIXME: Add parsing step from date interval like '9 years ago' to
+            # Keep the age text (e.g. '9 years ago'); it is later parsed into an
             # actual python datetime interval so we can derive last update
-            # span = tr.find("td", {"class": re.compile("age.*")})
-            # last_updated_date = span.get("title") if span else None
-            # last_updated_date = None
-
-            page_results.append({"url": repo_url})
+            span = tr.find("td", {"class": re.compile("age.*")})
+            page_results.append(
+                {"url": repo_url, "last_update_interval": span.text if span else None}
+            )
 
         yield page_results
 
@@ -108,10 +110,14 @@ class GitwebLister(StatelessLister[Repositories]):
             if origin_url is None:
                 continue
 
+            last_update_interval = repo.get("last_update_interval")
+            last_update = parse(last_update_interval) if last_update_interval else None
+
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 url=origin_url,
                 visit_type="git",
+                last_update=last_update,
             )
 
     def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py
index 1f784b1c..e52546d7 100644
--- a/swh/lister/gitweb/tests/test_lister.py
+++ b/swh/lister/gitweb/tests/test_lister.py
@@ -70,8 +70,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
     for listed_origin in scheduler_origins:
         assert listed_origin.visit_type == "git"
         assert listed_origin.url.startswith(url)
-        # Not parsed
-        assert listed_origin.last_update is None
+        assert listed_origin.last_update is not None
 
     # test user agent content
     for request in requests_mock_datadir.request_history:
diff --git a/swh/lister/utils.py b/swh/lister/utils.py
index 60cfc933..5d7a6e75 100644
--- a/swh/lister/utils.py
+++ b/swh/lister/utils.py
@@ -2,6 +2,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from datetime import datetime, timezone
 from typing import Iterator, Optional, Tuple
 import urllib.parse
 
@@ -75,3 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
         return False
 
     return True
+
+
+def now() -> datetime:
+    return datetime.now(tz=timezone.utc)
-- 
GitLab