Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
S
swh-lister
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Antoine R. Dumont
swh-lister
Commits
8b69cd12
Verified
Commit
8b69cd12
authored
1 year ago
by
Antoine R. Dumont
Browse files
Options
Downloads
Patches
Plain Diff
gitweb: Parse the last update interval as a last update
Refs.
swh/devel/swh-lister#1800
parent
11d59a03
No related branches found
No related tags found
No related merge requests found
Pipeline
#3496
passed
1 year ago
Stage: external
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
swh/lister/gitweb/lister.py
+50
-6
50 additions, 6 deletions
swh/lister/gitweb/lister.py
swh/lister/gitweb/tests/test_lister.py
+22
-3
22 additions, 3 deletions
swh/lister/gitweb/tests/test_lister.py
swh/lister/utils.py
+5
-0
5 additions, 0 deletions
swh/lister/utils.py
with
77 additions
and
9 deletions
swh/lister/gitweb/lister.py
+
50
−
6
View file @
8b69cd12
...
...
@@ -2,6 +2,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
datetime
import
logging
import
re
from
typing
import
Any
,
Dict
,
Iterator
,
List
,
Optional
...
...
@@ -11,6 +12,7 @@ from bs4 import BeautifulSoup
from
requests.exceptions
import
HTTPError
from
swh.lister.pattern
import
CredentialsType
,
StatelessLister
from
swh.lister.utils
import
now
from
swh.scheduler.interface
import
SchedulerInterface
from
swh.scheduler.model
import
ListedOrigin
...
...
@@ -60,6 +62,7 @@ class GitwebLister(StatelessLister[Repositories]):
)
self
.
session
.
headers
.
update
({
"
Accept
"
:
"
application/html
"
})
self
.
listing_date
=
now
()
def
_get_and_parse
(
self
,
url
:
str
)
->
BeautifulSoup
:
"""
Get the given url and parse the retrieved HTML using BeautifulSoup
"""
...
...
@@ -87,13 +90,12 @@ class GitwebLister(StatelessLister[Repositories]):
if
repo_url
.
endswith
(
"
?o=descr
"
):
continue
#
FIXME: Add parsing step from date interval like
'9 years ago' to
#
This retrieves the date interval in natural language (e.g.
'9 years ago'
)
to
# actual python datetime interval so we can derive last update
# span = tr.find("td", {"class": re.compile("age.*")})
# last_updated_date = span.get("title") if span else None
# last_updated_date = None
page_results
.
append
({
"
url
"
:
repo_url
})
span
=
tr
.
find
(
"
td
"
,
{
"
class
"
:
re
.
compile
(
"
age.*
"
)})
page_results
.
append
(
{
"
url
"
:
repo_url
,
"
last_update_interval
"
:
span
.
text
if
span
else
None
}
)
yield
page_results
...
...
@@ -108,10 +110,22 @@ class GitwebLister(StatelessLister[Repositories]):
if
origin_url
is
None
:
continue
last_update_timedelta
=
repo
[
"
last_update_interval
"
]
if
last_update_timedelta
:
last_update
=
self
.
listing_date
-
parse_last_update_interval
(
last_update_timedelta
)
else
:
last_update
=
None
print
(
"
#################### last_update_timedelta
"
,
last_update_timedelta
)
print
(
"
#################### last_update
"
,
last_update
)
yield
ListedOrigin
(
lister_id
=
self
.
lister_obj
.
id
,
url
=
origin_url
,
visit_type
=
"
git
"
,
last_update
=
last_update
,
)
def
_get_origin_from_repository_url
(
self
,
repository_url
:
str
)
->
Optional
[
str
]:
...
...
@@ -160,3 +174,33 @@ class GitwebLister(StatelessLister[Repositories]):
# otherwise, choose the first one
origin_url
=
urls
[
0
]
return
origin_url
# Factories turning a count of time units into an (approximate) timedelta.
# Months and years are approximated as 4 and 52 weeks respectively, since a
# natural language interval carries no calendar information.
# "sec" and "min" are the abbreviated units emitted by gitweb's age strings
# (e.g. '5 min ago'); the long forms are kept for other renderings.
MAPPING_UNIT = {
    "sec": lambda n: datetime.timedelta(seconds=n),
    "second": lambda n: datetime.timedelta(seconds=n),
    "min": lambda n: datetime.timedelta(minutes=n),
    "minute": lambda n: datetime.timedelta(minutes=n),
    "hour": lambda n: datetime.timedelta(hours=n),
    "day": lambda n: datetime.timedelta(days=n),
    "week": lambda n: datetime.timedelta(weeks=n),
    "month": lambda n: datetime.timedelta(weeks=4 * n),
    "year": lambda n: datetime.timedelta(weeks=52 * n),
}


def parse_last_update_interval(last_update_interval: str) -> datetime.timedelta:
    """Parse a natural language interval (e.g. '9 months ago') into an
    approximate timedelta.

    The expected shape is '<count> <unit> ago' where <unit> is one of the
    MAPPING_UNIT keys, optionally in plural form. Any amount of whitespace
    (including non-breaking spaces) may surround or separate the three parts.

    Args:
        last_update_interval: the interval as displayed by the forge

    Returns:
        The timedelta matching the interval; a zero timedelta for a count of 0.

    Raises:
        ValueError: if the string does not have the expected shape, the count
            is not a non-negative integer, or the unit is unknown.

    """
    # split() without argument collapses runs of any whitespace and drops empty
    # parts, so no extra stripping/filtering is needed.
    parts = last_update_interval.split()
    if len(parts) != 3:
        raise ValueError(
            f"Unsupported last update interval {last_update_interval!r}: "
            "expected '<count> <unit> ago'"
        )
    number, period, ago = parts

    n = int(number)  # raises ValueError on a non-integer count
    if n < 0:
        raise ValueError(
            f"Unsupported last update interval {last_update_interval!r}: "
            "negative count"
        )
    if ago != "ago":
        raise ValueError(
            f"Unsupported last update interval {last_update_interval!r}: "
            "missing trailing 'ago'"
        )

    # Normalize plural forms ('days' -> 'day') before looking the unit up
    unit = period.rstrip("s")
    if unit not in MAPPING_UNIT:
        raise ValueError(
            f"Unsupported last update interval {last_update_interval!r}: "
            f"unknown unit {period!r}"
        )

    # Always computed, including for n == 0, so the result is never unbound
    return MAPPING_UNIT[unit](n)
This diff is collapsed.
Click to expand it.
swh/lister/gitweb/tests/test_lister.py
+
22
−
3
View file @
8b69cd12
...
...
@@ -2,13 +2,14 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
datetime
import
os
from
typing
import
List
import
pytest
from
swh.lister
import
__version__
from
swh.lister.gitweb.lister
import
GitwebLister
from
swh.lister.gitweb.lister
import
GitwebLister
,
parse_last_update_interval
from
swh.lister.pattern
import
ListerStats
MAIN_INSTANCE
=
"
git.distorted.org.uk
"
...
...
@@ -70,8 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
for
listed_origin
in
scheduler_origins
:
assert
listed_origin
.
visit_type
==
"
git
"
assert
listed_origin
.
url
.
startswith
(
url
)
# Not parsed
assert
listed_origin
.
last_update
is
None
assert
listed_origin
.
last_update
is
not
None
# test user agent content
for
request
in
requests_mock_datadir
.
request_history
:
...
...
@@ -118,3 +118,22 @@ def test_lister_gitweb_get_origin_from_repo_failing(
# so they are filtered out, only the 7 we know are thus listed
expected_nb_origins
=
7
assert
stats
==
ListerStats
(
pages
=
1
,
origins
=
expected_nb_origins
)
@pytest.mark.parametrize(
    "interval, expected_result",
    [
        ("2 second ago", datetime.timedelta(seconds=2)),
        ("2 seconds ago", datetime.timedelta(seconds=2)),
        ("3 minute ago", datetime.timedelta(minutes=3)),
        ("3 minutes ago", datetime.timedelta(minutes=3)),
        # hour and week units were previously not exercised
        ("1 hour ago", datetime.timedelta(hours=1)),
        ("5 hours ago", datetime.timedelta(hours=5)),
        ("3 day ago", datetime.timedelta(days=3)),
        ("4 days ago", datetime.timedelta(days=4)),
        ("1 week ago", datetime.timedelta(weeks=1)),
        ("3 weeks ago", datetime.timedelta(weeks=3)),
        # months and years are approximated as 4 and 52 weeks
        ("6 month ago", datetime.timedelta(weeks=4 * 6)),
        ("2 months ago", datetime.timedelta(weeks=4 * 2)),
        ("2 year ago", datetime.timedelta(weeks=52 * 2)),
        ("3 years ago", datetime.timedelta(weeks=52 * 3)),
    ],
)
def test_parse_last_update_interval(interval, expected_result):
    """Check that singular and plural forms of each supported unit are parsed
    into the expected timedelta."""
    assert parse_last_update_interval(interval) == expected_result
This diff is collapsed.
Click to expand it.
swh/lister/utils.py
+
5
−
0
View file @
8b69cd12
...
...
@@ -2,6 +2,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from
datetime
import
datetime
,
timezone
from
typing
import
Iterator
,
Optional
,
Tuple
import
urllib.parse
...
...
@@ -75,3 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
return
False
return
True
def now() -> datetime:
    """Return the current date and time as a timezone-aware datetime in UTC."""
    utc = timezone.utc
    return datetime.now(utc)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment