Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
S
swh-lister
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Platform
Development
swh-lister
Commits
6618cf34
Commit
6618cf34
authored
10 months ago
by
Antoine Lambert
Browse files
Options
Downloads
Patches
Plain Diff
Move tarball validation functions from nixguix to utils
parent
c0dc8edb
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!528
Add save-bulk lister to check origins prior their insertion in database
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
swh/lister/nixguix/lister.py
+8
-185
8 additions, 185 deletions
swh/lister/nixguix/lister.py
swh/lister/nixguix/tests/test_lister.py
+6
-4
6 additions, 4 deletions
swh/lister/nixguix/tests/test_lister.py
swh/lister/utils.py
+202
-4
202 additions, 4 deletions
swh/lister/utils.py
with
216 additions
and
193 deletions
swh/lister/nixguix/lister.py
+
8
−
185
View file @
6618cf34
# Copyright (C) 2020-202
3
The Software Heritage developers
# Copyright (C) 2020-202
4
The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
...
...
@@ -20,18 +20,21 @@ import binascii
from
dataclasses
import
dataclass
from
enum
import
Enum
import
logging
from
pathlib
import
Path
import
random
import
re
from
typing
import
Any
,
Dict
,
Iterator
,
List
,
Optional
,
Tuple
,
Union
from
urllib.parse
import
parse_qsl
,
urlparse
from
urllib.parse
import
urlparse
import
requests
from
requests.exceptions
import
ConnectionError
,
InvalidSchema
,
SSLError
from
swh.core.tarball
import
MIMETYPE_TO_ARCHIVE_FORMAT
from
swh.lister
import
TARBALL_EXTENSIONS
from
swh.lister.pattern
import
CredentialsType
,
StatelessLister
from
swh.lister.utils
import
(
ArtifactNatureMistyped
,
ArtifactNatureUndetected
,
is_tarball
,
url_contains_tarball_filename
,
)
from
swh.scheduler.model
import
ListedOrigin
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -52,29 +55,6 @@ DEFAULT_EXTENSIONS_TO_IGNORE = [
]
class ArtifactNatureUndetected(ValueError):
    """Signal that it could not be established whether a remote artifact is a
    tarball or a plain file."""
class ArtifactNatureMistyped(ValueError):
    """Signal that a remote artifact is neither a tarball nor a plain file.

    Such errors most likely stem from a misconfiguration in the manifest
    generation, where a VCS repository was given the wrong type.
    """
class ArtifactWithoutExtension(ValueError):
    """Signal that the nature of an artifact cannot be told from its name alone."""
class
ChecksumLayout
(
Enum
):
"""
The possible artifact types listed out of the manifest.
"""
...
...
@@ -147,163 +127,6 @@ POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
# Recognizes a name starting with a dotted version number such as "0.1.5" or "v1.2".
# url_contains_tarball_filename applies it with ``.match()`` to the last path
# component, to spot names whose apparent suffix is only part of a version.
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
def url_contains_tarball_filename(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Tell whether a parsed url references a file with a tarball extension.

    Every component of the url path, and of each query parameter value, is checked
    against ``extensions``. The edge case of a name made only of a version number
    (so with no real extension in the end) is accounted for.

    Args:
        urlparsed: result of ``urllib.parse.urlparse``; only its ``path`` and
            ``query`` attributes are read.
        extensions: filename endings (e.g. ``".tar.gz"``) identifying a tarball.
        raise_when_no_extension: when True (the default), raise instead of
            returning False if the name gives no usable extension.

    Returns:
        True when some component ends with one of ``extensions``, False otherwise.

    Raises:
        ArtifactWithoutExtension: when ``raise_when_no_extension`` is True and
            either none of the checked values has a suffix, or the last path
            component starts with a version number (e.g. ``0.1.5``).
    """
    # str.endswith needs a tuple; build it once rather than per path component.
    suffixes = tuple(extensions)

    # The url path comes first, followed by every query parameter value.
    candidates = [Path(urlparsed.path)]
    candidates.extend(Path(value) for _, value in parse_qsl(urlparsed.query))

    if any(part.endswith(suffixes) for path in candidates for part in path.parts):
        return True

    if raise_when_no_extension and all(path.suffix == "" for path in candidates):
        raise ArtifactWithoutExtension

    # Some false negative can happen (e.g. https://<netloc>/path/0.1.5), where the
    # trailing version digits look like a suffix, so make sure to catch those.
    name = Path(urlparsed.path).name
    if not PATTERN_VERSION.match(name):
        return False

    if raise_when_no_extension:
        raise ArtifactWithoutExtension

    return False
def is_tarball(
    urls: List[str],
    request: Optional[Any] = None,
) -> Tuple[bool, str]:
    """Determine whether the artifact behind a list of urls is a tarball or a file.

    The urls are tried in order and the first one allowing a decision wins. The
    filename extension found in the url is checked first. When the url has no usable
    extension and ``request`` is provided, an HTTP ``HEAD`` query is issued and the
    ``Location``, ``Content-Type`` and ``Content-Disposition`` response headers are
    inspected, in that order.

    Args:
        urls: urls of the remote artifact to check.
        request: (Optional) object exposing a ``head(url)`` method, used for the
            HTTP fallback. When not provided, only the url-based check is done.

    Raises:
        ArtifactNatureUndetected: when ``urls`` is empty or when none of the urls
            allowed detecting the artifact's nature.
        ArtifactNatureMistyped: when a url (or a ``Location`` header value) has a
            scheme other than http, https or ftp. It's up to the caller to do
            what's right with it.

    Returns:
        A tuple ``(is_tarball, url)``. The boolean tells whether the artifact is an
        archive. The url is the one queried when the decision came from a
        ``Location`` header, and the first of ``urls`` otherwise.
    """

    def _is_tarball(url):
        """Determine out of an extension whether url is a tarball.

        Raises:
            ArtifactWithoutExtension in case no extension is available
            ArtifactNatureMistyped in case of an unsupported url scheme
        """
        urlparsed = urlparse(url)
        if urlparsed.scheme not in ("http", "https", "ftp"):
            raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
        return url_contains_tarball_filename(urlparsed, TARBALL_EXTENSIONS)

    def _undetected(url):
        """Build the error reported when url does not allow any detection."""
        return ArtifactNatureUndetected(
            f"Cannot determine artifact type from url <{url}>"
        )

    if not urls:
        # Without this guard the final error message would fail on urls[0] with an
        # IndexError instead of the documented exception.
        raise ArtifactNatureUndetected(
            "Cannot determine artifact type without any url"
        )

    origin = urls[0]

    # Check all urls and as soon as an url allows the nature detection, this stops.
    exceptions_to_raise = []
    for url in urls:
        try:
            return _is_tarball(url), origin
        except ArtifactWithoutExtension:
            if request is None:
                exceptions_to_raise.append(_undetected(url))
                continue

            logger.warning(
                "Cannot detect extension for <%s>. Fallback to http head query",
                url,
            )

            try:
                response = request.head(url)
            except (InvalidSchema, SSLError, ConnectionError):
                exceptions_to_raise.append(_undetected(url))
                continue

            if not response.ok or response.status_code == 404:
                exceptions_to_raise.append(_undetected(url))
                continue

            location = response.headers.get("Location")
            if location:  # It's not always present
                logger.debug("Location: %s", location)
                try:
                    return _is_tarball(location), url
                except ArtifactWithoutExtension:
                    logger.warning(
                        "Still cannot detect extension through location <%s>...",
                        url,
                    )

            content_type = response.headers.get("Content-Type")
            if content_type:
                logger.debug("Content-Type: %s", content_type)
                if content_type == "application/json":
                    return False, origin
                return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin

            content_disposition = response.headers.get("Content-Disposition")
            if content_disposition:
                logger.debug("Content-Disposition: %s", content_disposition)
                if "filename=" in content_disposition:
                    filename = ""
                    # Parameters are separated by ";", with or without a space.
                    for field in content_disposition.split(";"):
                        _, found, value = field.partition("filename=")
                        if found:
                            # The value is commonly a quoted string; drop the
                            # surrounding blanks and quotes, otherwise the
                            # extension check never matches.
                            filename = value.strip().strip("\"'")
                            break

                    return (
                        url_contains_tarball_filename(
                            urlparse(filename),
                            TARBALL_EXTENSIONS,
                            raise_when_no_extension=False,
                        ),
                        origin,
                    )

    if exceptions_to_raise:
        # Only the first recorded failure is reported to the caller.
        raise exceptions_to_raise[0]
    raise _undetected(origin)
VCS_KEYS_MAPPING
=
{
"
git
"
:
{
"
ref
"
:
"
git_ref
"
,
...
...
This diff is collapsed.
Click to expand it.
swh/lister/nixguix/tests/test_lister.py
+
6
−
4
View file @
6618cf34
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022
-2024
The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
...
...
@@ -19,14 +19,16 @@ from swh.lister.nixguix.lister import (
DEFAULT_EXTENSIONS_TO_IGNORE
,
POSSIBLE_TARBALL_MIMETYPES
,
VCS_ARTIFACT_TYPE_TO_VISIT_TYPE
,
ArtifactNatureMistyped
,
ArtifactNatureUndetected
,
ArtifactWithoutExtension
,
NixGuixLister
,
is_tarball
,
url_contains_tarball_filename
,
)
from
swh.lister.pattern
import
ListerStats
from
swh.lister.utils
import
(
ArtifactNatureMistyped
,
ArtifactNatureUndetected
,
ArtifactWithoutExtension
,
)
logger
=
logging
.
getLogger
(
__name__
)
...
...
This diff is collapsed.
Click to expand it.
swh/lister/utils.py
+
202
−
4
View file @
6618cf34
# Copyright (C) 2018-202
3
the Software Heritage developers
# Copyright (C) 2018-202
4
the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from
typing
import
Iterator
,
Optional
,
Tuple
import
urllib.parse
import
logging
from
pathlib
import
Path
import
re
from
typing
import
Any
,
Iterator
,
List
,
Optional
,
Tuple
from
urllib.parse
import
parse_qsl
,
urlparse
from
requests.exceptions
import
ConnectionError
,
InvalidSchema
,
SSLError
from
swh.core.tarball
import
MIMETYPE_TO_ARCHIVE_FORMAT
from
swh.lister
import
TARBALL_EXTENSIONS
logger
=
logging
.
getLogger
(
__name__
)
def
split_range
(
total_pages
:
int
,
nb_pages
:
int
)
->
Iterator
[
Tuple
[
int
,
int
]]:
...
...
@@ -65,7 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
# Empty or None
return
False
parsed
=
urllib
.
parse
.
urlparse
(
url
)
parsed
=
urlparse
(
url
)
if
not
parsed
.
netloc
:
# Is parsed as a relative URL
return
False
...
...
@@ -75,3 +86,190 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
return
False
return
True
class ArtifactNatureUndetected(ValueError):
    """Signal that it could not be established whether a remote artifact is a
    tarball or a plain file."""
class ArtifactNatureMistyped(ValueError):
    """Signal that a remote artifact is neither a tarball nor a plain file.

    Such errors most likely stem from a misconfiguration in the manifest
    generation, where a VCS repository was given the wrong type.
    """
class ArtifactWithoutExtension(ValueError):
    """Signal that the nature of an artifact cannot be told from its name alone."""
# Rough approximation of what we can find of mimetypes for tarballs "out there".
# Kept as a tuple so it can be passed directly to ``str.startswith`` when checking
# a Content-Type header value.
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())

# Recognizes a name starting with a dotted version number such as "0.1.5" or "v1.2".
# url_contains_tarball_filename applies it with ``.match()`` to the last path
# component, to spot names whose apparent suffix is only part of a version.
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
def url_contains_tarball_filename(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Tell whether a parsed url references a file with a tarball extension.

    Every component of the url path, and of each query parameter value, is checked
    against ``extensions``. The edge case of a name made only of a version number
    (so with no real extension in the end) is accounted for.

    Args:
        urlparsed: result of ``urllib.parse.urlparse``; only its ``path`` and
            ``query`` attributes are read.
        extensions: filename endings (e.g. ``".tar.gz"``) identifying a tarball.
        raise_when_no_extension: when True (the default), raise instead of
            returning False if the name gives no usable extension.

    Returns:
        True when some component ends with one of ``extensions``, False otherwise.

    Raises:
        ArtifactWithoutExtension: when ``raise_when_no_extension`` is True and
            either none of the checked values has a suffix, or the last path
            component starts with a version number (e.g. ``0.1.5``).
    """
    # str.endswith needs a tuple; build it once rather than per path component.
    suffixes = tuple(extensions)

    # The url path comes first, followed by every query parameter value.
    candidates = [Path(urlparsed.path)]
    candidates.extend(Path(value) for _, value in parse_qsl(urlparsed.query))

    if any(part.endswith(suffixes) for path in candidates for part in path.parts):
        return True

    if raise_when_no_extension and all(path.suffix == "" for path in candidates):
        raise ArtifactWithoutExtension

    # Some false negative can happen (e.g. https://<netloc>/path/0.1.5), where the
    # trailing version digits look like a suffix, so make sure to catch those.
    name = Path(urlparsed.path).name
    if not PATTERN_VERSION.match(name):
        return False

    if raise_when_no_extension:
        raise ArtifactWithoutExtension

    return False
def is_tarball(
    urls: List[str],
    request: Optional[Any] = None,
) -> Tuple[bool, str]:
    """Determine whether the artifact behind a list of urls is a tarball or a file.

    The urls are tried in order and the first one allowing a decision wins. The
    filename extension found in the url is checked first. When the url has no usable
    extension and ``request`` is provided, an HTTP ``HEAD`` query is issued and the
    ``Location``, ``Content-Type`` and ``Content-Disposition`` response headers are
    inspected, in that order.

    Args:
        urls: urls of the remote artifact to check.
        request: (Optional) object exposing a ``head(url)`` method, used for the
            HTTP fallback. When not provided, only the url-based check is done.

    Raises:
        ArtifactNatureUndetected: when ``urls`` is empty or when none of the urls
            allowed detecting the artifact's nature.
        ArtifactNatureMistyped: when a url (or a ``Location`` header value) has a
            scheme other than http, https or ftp. It's up to the caller to do
            what's right with it.

    Returns:
        A tuple ``(is_tarball, url)``. The boolean tells whether the artifact is an
        archive. The url is the one queried when the decision came from a
        ``Location`` header, and the first of ``urls`` otherwise.
    """

    def _is_tarball(url):
        """Determine out of an extension whether url is a tarball.

        Raises:
            ArtifactWithoutExtension in case no extension is available
            ArtifactNatureMistyped in case of an unsupported url scheme
        """
        urlparsed = urlparse(url)
        if urlparsed.scheme not in ("http", "https", "ftp"):
            raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
        return url_contains_tarball_filename(urlparsed, TARBALL_EXTENSIONS)

    def _undetected(url):
        """Build the error reported when url does not allow any detection."""
        return ArtifactNatureUndetected(
            f"Cannot determine artifact type from url <{url}>"
        )

    if not urls:
        # Without this guard the final error message would fail on urls[0] with an
        # IndexError instead of the documented exception.
        raise ArtifactNatureUndetected(
            "Cannot determine artifact type without any url"
        )

    origin = urls[0]

    # Check all urls and as soon as an url allows the nature detection, this stops.
    exceptions_to_raise = []
    for url in urls:
        try:
            return _is_tarball(url), origin
        except ArtifactWithoutExtension:
            if request is None:
                exceptions_to_raise.append(_undetected(url))
                continue

            logger.warning(
                "Cannot detect extension for <%s>. Fallback to http head query",
                url,
            )

            try:
                response = request.head(url)
            except (InvalidSchema, SSLError, ConnectionError):
                exceptions_to_raise.append(_undetected(url))
                continue

            if not response.ok or response.status_code == 404:
                exceptions_to_raise.append(_undetected(url))
                continue

            location = response.headers.get("Location")
            if location:  # It's not always present
                logger.debug("Location: %s", location)
                try:
                    return _is_tarball(location), url
                except ArtifactWithoutExtension:
                    logger.warning(
                        "Still cannot detect extension through location <%s>...",
                        url,
                    )

            content_type = response.headers.get("Content-Type")
            if content_type:
                logger.debug("Content-Type: %s", content_type)
                if content_type == "application/json":
                    return False, origin
                return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin

            content_disposition = response.headers.get("Content-Disposition")
            if content_disposition:
                logger.debug("Content-Disposition: %s", content_disposition)
                if "filename=" in content_disposition:
                    filename = ""
                    # Parameters are separated by ";", with or without a space.
                    for field in content_disposition.split(";"):
                        _, found, value = field.partition("filename=")
                        if found:
                            # The value is commonly a quoted string; drop the
                            # surrounding blanks and quotes, otherwise the
                            # extension check never matches.
                            filename = value.strip().strip("\"'")
                            break

                    return (
                        url_contains_tarball_filename(
                            urlparse(filename),
                            TARBALL_EXTENSIONS,
                            raise_when_no_extension=False,
                        ),
                        origin,
                    )

    if exceptions_to_raise:
        # Only the first recorded failure is reported to the caller.
        raise exceptions_to_raise[0]
    raise _undetected(origin)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment