Skip to content
Snippets Groups Projects
Verified Commit 463a4149 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

arch/lister: Drop artifact size approximation

The approximated size breaks the current loading ingestion, which expects the
artifact size to be an exact value.

Refs. swh/devel/swh-loader-core#4746
parent 2eb32234
No related branches found
No related tags found
No related merge requests found
Pipeline #5154 failed
......@@ -26,41 +26,6 @@ logger = logging.getLogger(__name__)
ArchListerPage = List[Dict[str, Any]]
def size_to_bytes(size: str) -> int:
    """Convert a human readable file size to bytes.

    The resulting value is an approximation, as the input value is in most
    cases rounded. Units are decimal (powers of 1000), not binary.

    Args:
        size: A string representing a human readable file size, optionally
            ending with a single upper-case unit letter among
            ``K, M, G, T, P, E, Z, Y`` (eg: '500', '500K', '1.5M').
            Surrounding whitespace is ignored.

    Returns:
        The file size as an integer number of bytes. A fractional result is
        truncated toward zero.

    Raises:
        ValueError: if the numeric part is neither an integer literal nor a
            plain decimal number.

    Examples::

        >>> size_to_bytes("500")
        500
        >>> size_to_bytes("1K")
        1000
        >>> size_to_bytes("1.5K")
        1500
    """
    # Local import: only the fractional fallback below needs it.
    from fractions import Fraction

    units = {
        "K": 1000,
        "M": 1000**2,
        "G": 1000**3,
        "T": 1000**4,
        "P": 1000**5,
        "E": 1000**6,
        "Z": 1000**7,
        "Y": 1000**8,
    }

    text = size.strip()
    multiplier = 1
    if text.endswith(tuple(units)):
        text, multiplier = text[:-1], units[text[-1]]

    try:
        # Fast path, identical to the historical behavior for integer values.
        return int(text) * multiplier
    except ValueError:
        # Fall back to plain decimal notation only ("1.5", ".5", "-2.25").
        # Anything else (ratios, exponents, garbage) keeps raising the
        # original ValueError.
        if not text.lstrip("+-").replace(".", "", 1).isdigit():
            raise
        # Fraction keeps the arithmetic exact, avoiding float rounding on
        # the large multipliers.
        return int(Fraction(text) * multiplier)
class ArchLister(StatelessLister[ArchListerPage]):
"""List Arch linux origins from 'core', 'extra', and 'community' repositories
......@@ -194,27 +159,24 @@ class ArchLister(StatelessLister[ArchListerPage]):
# Extract last_modified and an approximate file size
raw_text = link.next_sibling
raw_text_rex = re.compile(
r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+(?P<size>\w+)$"
r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
)
s = raw_text_rex.search(raw_text.strip())
if s is None:
logger.error(
"Can not find a match for 'last_modified' and/or "
"'size' in '%(raw_text)s'",
"Can not find a match for 'last_modified' in '%(raw_text)s'",
dict(raw_text=raw_text),
)
else:
assert s.groups()
assert len(s.groups()) == 2
last_modified_str, size = s.groups()
values = s.groups()
assert values and len(values) == 1
last_modified_str = values[0]
# format as expected
last_modified = datetime.datetime.strptime(
last_modified_str, "%d-%b-%Y %H:%M"
).isoformat()
length = size_to_bytes(size) # we want bytes
# link url is relative, format a canonical one
url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
base_url=base_url, pkgname=name, filename=filename
......@@ -228,7 +190,6 @@ class ArchLister(StatelessLister[ArchListerPage]):
filename=filename,
url=url,
last_modified=last_modified,
length=length,
)
)
return versions
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment