Skip to content
Snippets Groups Projects
Commit 108816f2 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

rubygems: Use gems database dump to improve listing output

Instead of using an undocumented rubygems HTTP endpoint that only
gives us the names of the gems, prefer to exploit the daily PostgreSQL
dump of the rubygems.org database.

It enables listing all gems as well as all versions of a gem and its
release artifacts. For each release artifact, the following info is
extracted: version, download URL, sha256 checksum, release date
plus a couple of extra metadata.

The lister will now set list of artifacts and list of metadata as extra
loader arguments when sending a listed origin to the scheduler database.
A last_update date is also computed which should ensure loading tasks
for rubygems will be scheduled only when new releases are available since
last loadings.

To be noted, the lister will spawn a temporary postgres instance, so this
requires the initdb executable from a postgres server installation to be
available in the execution environment.

Related to T1777
parent c22f41a6
No related branches found
No related tags found
No related merge requests found
......@@ -42,3 +42,9 @@ ignore_missing_imports = True
[mypy-dulwich.*]
ignore_missing_imports = True
[mypy-testing.postgresql.*]
ignore_missing_imports = True
[mypy-psycopg2.*]
ignore_missing_imports = True
......@@ -7,3 +7,5 @@ launchpadlib
tenacity >= 6.2
lxml
dulwich
testing.postgresql
psycopg2
......@@ -3,8 +3,20 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import base64
from datetime import timezone
import gzip
import logging
from typing import Iterator, List, Optional, Text
import os
import shutil
import subprocess
import tarfile
import tempfile
from typing import Any, Dict, Iterator, Optional, Tuple
from bs4 import BeautifulSoup
import psycopg2
from testing.postgresql import Postgresql
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
......@@ -13,18 +25,39 @@ from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
RubyGemsListerPage = Text
RubyGemsListerPage = Dict[str, Any]
class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
"""Lister for RubyGems.org, the Ruby community’s gem hosting service."""
"""Lister for RubyGems.org, the Ruby community's gem hosting service.
Instead of querying rubygems.org Web API, it uses gems data from the
daily PostgreSQL database dump of rubygems. It enables gathering all
interesting info about a gem and its release artifacts (version number,
download URL, checksums, release date) in an efficient way and without
flooding rubygems Web API with numerous HTTP requests (as there are more
than 187000 gems available on 07/10/2022).
"""
LISTER_NAME = "rubygems"
VISIT_TYPE = "rubygems"
INSTANCE = "rubygems"
INDEX_URL = "https://rubygems.org/versions"
RUBY_GEMS_POSTGRES_DUMP_BASE_URL = (
"https://s3-us-west-2.amazonaws.com/rubygems-dumps"
)
RUBY_GEMS_POSTGRES_DUMP_LIST_URL = (
f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql"
)
RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem"
RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}"
RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = (
"https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json"
)
DB_NAME = "rubygems"
DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz"
def __init__(
self,
......@@ -35,41 +68,147 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.INDEX_URL,
url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL,
)
def get_pages(self) -> Iterator[RubyGemsListerPage]:
"""Yield an iterator which returns 'page'
It uses the index file located at `https://rubygems.org/versions`
to get a list of package names. Each page returns an origin url based on
the following pattern::
https://rubygems.org/gems/{pkgname}
def get_latest_dump_file(self) -> str:
    """Return the bucket key of the latest rubygems PostgreSQL dump.

    The dump listing (an XML document made of ``Contents`` elements, each
    holding a ``Key`` child) is fetched from
    ``RUBY_GEMS_POSTGRES_DUMP_LIST_URL`` and the key of the last listed
    entry is returned.

    Raises:
        ValueError: if the listing does not contain any dump key.
    """
    response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
    xml = BeautifulSoup(response.content, "xml")
    # NOTE(review): relies on the listing being ordered oldest to newest and
    # not truncated (see IsTruncated in the response) -- confirm against the
    # bucket listing semantics.
    keys = [
        key.text
        for key in (contents.find("Key") for contents in xml.find_all("Contents"))
        if key is not None
    ]
    if not keys:
        raise ValueError(
            "No rubygems database dump found at "
            f"{self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL}"
        )
    return keys[-1]
def create_rubygems_db(
    self, postgresql: Postgresql
) -> Tuple[str, psycopg2._psycopg.connection]:
    """Create the empty rubygems database in a temporary postgres instance.

    Args:
        postgresql: the running temporary postgres instance

    Returns:
        A tuple ``(db_url, db)`` where ``db_url`` is the connection URL of
        the newly created ``DB_NAME`` database and ``db`` is an open
        autocommit connection to it, with the ``hstore`` extension enabled.
        The caller is responsible for closing ``db``.
    """
    logger.debug("Creating rubygems database")

    db_dsn = postgresql.dsn()
    # NOTE(review): plain substring replacement, assumes the default database
    # name does not appear anywhere else in the URL -- confirm.
    db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)

    # First connect to the default database only to issue CREATE DATABASE.
    # Autocommit is required as that statement cannot be executed inside a
    # transaction block. This connection is closed once done instead of being
    # leaked when the name is rebound below.
    admin_db = psycopg2.connect(**db_dsn)
    try:
        admin_db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        with admin_db.cursor() as cursor:
            cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
    finally:
        admin_db.close()

    # Then connect to the freshly created database and prepare it.
    db_dsn["database"] = self.DB_NAME
    db = psycopg2.connect(**db_dsn)
    try:
        db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        with db.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")
    except Exception:
        # do not leak the connection if the database setup fails
        db.close()
        raise

    return db_url, db
def populate_rubygems_db(self, db_url: str):
    """Download the latest rubygems dump and load it into the database.

    The dump tarball is downloaded into a temporary directory, the gzipped
    SQL file located at ``DUMP_SQL_PATH`` is extracted from it and its
    decompressed content is piped to the ``psql`` command line client.

    Args:
        db_url: connection URL of the database to populate

    Raises:
        FileNotFoundError: if the tarball does not contain ``DUMP_SQL_PATH``
        subprocess.CalledProcessError: if ``psql`` exits with a non-zero
            status
    """
    dump_file = self.get_latest_dump_file()
    dump_id = dump_file.split("/")[2]

    response = self.http_request(f"{self.url}/{dump_file}", stream=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        logger.debug(
            "Downloading latest rubygems database dump: %s (%s bytes)",
            dump_id,
            # the header can be missing (e.g. chunked transfer encoding)
            response.headers.get("content-length", "unknown"),
        )
        dump_path = os.path.join(temp_dir, "rubygems_dump.tar")
        with open(dump_path, "wb") as dump:
            # the dump weighs hundreds of megabytes, use large chunks
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                dump.write(chunk)

        with tarfile.open(dump_path) as dump_tar:
            # Only the SQL dump is needed: extracting that single regular
            # file avoids unpacking unexpected members of a downloaded
            # archive outside of the temporary directory.
            sql_members = [
                member
                for member in dump_tar.getmembers()
                if member.isfile()
                and os.path.normpath(member.name) == self.DUMP_SQL_PATH
            ]
            dump_tar.extractall(temp_dir, members=sql_members)

        logger.debug("Populating rubygems database with dump %s", dump_id)
        psql_cmd = ["psql", "-q", db_url]
        # The context manager closes the pipe and waits for psql termination
        # even when copying the SQL data fails.
        with subprocess.Popen(psql_cmd, stdin=subprocess.PIPE) as psql:
            # passing value of gzip.open as stdin of subprocess.run makes the
            # process read raw data instead of decompressed data so we have to
            # use a pipe
            with gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql:
                shutil.copyfileobj(sql, psql.stdin)  # type: ignore
            # denote end of read file
            psql.stdin.close()  # type: ignore

        if psql.returncode != 0:
            raise subprocess.CalledProcessError(psql.returncode, psql_cmd)
"""
package_names: List[str] = []
response = self.http_request(url=self.url)
data = response.content.decode()
# remove the first 3 lines (file headers + first package named '-')
for line in data.splitlines()[3:]:
package_names.append(line.split(" ")[0])
# Remove duplicates
package_names_set: List[str] = list(set(package_names))
for pkgname in package_names_set:
yield f"https://rubygems.org/gems/{pkgname}"
def get_pages(self) -> Iterator[RubyGemsListerPage]:
    """Yield one page per gem having at least one version.

    A temporary postgres instance is spawned (the ``initdb`` executable is
    required in the execution environment) and populated with the latest
    rubygems database dump, then the ``rubygems`` and ``versions`` tables
    are queried.

    Each yielded page is a dict with the following keys:

    - ``name``: name of the gem
    - ``versions``: list of dicts, one per row of the ``versions`` table
      related to the gem, with keys ``number``, ``url`` (download URL of
      the release artifact), ``date`` (``built_at`` value flagged as UTC),
      ``authors``, ``sha256`` (hexadecimal digest decoded from the base64
      value stored in database, or ``None``) and ``size``
    """
    # spawn a temporary postgres instance (require initdb executable in environment)
    with Postgresql() as postgresql:
        db_url, db = self.create_rubygems_db(postgresql)
        try:
            self.populate_rubygems_db(db_url)
            with db.cursor() as cursor:
                cursor.execute("SELECT id, name from rubygems")
                for gem_id, gem_name in cursor.fetchall():
                    # log the whole name: indexing it would only log its
                    # second character and fail on one-character names
                    logger.debug("Processing gem named %s", gem_name)
                    with db.cursor() as cursor_v:
                        cursor_v.execute(
                            "SELECT authors, built_at, number, sha256, size from versions "
                            "where rubygem_id = %s",
                            (gem_id,),
                        )
                        versions = [
                            {
                                "number": number,
                                "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format(
                                    gem=gem_name, version=number
                                ),
                                "date": built_at.replace(tzinfo=timezone.utc),
                                "authors": authors,
                                "sha256": (
                                    base64.decodebytes(sha256.encode()).hex()
                                    if sha256
                                    else None
                                ),
                                "size": size,
                            }
                            for authors, built_at, number, sha256, size in cursor_v.fetchall()
                        ]
                    # gems without any version have nothing to load
                    if versions:
                        yield {
                            "name": gem_name,
                            "versions": versions,
                        }
        finally:
            # release the connection before the temporary instance is stopped
            db.close()
def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
    """Convert a page, describing a gem and its versions, into a listed origin.

    Args:
        page: dict with a ``name`` key (gem name) and a ``versions`` key
            (list of dicts with keys ``number``, ``url``, ``date``,
            ``authors``, ``sha256`` and ``size``)

    Yields:
        A single ListedOrigin whose URL is derived from the gem name, whose
        last update date is the most recent version date and whose extra
        loader arguments hold the ``artifacts`` and ``rubygem_metadata``
        lists (one entry per version in both of them).

    Raises:
        ValueError: if the page has no versions (no last update date can be
            computed)
    """
    assert self.lister_obj.id is not None

    artifacts = []
    rubygem_metadata = []
    for version in page["versions"]:
        artifacts.append(
            {
                "version": version["number"],
                # last component of the download URL
                "filename": version["url"].split("/")[-1],
                "url": version["url"],
                "checksums": (
                    {"sha256": version["sha256"]} if version["sha256"] else {}
                ),
                "length": version["size"],
            }
        )
        rubygem_metadata.append(
            {
                "version": version["number"],
                "date": version["date"].isoformat(),
                "authors": version["authors"],
                "extrinsic_metadata_url": (
                    self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format(
                        gem=page["name"], version=version["number"]
                    )
                ),
            }
        )

    # url and last_update are passed only once: the page is a dict, not an
    # origin URL, and the last update date is known from the gem versions
    yield ListedOrigin(
        lister_id=self.lister_obj.id,
        visit_type=self.VISIT_TYPE,
        url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]),
        last_update=max(version["date"] for version in page["versions"]),
        extra_loader_arguments={
            "artifacts": artifacts,
            "rubygem_metadata": rubygem_metadata,
        },
    )
created_at: 2022-09-01T00:00:05Z
---
- 1 05d0116933ba44b0b5d0ee19bfd35ccc
mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260
mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22
mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd
<?xml version="1.0" encoding="UTF-8"?>
<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
<Name>rubygems-dumps</Name>
<Prefix>production/public_postgresql</Prefix>
<Marker></Marker>
<MaxKeys>1000</MaxKeys>
<IsTruncated>false</IsTruncated>
<Contents>
<Key>production/public_postgresql/2022.10.05.06.10.11/public_postgresql.tar</Key>
<LastModified>2022-10-05T06:11:15.000Z</LastModified>
<ETag>&quot;d1c447a2a490225c2d59061e60ed86e9-75&quot;</ETag>
<Size>391653888</Size>
<StorageClass>STANDARD</StorageClass>
</Contents>
<Contents>
<Key>production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar</Key>
<LastModified>2022-10-06T06:11:11.000Z</LastModified>
<ETag>&quot;2ccd9340e4f802ec982e4cd00db2d168-75&quot;</ETag>
<Size>390047744</Size>
<StorageClass>STANDARD</StorageClass>
</Contents>
</ListBucketResult>
\ No newline at end of file
File added
#!/bin/bash

# This script requires a PostgreSQL server running on the host.
# It generates the rubygems_pgsql_dump.tar file used as tests data,
# which contains a very small subset of gems for testing purposes.

# abort on the first failing step instead of producing a bogus dump
set -euo pipefail

cd /tmp

# download rubygems load-pg-dump utility script
# (-f: fail on HTTP errors instead of saving the error page as the script)
curl -f -O https://raw.githubusercontent.com/rubygems/rubygems.org/1c8cf7e079e56f709e7fc8f4b2398637e41815f2/script/load-pg-dump
# curl does not set the executable bit on downloaded files
chmod +x load-pg-dump

# download latest rubygems pgsql dump and load rubygems db in local pgsql server
./load-pg-dump -c rubygems_dump.tar

# remove all rows in the rubygems db not related to gem haar_joke or l33tify
# those gems have few releases so that is why they have been picked
# also drop tables not needed by the rubygems lister
# (the quoted heredoc is fed verbatim to psql, no shell expansion involved)
psql rubygems <<'EOF'
with t as (
  select id from rubygems where name = 'haar_joke'
),
t2 as (
  select id from rubygems where name = 'l33tify'
) delete from versions where rubygem_id != (select id from t) and rubygem_id != (select id from t2);

delete from rubygems where name != 'haar_joke' and name != 'l33tify';

drop table dependencies;
drop table gem_downloads;
drop table linksets;
EOF

# create the rubygems_pgsql_dump.tar file
mkdir -p public_postgresql/databases
pg_dump rubygems | gzip -c > public_postgresql/databases/PostgreSQL.sql.gz
tar -cvf rubygems_pgsql_dump.tar public_postgresql
......@@ -2,26 +2,153 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# flake8: noqa: B950
from pathlib import Path
import iso8601
import pytest
from swh.lister.rubygems.lister import RubyGemsLister
from swh.scheduler.model import ListedOrigin
DUMP_FILEPATH = "production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar"
expected_origins = [
"https://rubygems.org/gems/mercurial-ruby",
"https://rubygems.org/gems/mercurial-wrapper",
"https://rubygems.org/gems/mercurius",
]
@pytest.fixture
def expected_listed_origins():
    """Listed origins expected after listing the gems of the test dump.

    One dict per gem (haar_joke then l33tify) with keys ``url``,
    ``visit_type``, ``last_update`` and ``extra_loader_arguments``.
    """

    def listed_origin(gem, authors, length, last_update, releases):
        # releases: (version, release date, sha256) tuples, in expected order
        artifacts = []
        rubygem_metadata = []
        for version, date, sha256 in releases:
            filename = f"{gem}-{version}.gem"
            artifacts.append(
                {
                    "url": f"https://rubygems.org/downloads/{filename}",
                    "length": length,
                    "version": version,
                    "filename": filename,
                    "checksums": {"sha256": sha256},
                }
            )
            rubygem_metadata.append(
                {
                    "date": date,
                    "authors": authors,
                    "version": version,
                    "extrinsic_metadata_url": (
                        f"https://rubygems.org/api/v2/rubygems/{gem}"
                        f"/versions/{version}.json"
                    ),
                }
            )
        return {
            "url": f"https://rubygems.org/gems/{gem}",
            "visit_type": "rubygems",
            "last_update": iso8601.parse_date(last_update),
            "extra_loader_arguments": {
                "artifacts": artifacts,
                "rubygem_metadata": rubygem_metadata,
            },
        }

    haar_joke = listed_origin(
        gem="haar_joke",
        authors="Gemma Gotch",
        length=8704,
        last_update="2016-11-05T00:00:00+00:00",
        releases=[
            (
                "0.0.2",
                "2016-11-05T00:00:00+00:00",
                "85a8cf5f41890e9605265eeebfe9e99aa0350a01a3c799f9f55a0615a31a2f5f",
            ),
            (
                "0.0.1",
                "2016-07-23T00:00:00+00:00",
                "a2ee7052fb8ffcfc4ec0fdb77fae9a36e473f859af196a36870a0f386b5ab55e",
            ),
        ],
    )

    l33tify = listed_origin(
        gem="l33tify",
        authors="E Alexander Liedtke",
        length=6144,
        last_update="2014-11-14T00:00:00+00:00",
        releases=[
            (
                "0.0.2",
                "2014-11-14T00:00:00+00:00",
                "0087a21fb6161bba8892df40de3b5e27404f941658084413b8fde49db2bc7c9f",
            ),
            (
                "0.0.3",
                "2014-11-14T00:00:00+00:00",
                "4502097ddf2657d561ce0f527ef1f49f1658c8a0968ab8cc853273138f8382a2",
            ),
            (
                "0.0.1",
                "2014-11-14T00:00:00+00:00",
                "5abfb737ce5cf561726f2f7cc1ba0f0e4f865f8b7283192e05eb3f246d3dbbca",
            ),
        ],
    )

    return [haar_joke, l33tify]
def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler):
@pytest.fixture(autouse=True)
def network_requests_mock(datadir, requests_mock):
    """Serve the dump listing and the dump tarball from the tests data directory."""
    data_root = Path(datadir)

    # XML listing of the available database dumps
    dumps_listing = (data_root / "rubygems_dumps.xml").read_bytes()
    requests_mock.get(
        RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_LIST_URL,
        content=dumps_listing,
    )

    # dump tarball, served with an explicit content-length header
    dump_tarball = (data_root / "rubygems_pgsql_dump.tar").read_bytes()
    dump_url = f"{RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_BASE_URL}/{DUMP_FILEPATH}"
    requests_mock.get(
        dump_url,
        content=dump_tarball,
        headers={"content-length": str(len(dump_tarball))},
    )
@pytest.mark.db
def test_rubygems_lister(swh_scheduler, expected_listed_origins):
    """Check the origins recorded in the scheduler after a lister run.

    The test dump contains two gems, so two pages and two origins are
    expected, each origin carrying the artifacts and metadata of its gem.
    """
    lister = RubyGemsLister(scheduler=swh_scheduler)
    res = lister.run()

    # one page and one origin per gem of the test dump
    assert res.pages == 2
    assert res.origins == 2

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert [
        {
            "url": origin.url,
            "visit_type": origin.visit_type,
            "last_update": origin.last_update,
            "extra_loader_arguments": origin.extra_loader_arguments,
        }
        for origin in scheduler_origins
    ] == expected_listed_origins
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment