Skip to content
Snippets Groups Projects
Commit dd148fc6 authored by Nicolas Dandrimont
Browse files

Guesstimate partition boundaries from extrema rather than using expensive offsets

Summary:
Using ORDER BY with OFFSET makes the partitioning an O(n^2) operation in the
number of entries in the table, whereas deriving the boundaries from min/max
is a near-instant operation.

This assumes the values of the indexable column are more or less uniformly
distributed, which is not exactly true but is not the worst approximation either.

Test Plan: tox

Reviewers: #reviewers, douardda

Reviewed By: #reviewers, douardda

Subscribers: douardda, swh-public-ci

Differential Revision: https://forge.softwareheritage.org/D1267
parents 2a588d2d c574897e
No related branches found
No related tags found
No related merge requests found
......@@ -6,6 +6,7 @@ import abc
import logging
from itertools import count
import dateutil
from sqlalchemy import func
from .lister_transports import SWHListerHttpTransport
......@@ -103,23 +104,52 @@ class SWHIndexingLister(SWHListerBase):
declare approximately equal-sized ranges of existing
repos
"""
n = max(self.db_num_entries(), 10)
partitions = []
n = max(self.db_num_entries(), 10)
partition_size = min(partition_size, n)
prev_index = None
for i in range(0, n-1, partition_size):
# indexable column from the ith row
index = self.db_session.query(self.MODEL.indexable) \
.order_by(self.MODEL.indexable).offset(i).first()
if index:
index = index[0]
if index is not None and prev_index is not None:
partitions.append((prev_index, index))
prev_index = index
partitions.append((prev_index, self.db_last_index()))
return partitions
n_partitions = n // partition_size
min_index = self.db_first_index()
max_index = self.db_last_index()
if not min_index or not max_index:
raise ValueError("Can't partition an empty range")
if isinstance(min_index, str):
def format_bound(bound):
return bound.isoformat()
min_index = dateutil.parser.parse(min_index)
max_index = dateutil.parser.parse(max_index)
else:
def format_bound(bound):
return bound
partition_width = (max_index - min_index) / n_partitions
partitions = [
[
format_bound(min_index + i * partition_width),
format_bound(min_index + (i+1) * partition_width),
] for i in range(n_partitions)
]
# Remove bounds for lowest and highest partition
partitions[0][0] = None
partitions[-1][1] = None
return [tuple(partition) for partition in partitions]
def db_first_index(self):
    """Query the db for the minimum value of the indexable column.

    Returns:
        the smallest indexable value of all repos in the db, or None
        when the query yields no row
    """
    # Ask the database for min(indexable) rather than scanning rows here.
    row = self.db_session.query(func.min(self.MODEL.indexable)).first()
    # The result row holds a single column: the aggregate value.
    return row[0] if row else None
def db_last_index(self):
"""Look in the db for the largest indexable value
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment