From 7fa2013235048aacd55f006ea59b310f395c82b1 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz <vlorentz@softwareheritage.org> Date: Fri, 1 Feb 2019 11:04:13 +0100 Subject: [PATCH] Replace `entry_to_bytes` with psycopg2's typecast `entry_to_bytes` and its friends were called many times (e.g. entry_to_bytes alone was called 40k times while indexing 500 origins with the metadata indexer), and their use of `isinstance` used a non-negligible amount of CPU time. Instead of using `*_to_bytes` functions as post-processing on all bits of data returned by postgresql, this patch tells psycopg2 to use a new `typecast_bytea` function when needed (in `adapt_conn`). This function defers the decoding work to psycopg2, which returns a `memoryview`, which is turned into `bytes`. --- swh/core/db/__init__.py | 28 +++++++++++++++++++++++++++- swh/core/db/db_utils.py | 28 ---------------------------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/swh/core/db/__init__.py b/swh/core/db/__init__.py index ad64b8bf..cab7ddb4 100644 --- a/swh/core/db/__init__.py +++ b/swh/core/db/__init__.py @@ -52,6 +52,12 @@ def escape(data): return str(data) +def typecast_bytea(value, cur): + if value is not None: + data = psycopg2.BINARY(value, cur) + return data.tobytes() + + class BaseDb: """Base class for swh.*.*Db. 
@@ -59,6 +65,23 @@ class BaseDb: """ + @classmethod + def adapt_conn(cls, conn): + """Makes psycopg2 use 'bytes' to decode bytea instead of + 'memoryview', for this connection.""" + cur = conn.cursor() + cur.execute("SELECT null::bytea, null::bytea[]") + bytea_oid = cur.description[0][1] + bytea_array_oid = cur.description[1][1] + + t_bytes = psycopg2.extensions.new_type( + (bytea_oid,), "bytea", typecast_bytea) + psycopg2.extensions.register_type(t_bytes, conn) + + t_bytes_array = psycopg2.extensions.new_array_type( + (bytea_array_oid,), "bytea[]", t_bytes) + psycopg2.extensions.register_type(t_bytes_array, conn) + @classmethod def connect(cls, *args, **kwargs): """factory method to create a DB proxy @@ -71,11 +94,14 @@ class BaseDb: """ conn = psycopg2.connect(*args, **kwargs) + cls.adapt_conn(conn) return cls(conn) @classmethod def from_pool(cls, pool): - return cls(pool.getconn(), pool=pool) + conn = pool.getconn() + cls.adapt_conn(conn) + return cls(conn, pool=pool) def __init__(self, conn, pool=None): """create a DB proxy diff --git a/swh/core/db/db_utils.py b/swh/core/db/db_utils.py index 41fbdd72..451fb584 100644 --- a/swh/core/db/db_utils.py +++ b/swh/core/db/db_utils.py @@ -44,34 +44,6 @@ def jsonize(value): return value -def entry_to_bytes(entry): - """Convert an entry coming from the database to bytes""" - if isinstance(entry, memoryview): - return entry.tobytes() - if isinstance(entry, list): - return [entry_to_bytes(value) for value in entry] - return entry - - -def line_to_bytes(line): - """Convert a line coming from the database to bytes""" - if not line: - return line - if isinstance(line, dict): - return {k: entry_to_bytes(v) for k, v in line.items()} - return line.__class__(entry_to_bytes(entry) for entry in line) - - -def cursor_to_bytes(cursor): - """Yield all the data from a cursor as bytes""" - yield from (line_to_bytes(line) for line in cursor) - - -def execute_values_to_bytes(*args, **kwargs): - for line in execute_values_generator(*args, 
**kwargs): - yield line_to_bytes(line) - - def _paginate(seq, page_size): """Consume an iterable and return it in chunks. Every chunk is at most `page_size`. Never return an empty chunk. -- GitLab