From 7fa2013235048aacd55f006ea59b310f395c82b1 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Fri, 1 Feb 2019 11:04:13 +0100
Subject: [PATCH] Replace `entry_to_bytes` with psycopg2's typecast

`entry_to_bytes` and its friends were called many times (e.g.
entry_to_bytes alone was called 40k times while indexing 500
origins with the metadata indexer), and its use of `isinstance`
used a non-negligible amount of CPU time.

Instead of using the `*_to_bytes` functions as post-processing on
all bits of data returned by postgresql, this patch tells psycopg2
to use a new `typecast_bytea` function when needed (in `adapt_conn`).
This function defers the decoding work to psycopg2, which returns a
`memoryview` that is then converted to `bytes`.
---
 swh/core/db/__init__.py | 28 +++++++++++++++++++++++++++-
 swh/core/db/db_utils.py | 28 ----------------------------
 2 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/swh/core/db/__init__.py b/swh/core/db/__init__.py
index ad64b8bf..cab7ddb4 100644
--- a/swh/core/db/__init__.py
+++ b/swh/core/db/__init__.py
@@ -52,6 +52,12 @@ def escape(data):
         return str(data)
 
 
+def typecast_bytea(value, cur):
+    if value is not None:
+        data = psycopg2.BINARY(value, cur)
+        return data.tobytes()
+
+
 class BaseDb:
     """Base class for swh.*.*Db.
 
@@ -59,6 +65,23 @@ class BaseDb:
 
     """
 
+    @classmethod
+    def adapt_conn(cls, conn):
+        """Makes psycopg2 use 'bytes' to decode bytea instead of
+        'memoryview', for this connection."""
+        cur = conn.cursor()
+        cur.execute("SELECT null::bytea, null::bytea[]")
+        bytea_oid = cur.description[0][1]
+        bytea_array_oid = cur.description[1][1]
+
+        t_bytes = psycopg2.extensions.new_type(
+            (bytea_oid,), "bytea", typecast_bytea)
+        psycopg2.extensions.register_type(t_bytes, conn)
+
+        t_bytes_array = psycopg2.extensions.new_array_type(
+            (bytea_array_oid,), "bytea[]", t_bytes)
+        psycopg2.extensions.register_type(t_bytes_array, conn)
+
     @classmethod
     def connect(cls, *args, **kwargs):
         """factory method to create a DB proxy
@@ -71,11 +94,14 @@ class BaseDb:
 
         """
         conn = psycopg2.connect(*args, **kwargs)
+        cls.adapt_conn(conn)
         return cls(conn)
 
     @classmethod
     def from_pool(cls, pool):
-        return cls(pool.getconn(), pool=pool)
+        conn = pool.getconn()
+        cls.adapt_conn(conn)
+        return cls(conn, pool=pool)
 
     def __init__(self, conn, pool=None):
         """create a DB proxy
diff --git a/swh/core/db/db_utils.py b/swh/core/db/db_utils.py
index 41fbdd72..451fb584 100644
--- a/swh/core/db/db_utils.py
+++ b/swh/core/db/db_utils.py
@@ -44,34 +44,6 @@ def jsonize(value):
     return value
 
 
-def entry_to_bytes(entry):
-    """Convert an entry coming from the database to bytes"""
-    if isinstance(entry, memoryview):
-        return entry.tobytes()
-    if isinstance(entry, list):
-        return [entry_to_bytes(value) for value in entry]
-    return entry
-
-
-def line_to_bytes(line):
-    """Convert a line coming from the database to bytes"""
-    if not line:
-        return line
-    if isinstance(line, dict):
-        return {k: entry_to_bytes(v) for k, v in line.items()}
-    return line.__class__(entry_to_bytes(entry) for entry in line)
-
-
-def cursor_to_bytes(cursor):
-    """Yield all the data from a cursor as bytes"""
-    yield from (line_to_bytes(line) for line in cursor)
-
-
-def execute_values_to_bytes(*args, **kwargs):
-    for line in execute_values_generator(*args, **kwargs):
-        yield line_to_bytes(line)
-
-
 def _paginate(seq, page_size):
     """Consume an iterable and return it in chunks.
     Every chunk is at most `page_size`. Never return an empty chunk.
-- 
GitLab