Skip to content
Snippets Groups Projects
Commit 33a9cd4e authored by Stefano Zacchiroli
Browse files

DB import: skip invalid SWHIDs during import

This makes DB import much more robust, at the price of ~15% performance
impact (which is still very much bearable, in light of recent DB import
improvements).  If needed, this cost can be mitigated in the future by
pipelining SWHID validation with SQLite insertion, using either a thread or
asyncio.

Closes T2813
parent fe844030
No related branches found
No related tags found
No related merge requests found
......@@ -11,7 +11,9 @@ SWHIDs can be added directly from an input file.
"""
from io import TextIOWrapper
import logging
from pathlib import Path
import re
import sqlite3
from typing import Iterable
......@@ -19,6 +21,10 @@ from swh.core.utils import grouper
from .exceptions import DBError
# XXX: copied and simplified from swh.model.identifiers (WIP); replace this with
# swh.model.identifiers.SWHID_RE once it has landed there.
#
# Accepts the literal prefix "swh:1:", then one object-type tag (ori, snp, rel,
# rev, dir or cnt), a colon, and exactly 40 lowercase hexadecimal digits.
# The pattern is anchored at both ends; note that "$" also matches just before
# a single trailing newline.
SWHID_RE = re.compile("^swh:1:(ori|snp|rel|rev|dir|cnt):[0-9a-f]{40}$")
class Db:
"""Local database interface"""
......@@ -45,6 +51,17 @@ class Db:
[(swhid_chunk,) for swhid_chunk in swhids_chunk],
)
@staticmethod
def iter_swhids(lines: Iterable[str]) -> Iterable[str]:
    """Yield the valid SWHIDs found in ``lines``, skipping invalid ones.

    Each input line is stripped of trailing whitespace (including the line
    terminator) and checked against ``SWHID_RE``. Matching values are
    yielded in input order; non-matching ones are reported through
    ``logging.error`` together with their 1-based line number and dropped.

    Args:
        lines: candidate SWHIDs, one per line (e.g. an open text file).

    Yields:
        The stripped lines that match ``SWHID_RE``.
    """
    # enumerate() keeps the line number in step with the iteration instead of
    # maintaining a separate counter by hand; numbering starts at 1 so that
    # error messages match what a text editor would show.
    for lineno, line in enumerate(lines, start=1):
        swhid = line.rstrip()
        if SWHID_RE.match(swhid):
            yield swhid
        else:
            logging.error("ignoring invalid SWHID on line %d: %s", lineno, swhid)
def create_from(
self, input_file: TextIOWrapper, chunk_size: int, cur: sqlite3.Cursor
):
......@@ -53,7 +70,7 @@ class Db:
self.create_table(cur)
cur.execute("PRAGMA synchronous = OFF")
cur.execute("PRAGMA journal_mode = OFF")
self.add((line.rstrip() for line in input_file), chunk_size, cur)
self.add(self.iter_swhids(input_file), chunk_size, cur)
cur.close()
self.conn.commit()
except sqlite3.Error as e:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment