From cfe7507a7366c52d92793830f5ce89063a2acca4 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Fri, 14 Oct 2022 11:33:35 +0200 Subject: [PATCH] loader, cvsclient: Read files line by line to reduce memory consumption Instead of using the readlines method on file objects that retrieve all lines of a file and store them in memory, prefer to read files line by line by using the lazy generator of lines from file objects. This significantly reduce loader memory consumption when processing a large rlog output stored in a file. --- swh/loader/cvs/cvsclient.py | 10 +++++----- swh/loader/cvs/loader.py | 25 ++++++++++++++++++------- swh/loader/cvs/tests/test_loader.py | 2 +- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/swh/loader/cvs/cvsclient.py b/swh/loader/cvs/cvsclient.py index 4ee7fe1..d05baf5 100644 --- a/swh/loader/cvs/cvsclient.py +++ b/swh/loader/cvs/cvsclient.py @@ -11,7 +11,7 @@ import os.path import socket import subprocess import tempfile -from typing import Tuple +from typing import IO, Tuple from tenacity import retry from tenacity.retry import retry_if_exception_type @@ -291,12 +291,12 @@ class CVSClient: def __del__(self): self.conn_close() - def _parse_rlog_response(self, fp): + def _parse_rlog_response(self, fp: IO[bytes]): rlog_output = tempfile.TemporaryFile() expect_error = False - for line in fp.readlines(): + for line in fp: if expect_error: - raise CVSProtocolError("CVS server error: %s" % line) + raise CVSProtocolError("CVS server error: %r" % line) if line == b"ok\n": break elif line[0:2] == b"M ": @@ -311,7 +311,7 @@ class CVSClient: expect_error = True continue else: - raise CVSProtocolError("Bad CVS protocol response: %s" % line) + raise CVSProtocolError("Bad CVS protocol response: %r" % line) rlog_output.seek(0) return rlog_output diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py index f5ce5e3..abaa86e 100644 --- a/swh/loader/cvs/loader.py +++ b/swh/loader/cvs/loader.py @@ -13,7 +13,18 @@ import os.path import subprocess import tempfile import time -from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, cast +from typing import ( + Any, + BinaryIO, + Dict, + Iterator, + List, + Optional, + Sequence, + TextIO, + Tuple, + cast, +) from urllib.parse import urlparse import sentry_sdk @@ -115,7 +126,7 @@ class CvsLoader(BaseLoader): self._visit_status = "full" self.visit_date = visit_date or self.visit_date self.cvsroot_path = cvsroot_path - self.custom_id_keyword = None + self.custom_id_keyword: Optional[str] = None self.excluded_keywords: List[str] = [] self.snapshot: Optional[Snapshot] = None @@ -303,7 +314,7 @@ class CvsLoader(BaseLoader): def cleanup(self) -> None: self.log.debug("cleanup") - def configure_custom_id_keyword(self, cvsconfig): + def configure_custom_id_keyword(self, cvsconfig: TextIO): """Parse CVSROOT/config and look for a custom keyword definition. There are two different configuration directives in use for this purpose. @@ -322,7 +333,7 @@ class CvsLoader(BaseLoader): For example, this disables expansion of the Date and Name keywords: KeywordExpand=eDate,Name """ - for line in cvsconfig.readlines(): + for line in cvsconfig: line = line.strip() try: (config_key, value) = line.split("=", 1) @@ -542,11 +553,11 @@ class CvsLoader(BaseLoader): # Combine all the rlog pieces we found and re-parse. fp = tempfile.TemporaryFile() for attic_rlog_file in attic_rlog_files: - for line in attic_rlog_file.readlines(): + for line in attic_rlog_file: fp.write(line) - attic_rlog_file.close() + attic_rlog_file.close() main_rlog_file.seek(0) - for line in main_rlog_file.readlines(): + for line in main_rlog_file: fp.write(line) main_rlog_file.close() fp.seek(0) diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py index 4c41fdf..c7274f7 100644 --- a/swh/loader/cvs/tests/test_loader.py +++ b/swh/loader/cvs/tests/test_loader.py @@ -1175,7 +1175,7 @@ def test_loader_cvs_weird_paths_in_rlog( rlog_file_path = rlog_file.name rlog_weird_paths = open(os.path.join(datadir, rlog_unsafe_path)) - for line in rlog_weird_paths.readlines(): + for line in rlog_weird_paths: rlog_file.write(line.replace("{cvsroot_path}", os.path.dirname(repo_url[7:]))) rlog_file.close() rlog_file_override = open(rlog_file_path, "rb") # re-open as bytes instead of str -- GitLab