From cfe7507a7366c52d92793830f5ce89063a2acca4 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Fri, 14 Oct 2022 11:33:35 +0200
Subject: [PATCH] loader, cvsclient: Read files line by line to reduce memory
 consumption

Instead of using the readlines method on file objects that retrieve all
lines of a file and store them in memory, prefer to read files line
by line by using the lazy generator of lines from file objects.

This significantly reduce loader memory consumption when processing
a large rlog output stored in a file.
---
 swh/loader/cvs/cvsclient.py         | 10 +++++-----
 swh/loader/cvs/loader.py            | 25 ++++++++++++++++++-------
 swh/loader/cvs/tests/test_loader.py |  2 +-
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/swh/loader/cvs/cvsclient.py b/swh/loader/cvs/cvsclient.py
index 4ee7fe1..d05baf5 100644
--- a/swh/loader/cvs/cvsclient.py
+++ b/swh/loader/cvs/cvsclient.py
@@ -11,7 +11,7 @@ import os.path
 import socket
 import subprocess
 import tempfile
-from typing import Tuple
+from typing import IO, Tuple
 
 from tenacity import retry
 from tenacity.retry import retry_if_exception_type
@@ -291,12 +291,12 @@ class CVSClient:
     def __del__(self):
         self.conn_close()
 
-    def _parse_rlog_response(self, fp):
+    def _parse_rlog_response(self, fp: IO[bytes]):
         rlog_output = tempfile.TemporaryFile()
         expect_error = False
-        for line in fp.readlines():
+        for line in fp:
             if expect_error:
-                raise CVSProtocolError("CVS server error: %s" % line)
+                raise CVSProtocolError("CVS server error: %r" % line)
             if line == b"ok\n":
                 break
             elif line[0:2] == b"M ":
@@ -311,7 +311,7 @@ class CVSClient:
                 expect_error = True
                 continue
             else:
-                raise CVSProtocolError("Bad CVS protocol response: %s" % line)
+                raise CVSProtocolError("Bad CVS protocol response: %r" % line)
         rlog_output.seek(0)
         return rlog_output
 
diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py
index f5ce5e3..abaa86e 100644
--- a/swh/loader/cvs/loader.py
+++ b/swh/loader/cvs/loader.py
@@ -13,7 +13,18 @@ import os.path
 import subprocess
 import tempfile
 import time
-from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, cast
+from typing import (
+    Any,
+    BinaryIO,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    TextIO,
+    Tuple,
+    cast,
+)
 from urllib.parse import urlparse
 
 import sentry_sdk
@@ -115,7 +126,7 @@ class CvsLoader(BaseLoader):
         self._visit_status = "full"
         self.visit_date = visit_date or self.visit_date
         self.cvsroot_path = cvsroot_path
-        self.custom_id_keyword = None
+        self.custom_id_keyword: Optional[str] = None
         self.excluded_keywords: List[str] = []
 
         self.snapshot: Optional[Snapshot] = None
@@ -303,7 +314,7 @@ class CvsLoader(BaseLoader):
     def cleanup(self) -> None:
         self.log.debug("cleanup")
 
-    def configure_custom_id_keyword(self, cvsconfig):
+    def configure_custom_id_keyword(self, cvsconfig: TextIO):
         """Parse CVSROOT/config and look for a custom keyword definition.
         There are two different configuration directives in use for this purpose.
 
@@ -322,7 +333,7 @@ class CvsLoader(BaseLoader):
         For example, this disables expansion of the Date and Name keywords:
         KeywordExpand=eDate,Name
         """
-        for line in cvsconfig.readlines():
+        for line in cvsconfig:
             line = line.strip()
             try:
                 (config_key, value) = line.split("=", 1)
@@ -542,11 +553,11 @@ class CvsLoader(BaseLoader):
                 # Combine all the rlog pieces we found and re-parse.
                 fp = tempfile.TemporaryFile()
                 for attic_rlog_file in attic_rlog_files:
-                    for line in attic_rlog_file.readlines():
+                    for line in attic_rlog_file:
                         fp.write(line)
-                        attic_rlog_file.close()
+                    attic_rlog_file.close()
                 main_rlog_file.seek(0)
-                for line in main_rlog_file.readlines():
+                for line in main_rlog_file:
                     fp.write(line)
                 main_rlog_file.close()
                 fp.seek(0)
diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py
index 4c41fdf..c7274f7 100644
--- a/swh/loader/cvs/tests/test_loader.py
+++ b/swh/loader/cvs/tests/test_loader.py
@@ -1175,7 +1175,7 @@ def test_loader_cvs_weird_paths_in_rlog(
     rlog_file_path = rlog_file.name
 
     rlog_weird_paths = open(os.path.join(datadir, rlog_unsafe_path))
-    for line in rlog_weird_paths.readlines():
+    for line in rlog_weird_paths:
         rlog_file.write(line.replace("{cvsroot_path}", os.path.dirname(repo_url[7:])))
     rlog_file.close()
     rlog_file_override = open(rlog_file_path, "rb")  # re-open as bytes instead of str
-- 
GitLab