From 814a6c8416d56f5f8b3e590d419d5aea7a888ab2 Mon Sep 17 00:00:00 2001
From: Pierre-Yves David <pierre-yves.david@ens-lyon.org>
Date: Tue, 20 Sep 2022 14:26:17 +0200
Subject: [PATCH] from_disk: only build a model object once

Before this change, a Directory object was built just to compute the `id`
that we then fed to the Directory object we built for `to_model`.

We tested this change on a simple run of the Mercurial loader,
with a no-op loader storage:

    swh loader run mercurial https://foss.heptapod.net/mercurial/mercurial-devel directory=/data/repos/mercurial-devel

= Median time of 3 runs =
before: 17 minutes 48 seconds
after:  12 minutes 59 seconds

On a profile of the same run, the `to_model` call of from_disk's `Directory` class took the following percentage:
before: 43%
after:  24%
---
 CONTRIBUTORS           |  1 +
 swh/model/from_disk.py | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 7565e3d6..e69f838f 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -1,3 +1,4 @@
 Daniele Serafini
 Ishan Bhanuka
 Antoine Cezar
+Pierre-Yves David
diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
index 9ef7afa8..4c4fe0ce 100644
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -401,7 +401,7 @@ class Directory(MerkleNode):
     for instance when the client is applying diffs.
     """
 
-    __slots__ = ["__entries"]
+    __slots__ = ["__entries", "__model_object"]
     object_type: Final = "directory"
 
     @classmethod
@@ -447,9 +447,11 @@ class Directory(MerkleNode):
     def __init__(self, data=None):
         super().__init__(data=data)
         self.__entries = None
+        self.__model_object = None
 
     def invalidate_hash(self):
         self.__entries = None
+        self.__model_object = None
         super().invalidate_hash()
 
     @staticmethod
@@ -497,12 +499,14 @@ class Directory(MerkleNode):
         return CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=self.hash)
 
     def compute_hash(self):
-        return model.Directory.from_dict({"entries": self.entries}).id
+        return self.to_model().id
 
     def to_model(self) -> model.Directory:
         """Builds a `model.Directory` object based on this node;
         ignoring its children."""
-        return model.Directory.from_dict(self.get_data())
+        if self.__model_object is None:
+            self.__model_object = model.Directory.from_dict({"entries": self.entries})
+        return self.__model_object
 
     def __getitem__(self, key):
         if not isinstance(key, bytes):
-- 
GitLab