From a01a82fc755a6a41c2789c31f9cd6dd8655e9951 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Fri, 16 Dec 2022 15:39:58 +0100
Subject: [PATCH] luigi.UploadExportToS3: Skip upload of already-uploaded files

---
 swh/dataset/luigi.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/swh/dataset/luigi.py b/swh/dataset/luigi.py
index b93f8d9..87b0370 100644
--- a/swh/dataset/luigi.py
+++ b/swh/dataset/luigi.py
@@ -416,9 +416,16 @@ class UploadExportToS3(luigi.Task):
                     list(os.listdir(local_dir)),
                     desc=status_message,
                 ):
-                    client.put_multipart(
-                        local_dir / file_, f"{s3_dir}/{file_}", ACL="public-read"
-                    )
+                    local_path = local_dir / file_
+                    s3_path = f"{s3_dir}/{file_}"
+                    obj_summary = client.get_key(s3_path)
+                    if (
+                        obj_summary is not None
+                        and obj_summary.size == local_path.stat().st_size
+                    ):
+                        # same size: already uploaded (probably by an interrupted run)
+                        continue
+                    client.put_multipart(local_path, s3_path, ACL="public-read")
 
         client.put(
             self.local_export_path / "meta" / "export.json",
-- 
GitLab