From a01a82fc755a6a41c2789c31f9cd6dd8655e9951 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz <vlorentz@softwareheritage.org> Date: Fri, 16 Dec 2022 15:39:58 +0100 Subject: [PATCH] luigi.UploadExportToS3: Skip upload of already-uploaded files --- swh/dataset/luigi.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/swh/dataset/luigi.py b/swh/dataset/luigi.py index b93f8d9..87b0370 100644 --- a/swh/dataset/luigi.py +++ b/swh/dataset/luigi.py @@ -416,9 +416,16 @@ class UploadExportToS3(luigi.Task): list(os.listdir(local_dir)), desc=status_message, ): - client.put_multipart( - local_dir / file_, f"{s3_dir}/{file_}", ACL="public-read" - ) + local_path = local_dir / file_ + s3_path = f"{s3_dir}/{file_}" + obj_summary = client.get_key(s3_path) + if ( + obj_summary is not None + and obj_summary.size == local_path.stat().st_size + ): + # already uploaded (probably by a previous interrupted run) + continue + client.put_multipart(local_path, s3_path, ACL="public-read") client.put( self.local_export_path / "meta" / "export.json", -- GitLab