diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c95e3d8dd307abb21417e98468a4654b1484014..f972cd9cf2c0ab3a94442a9ae373962f75042954 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,19 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: check-json - id: check-yaml - - repo: https://gitlab.com/pycqa/flake8 - rev: 4.0.1 + - repo: https://github.com/pycqa/flake8 + rev: 5.0.4 hooks: - id: flake8 - additional_dependencies: [flake8-bugbear==22.3.23] + additional_dependencies: [flake8-bugbear==22.9.23] - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.2.2 hooks: - id: codespell name: Check source code spelling @@ -35,6 +35,6 @@ repos: - id: isort - repo: https://github.com/python/black - rev: 22.3.0 + rev: 22.10.0 hooks: - id: black diff --git a/PKG-INFO b/PKG-INFO index 3509a2adabcc536b685d2c0d200bd45985beb1c8..400925af4d20144cd4347aea6414476f6ae62ca2 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 1.10.0 +Version: 1.10.1 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers diff --git a/docs/attic/api-backend-protocol.txt b/docs/attic/api-backend-protocol.txt index e0b39eab247288baf89505f9cec0676d885dd394..56d82e982927785bdf6a6dd165ec6e87f6e731d6 100644 --- a/docs/attic/api-backend-protocol.txt +++ b/docs/attic/api-backend-protocol.txt @@ -31,7 +31,7 @@ happen between 1 worker and the backend api. ## 1 A worker parses a repository. -It sends the parsing result to the backend in muliple requests/responses. +It sends the parsing result to the backend in multiple requests/responses. The worker sends list of sha1s (git sha1s) encountered. The server responds with an unknowns sha1s list. The worker sends those sha1s and their associated data to the server. diff --git a/requirements-swh.txt b/requirements-swh.txt index 440669d46032270bd6575c9d8348d7ef7bdc9773..2a7349ac2db95c36ecb46795ea3b1215f8aef8be 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 -swh.loader.core >= 3.5.0 +swh.loader.core >= 5.0.0 swh.model >= 4.3.0 swh.scheduler >= 0.0.39 swh.storage >= 0.22.0 diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 3509a2adabcc536b685d2c0d200bd45985beb1c8..400925af4d20144cd4347aea6414476f6ae62ca2 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 1.10.0 +Version: 1.10.1 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index f99fc46e87c3b2d696ffc94b09f8477e5784e2de..aa9710e82713409a5045824c702359219b6961ea 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -2,7 +2,7 @@ dulwich>=0.20.43 retrying click swh.core>=0.0.7 -swh.loader.core>=3.5.0 +swh.loader.core>=5.0.0 swh.model>=4.3.0 swh.scheduler>=0.0.39 swh.storage>=0.22.0 diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index 0b7e8fa56047d1f4a28fb042815c18622ec5e8f6..ea9ccf3ffb3d10f39e9e0f836fa323084f6dc976 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -120,7 +120,9 @@ def dulwich_tree_to_directory(obj: ShaFile) -> Directory: DirectoryEntry( type=type_, perms=entry.mode, - name=entry.path, + name=entry.path.replace( + b"/", b"_" + ), # '/' is very rare, and invalid in SWH. target=hash_to_bytes(entry.sha.decode("ascii")), ) ) @@ -183,6 +185,7 @@ def dulwich_commit_to_revision(obj: ShaFile) -> Revision: author_timezone = None committer_timezone = None + assert commit._chunked_text is not None # to keep mypy happy for (field, value) in _parse_message(commit._chunked_text): if field == b"author": m = AUTHORSHIP_LINE_RE.match(value) diff --git a/swh/loader/git/dumb.py b/swh/loader/git/dumb.py index c34c19b3166c8c0d8296d15803827d05a8f49399..35826e9f24bf9b1a571adeb5480bfbbd97f05bdf 100644 --- a/swh/loader/git/dumb.py +++ b/swh/loader/git/dumb.py @@ -86,7 +86,7 @@ class GitObjectsFetcher: commit_objects = [] for ref in wants: ref_object = self._get_git_object(ref) - if ref_object.get_type() == Commit.type_num: + if ref_object.type_num == Commit.type_num: commit_objects.append(cast(Commit, ref_object)) self.objects[b"commit"].add(ref) else: diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index 9add9baa8a7e3616ca2065868cf4e8147994121e..833f205740f0703f5bc23e7cbcde29ede792b0b2 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -189,14 +189,14 @@ class TestConverters: def test_weird_tree(self): """Tests a tree with entries the wrong order""" - raw_manifest = ( + raw_string = ( b"0644 file2\x00" b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce" b"0644 file1\x00" b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce" ) - tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_manifest) + tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string) assert converters.dulwich_tree_to_directory(tree) == Directory( entries=( @@ -214,7 +214,7 @@ class TestConverters: perms=0o644, ), ), - raw_manifest=b"tree 62\x00" + raw_manifest, + raw_manifest=b"tree 62\x00" + raw_string, ) def test_tree_perms(self): @@ -244,6 +244,59 @@ class TestConverters: ) ) + def test_tree_with_slashes(self): + raw_string = ( + b"100775 AUTHORS\x00" + b"\x7f\xde\x98Av\x81I\xbb\x19\x88N\xffu\xed\xca\x01\xe1\x04\xb1\x81" + b"100775 COPYING\x00" + b'\xd5\n\x11\xd6O\xa5(\xfcv\xb3\x81\x92\xd1\x8c\x05?\xe8"A\xda' + b"100775 README.markdown\x00" + b"X-c\xd6\xb7\xa8*\xfa\x13\x9e\xef\x83q\xbf^\x90\xe9UVQ" + b"100775 gitter/gitter.xml\x00" + b"\xecJ\xfa\xa3\\\xe1\x9fo\x93\x131I\xcb\xbf1h2\x00}n" + b"100775 gitter/script.py\x00" + b"\x1d\xd3\xec\x83\x94+\xbc\x04\xde\xee\x7f\xc6\xbe\x8b\x9cnp=W\xf9" + ) + + tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string) + + dir_ = Directory( + entries=( + DirectoryEntry( + name=b"AUTHORS", + type="file", + target=hash_to_bytes("7fde9841768149bb19884eff75edca01e104b181"), + perms=0o100775, + ), + DirectoryEntry( + name=b"COPYING", + type="file", + target=hash_to_bytes("d50a11d64fa528fc76b38192d18c053fe82241da"), + perms=0o100775, + ), + DirectoryEntry( + name=b"README.markdown", + type="file", + target=hash_to_bytes("582d63d6b7a82afa139eef8371bf5e90e9555651"), + perms=0o100775, + ), + DirectoryEntry( + name=b"gitter_gitter.xml", # changed + type="file", + target=hash_to_bytes("ec4afaa35ce19f6f93133149cbbf316832007d6e"), + perms=0o100775, + ), + DirectoryEntry( + name=b"gitter_script.py", # changed + type="file", + target=hash_to_bytes("1dd3ec83942bbc04deee7fc6be8b9c6e703d57f9"), + perms=0o100775, + ), + ), + raw_manifest=b"tree 202\x00" + raw_string, + ) + assert converters.dulwich_tree_to_directory(tree) == dir_ + def test_commit_to_revision(self): sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da" @@ -438,13 +491,13 @@ class TestConverters: """Checks raw_manifest is set when the commit cannot fit the data model""" # Well-formed manifest - raw_manifest = ( + raw_string = ( b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" b"author Foo <foo@example.org> 1640191028 +0200\n" b"committer Foo <foo@example.org> 1640191028 +0200\n\n" b"some commit message" ) - commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest) + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string) date = TimestampWithTimezone( timestamp=Timestamp(seconds=1640191028, microseconds=0), offset_bytes=b"+0200", @@ -466,8 +519,8 @@ class TestConverters: ) # Mess with the offset - raw_manifest2 = raw_manifest.replace(b"+0200", b"+200") - commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2) + raw_string2 = raw_string.replace(b"+0200", b"+200") + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2) date = TimestampWithTimezone( timestamp=Timestamp(seconds=1640191028, microseconds=0), offset_bytes=b"+200", @@ -489,11 +542,11 @@ class TestConverters: ) # Mess with the rest of the manifest - raw_manifest2 = raw_manifest.replace( + raw_string2 = raw_string.replace( b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", ) - commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2) + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2) date = TimestampWithTimezone( timestamp=Timestamp(seconds=1640191028, microseconds=0), offset_bytes=b"+0200", @@ -511,7 +564,7 @@ class TestConverters: date=date, committer_date=date, type=RevisionType.GIT, - raw_manifest=b"commit 161\x00" + raw_manifest2, + raw_manifest=b"commit 161\x00" + raw_string2, ) def test_author_line_to_author(self): @@ -787,14 +840,14 @@ class TestConverters: """Checks raw_manifest is set when the tag cannot fit the data model""" # Well-formed manifest - raw_manifest = ( + raw_string = ( b"object 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" b"type commit\n" b"tag blah\n" b"tagger Foo <foo@example.org> 1640191027 +0200\n\n" b"some release message" ) - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", @@ -812,8 +865,8 @@ class TestConverters: ) # Mess with the offset (negative UTC) - raw_manifest2 = raw_manifest.replace(b"+0200", b"-0000") - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + raw_string2 = raw_string.replace(b"+0200", b"-0000") + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", @@ -830,8 +883,8 @@ class TestConverters: ) # Mess with the offset (other) - raw_manifest2 = raw_manifest.replace(b"+0200", b"+200") - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + raw_string2 = raw_string.replace(b"+0200", b"+200") + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", @@ -848,11 +901,11 @@ class TestConverters: ) # Mess with the rest of the manifest - raw_manifest2 = raw_manifest.replace( + raw_string2 = raw_string.replace( b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", ) - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", @@ -866,5 +919,5 @@ class TestConverters: timestamp=Timestamp(seconds=1640191027, microseconds=0), offset_bytes=b"+0200", ), - raw_manifest=b"tag 136\x00" + raw_manifest2, + raw_manifest=b"tag 136\x00" + raw_string2, ) diff --git a/swh/loader/git/tests/test_tasks.py b/swh/loader/git/tests/test_tasks.py index fd442e1a01a56ba9d5064c8c4c801e34f0711f99..2491c0574d3bb8399c2aee26d43aba0eecbebb86 100644 --- a/swh/loader/git/tests/test_tasks.py +++ b/swh/loader/git/tests/test_tasks.py @@ -8,12 +8,8 @@ import uuid import pytest from swh.scheduler.model import ListedOrigin, Lister -from swh.scheduler.utils import create_origin_task_dict - -@pytest.fixture(autouse=True) -def celery_worker_and_swh_config(swh_scheduler_celery_worker, swh_config): - pass +NAMESPACE = "swh.loader.git" @pytest.fixture @@ -28,137 +24,72 @@ def git_listed_origin(git_lister): ) -def test_git_loader( - mocker, - swh_scheduler_celery_app, -): - mock_loader = mocker.patch("swh.loader.git.loader.GitLoader.load") - mock_loader.return_value = {"status": "eventful"} - - res = swh_scheduler_celery_app.send_task( - "swh.loader.git.tasks.UpdateGitRepository", - kwargs={"url": "origin_url"}, - ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "eventful"} - mock_loader.assert_called_once_with() - - def test_git_loader_for_listed_origin( - mocker, - swh_scheduler_celery_app, + loading_task_creation_for_listed_origin_test, git_lister, git_listed_origin, ): - mock_loader = mocker.patch("swh.loader.git.loader.GitLoader.load") - mock_loader.return_value = {"status": "eventful"} - - task_dict = create_origin_task_dict(git_listed_origin, git_lister) - - res = swh_scheduler_celery_app.send_task( - "swh.loader.git.tasks.UpdateGitRepository", - kwargs=task_dict["arguments"]["kwargs"], - ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "eventful"} - mock_loader.assert_called_once_with() - - -def test_git_loader_from_disk( - mocker, - swh_scheduler_celery_app, -): - mock_loader = mocker.patch("swh.loader.git.from_disk.GitLoaderFromDisk.load") - mock_loader.return_value = {"status": "uneventful"} - - res = swh_scheduler_celery_app.send_task( - "swh.loader.git.tasks.LoadDiskGitRepository", - kwargs={"url": "origin_url2", "directory": "/some/repo", "visit_date": "now"}, + loading_task_creation_for_listed_origin_test( + loader_class_name=f"{NAMESPACE}.loader.GitLoader", + task_function_name=f"{NAMESPACE}.tasks.UpdateGitRepository", + lister=git_lister, + listed_origin=git_listed_origin, ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "uneventful"} - mock_loader.assert_called_once_with() +@pytest.mark.parametrize( + "extra_loader_arguments", + [ + { + "directory": "/some/repo", + }, + { + "directory": "/some/repo", + "visit_date": "now", + }, + ], +) def test_git_loader_from_disk_for_listed_origin( - mocker, - swh_scheduler_celery_app, + loading_task_creation_for_listed_origin_test, git_lister, git_listed_origin, + extra_loader_arguments, ): - mock_loader = mocker.patch("swh.loader.git.from_disk.GitLoaderFromDisk.load") - mock_loader.return_value = {"status": "uneventful"} - git_listed_origin.extra_loader_arguments = { - "directory": "/some/repo", - } - task_dict = create_origin_task_dict(git_listed_origin, git_lister) + git_listed_origin.extra_loader_arguments = extra_loader_arguments - res = swh_scheduler_celery_app.send_task( - "swh.loader.git.tasks.LoadDiskGitRepository", - kwargs=task_dict["arguments"]["kwargs"], + loading_task_creation_for_listed_origin_test( + loader_class_name=f"{NAMESPACE}.from_disk.GitLoaderFromDisk", + task_function_name=f"{NAMESPACE}.tasks.LoadDiskGitRepository", + lister=git_lister, + listed_origin=git_listed_origin, ) - assert res - res.wait() - assert res.successful() - assert res.result == {"status": "uneventful"} - mock_loader.assert_called_once_with() - - -def test_git_loader_from_archive( - mocker, - swh_scheduler_celery_app, -): - mock_loader = mocker.patch("swh.loader.git.from_disk.GitLoaderFromArchive.load") - mock_loader.return_value = {"status": "failed"} - res = swh_scheduler_celery_app.send_task( - "swh.loader.git.tasks.UncompressAndLoadDiskGitRepository", - kwargs={ - "url": "origin_url3", +@pytest.mark.parametrize( + "extra_loader_arguments", + [ + { + "archive_path": "/some/repo", + }, + { "archive_path": "/some/repo", "visit_date": "now", }, - ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "failed"} - mock_loader.assert_called_once_with() - - + ], +) def test_git_loader_from_archive_for_listed_origin( - mocker, - swh_scheduler_celery_app, + loading_task_creation_for_listed_origin_test, git_lister, git_listed_origin, + extra_loader_arguments, ): - mock_loader = mocker.patch("swh.loader.git.from_disk.GitLoaderFromArchive.load") - mock_loader.return_value = {"status": "failed"} - git_listed_origin.extra_loader_arguments = { - "archive_path": "/some/repo", - } - task_dict = create_origin_task_dict(git_listed_origin, git_lister) + git_listed_origin.extra_loader_arguments = extra_loader_arguments - res = swh_scheduler_celery_app.send_task( - "swh.loader.git.tasks.UncompressAndLoadDiskGitRepository", - kwargs=task_dict["arguments"]["kwargs"], + loading_task_creation_for_listed_origin_test( + loader_class_name=f"{NAMESPACE}.from_disk.GitLoaderFromArchive", + task_function_name=f"{NAMESPACE}.tasks.UncompressAndLoadDiskGitRepository", + lister=git_lister, + listed_origin=git_listed_origin, ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "failed"} - mock_loader.assert_called_once_with() diff --git a/tox.ini b/tox.ini index 4fad105fbf4550c54c7779107581ce7ad5cbdb3a..6d07f6d9ee27c423a20529f3d3d68166a7e25760 100644 --- a/tox.ini +++ b/tox.ini @@ -20,15 +20,16 @@ commands = [testenv:black] skip_install = true deps = - black==22.3.0 + black==22.10.0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = - flake8==4.0.1 - flake8-bugbear==22.3.23 + flake8==5.0.4 + flake8-bugbear==22.9.23 + pycodestyle==2.9.1 commands = {envpython} -m flake8