Skip to content
Snippets Groups Projects
Commit f74b47bc authored by vlorentz's avatar vlorentz
Browse files

Fix crash when indexing two REMD objects from the same deposit

The deduplication code assumed that `remd.target` always matches the id of
the results. This is no longer true: since 221d48e2, when `remd.target` is a
directory, the `origin` context of the REMD object is used as the result id
instead.
parent b2d8afff
No related branches found
Tags v2.9.1
1 merge request: !412 Fix crash when indexing two REMD objects from the same deposit
......@@ -5,7 +5,6 @@
from copy import deepcopy
import hashlib
import itertools
import logging
import time
from typing import (
......@@ -84,14 +83,15 @@ class ExtrinsicMetadataIndexer(
for item in objects.get("raw_extrinsic_metadata", []):
remd = RawExtrinsicMetadata.from_dict(item)
sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid()))
results[remd.target] = self.index(remd.id, data=remd)
for result in self.index(remd.id, data=remd):
results[result.id] = result
except Exception:
if not self.catch_exceptions:
raise
summary["status"] = "failed"
return summary
self.results = list(itertools.chain.from_iterable(results.values()))
self.results = list(results.values())
summary_persist = self.persist_index_computations(self.results)
if summary_persist:
for value in summary_persist.values():
......
......@@ -390,7 +390,7 @@ class TestMetadata:
assert results == []
def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker):
"""Nominal case, calling the mapping and storing the result"""
"""Two metadata objects with the same origin target"""
origin = "https://example.org/jdoe/myrepo"
metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
......@@ -417,3 +417,36 @@ class TestMetadata:
)
assert len(results) == 1, results
assert results[0].from_remd_id == b"\x00" * 20
def test_extrinsic_directory_metadata_indexer_duplicate_origin(self, mocker):
    """Two metadata objects on directories, sharing one origin context.

    The batch holds ``DEPOSIT_REMD`` and a copy of it whose ``id`` and
    ``target`` are overridden. A single ``origin_extrinsic_metadata`` row
    is expected, and it must come from the second object.
    """
    origin_url = DEPOSIT_REMD.origin

    indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
    # Let indexing errors propagate instead of being reported as "failed".
    indexer.catch_exceptions = False
    indexer.storage = mocker.patch.object(indexer, "storage")
    indexer.storage.origin_get_by_sha1.return_value = [{"url": origin_url}]

    tool_query = {f"tool_{key}": value for (key, value) in TRANSLATOR_TOOL.items()}
    tool = indexer.idx_storage.indexer_configuration_get(tool_query)
    assert tool is not None

    first_remd = DEPOSIT_REMD.to_dict()
    second_remd = {
        **DEPOSIT_REMD.to_dict(),
        "id": b"\x00" * 20,
        "target": "swh:1:dir:" + "01" * 20,
    }
    summary = indexer.process_journal_objects(
        {"raw_extrinsic_metadata": [first_remd, second_remd]}
    )
    assert summary == {"status": "eventful", "origin_extrinsic_metadata:add": 1}

    results = list(indexer.idx_storage.origin_extrinsic_metadata_get([origin_url]))
    assert len(results) == 1, results
    # The surviving row is the one derived from the second (overridden) object.
    assert results[0].from_remd_id == b"\x00" * 20
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment