Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7437739
D8060.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
13 KB
Subscribers
None
D8060.diff
View Options
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -44,6 +44,7 @@
directory: List[Dict]
origin: List[Dict]
origin_visit_status: List[Dict]
+ raw_extrinsic_metadata: List[Dict]
@contextmanager
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -16,13 +16,20 @@
TypeVar,
cast,
)
+from urllib.parse import urlparse
import sentry_sdk
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
+from swh.indexer.indexer import (
+ BaseIndexer,
+ ContentIndexer,
+ DirectoryIndexer,
+ ObjectsDict,
+ OriginIndexer,
+)
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
@@ -31,13 +38,14 @@
from swh.indexer.storage.model import (
ContentMetadataRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from swh.model import hashutil
-from swh.model.model import Directory
+from swh.model.model import Directory, MetadataAuthorityType
from swh.model.model import ObjectType as ModelObjectType
-from swh.model.model import Origin, Sha1Git
-from swh.model.swhids import CoreSWHID, ObjectType
+from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git
+from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType
REVISION_GET_BATCH_SIZE = 10
RELEASE_GET_BATCH_SIZE = 10
@@ -59,6 +67,99 @@
yield from f(list(group))
+class ExtrinsicMetadataIndexer(
+ BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow]
+):
+ def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+ summary: Dict[str, Any] = {"status": "uneventful"}
+ try:
+ results = []
+ for item in objects.get("raw_extrinsic_metadata", []):
+ results.extend(
+ self.index(item["id"], data=RawExtrinsicMetadata.from_dict(item))
+ )
+ except Exception:
+ if not self.catch_exceptions:
+ raise
+ summary["status"] = "failed"
+ return summary
+
+ summary_persist = self.persist_index_computations(results)
+ self.results = results
+ if summary_persist:
+ for value in summary_persist.values():
+ if value > 0:
+ summary["status"] = "eventful"
+ summary.update(summary_persist)
+ return summary
+
+ def index(
+ self,
+ id: Sha1Git,
+ data: Optional[RawExtrinsicMetadata],
+ **kwargs,
+ ) -> List[OriginExtrinsicMetadataRow]:
+ if data is None:
+ raise NotImplementedError(
+ "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data"
+ )
+ if data.target.object_type != ExtendedObjectType.ORIGIN:
+ # other types are not supported yet
+ return []
+
+ if data.authority.type != MetadataAuthorityType.FORGE:
+ # metadata provided by a third party; don't trust it
+ # (technically this could be handled below, but we check it here
+ # to return early, sparing a translation and an origin lookup)
+ # TODO: add ways to define trusted authorities
+ return []
+
+ metadata_items = []
+ mappings = []
+ for (mapping_name, mapping) in MAPPINGS.items():
+ if data.format in mapping.extrinsic_metadata_formats():
+ metadata_item = mapping().translate(data.metadata)
+ if metadata_item is not None:
+ metadata_items.append(metadata_item)
+ mappings.append(mapping_name)
+
+ if not metadata_items:
+ # No mapping could translate this metadata; ignore it
+ return []
+
+ # TODO: batch requests to origin_get_by_sha1()
+ origins = self.storage.origin_get_by_sha1([data.target.object_id])
+ try:
+ (origin,) = origins
+ if origin is None:
+ raise ValueError()
+ except ValueError:
+ raise ValueError(f"Unknown origin {data.target}") from None
+
+ if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc:
+ # metadata provided by a third party; don't trust it
+ # TODO: add ways to define trusted authorities
+ return []
+
+ metadata = merge_documents(metadata_items)
+
+ return [
+ OriginExtrinsicMetadataRow(
+ id=origin["url"],
+ indexer_configuration_id=self.tool["id"],
+ from_remd_id=data.id,
+ mappings=mappings,
+ metadata=metadata,
+ )
+ ]
+
+ def persist_index_computations(
+ self, results: List[OriginExtrinsicMetadataRow]
+ ) -> Dict[str, int]:
+ """Persist the results in storage."""
+ return self.idx_storage.origin_extrinsic_metadata_add(results)
+
+
class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
"""Content-level indexer
@@ -129,15 +230,7 @@
def persist_index_computations(
self, results: List[ContentMetadataRow]
) -> Dict[str, int]:
- """Persist the results in storage.
-
- Args:
- results: list of content_metadata, dict with the
- following keys:
- - id (bytes): content's identifier (sha1)
- - metadata (jsonb): detected metadata
-
- """
+ """Persist the results in storage."""
return self.idx_storage.content_metadata_add(results)
@@ -239,16 +332,7 @@
def persist_index_computations(
self, results: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
- """Persist the results in storage.
-
- Args:
- results: list of content_mimetype, dict with the
- following keys:
- - id (bytes): content's identifier (sha1)
- - mimetype (bytes): mimetype in bytes
- - encoding (bytes): encoding in bytes
-
- """
+ """Persist the results in storage."""
# TODO: add functions in storage to keep data in
# directory_intrinsic_metadata
return self.idx_storage.directory_intrinsic_metadata_add(results)
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -77,6 +77,7 @@
raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats")
def translate(self, file_content: bytes) -> Optional[Dict]:
+ """Translates intrinsic metadata, from the content of a file."""
raise NotImplementedError(f"{self.__class__.__name__}.translate")
def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -6,7 +6,7 @@
import ast
import itertools
import re
-from typing import List
+from typing import List, Tuple
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
@@ -37,6 +37,11 @@
return [entry["sha1"]]
return []
+ @classmethod
+ def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
+ # this class is only used by intrinsic metadata mappings
+ return ()
+
def translate(self, raw_content):
try:
raw_content = raw_content.decode()
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -3,10 +3,31 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
-from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
+import datetime
+from unittest.mock import call
+
+import attr
+
+from swh.indexer.metadata import (
+ ContentMetadataIndexer,
+ DirectoryMetadataIndexer,
+ ExtrinsicMetadataIndexer,
+)
+from swh.indexer.storage.model import (
+ ContentMetadataRow,
+ DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
+)
from swh.indexer.tests.utils import DIRECTORY2
-from swh.model.model import Directory, DirectoryEntry
+from swh.model.model import (
+ Directory,
+ DirectoryEntry,
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ RawExtrinsicMetadata,
+)
+from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
from .utils import (
BASE_TEST_CONFIG,
@@ -36,6 +57,24 @@
"tools": TRANSLATOR_TOOL,
}
+REMD = RawExtrinsicMetadata(
+ target=ExtendedSWHID(
+ object_type=ExtendedObjectType.ORIGIN,
+ object_id=b"\x01" * 20,
+ ),
+ discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
+ authority=MetadataAuthority(
+ type=MetadataAuthorityType.FORGE,
+ url="https://example.org/",
+ ),
+ fetcher=MetadataFetcher(
+ name="example-fetcher",
+ version="1.0.0",
+ ),
+ format="application/vnd.github.v3+json",
+ metadata=b'{"full_name": "test software"}',
+)
+
class TestMetadata:
"""
@@ -141,3 +180,91 @@
del result.tool["id"]
assert results == expected_results
+
+ def test_extrinsic_metadata_indexer_unknown_format(self, mocker):
+ """Should be ignored when unknown format"""
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+
+ remd = attr.evolve(REMD, format="unknown format")
+
+ results = metadata_indexer.index(remd.id, data=remd)
+
+ assert metadata_indexer.storage.method_calls == []
+ assert results == []
+
+ def test_extrinsic_metadata_indexer_github(self, mocker):
+ """Nominal case, calling the mapping and storing the result"""
+ origin = "https://example.org/jdoe/myrepo"
+
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.catch_exceptions = False
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+ metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+ tool = metadata_indexer.idx_storage.indexer_configuration_get(
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+ )
+ assert tool is not None
+
+ assert metadata_indexer.process_journal_objects(
+ {"raw_extrinsic_metadata": [REMD.to_dict()]}
+ ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1}
+
+ assert metadata_indexer.storage.method_calls == [
+ call.origin_get_by_sha1([b"\x01" * 20])
+ ]
+
+ results = list(
+ metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin])
+ )
+ assert results == [
+ OriginExtrinsicMetadataRow(
+ id="https://example.org/jdoe/myrepo",
+ tool={"id": tool["id"], **TRANSLATOR_TOOL},
+ metadata={
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "https://forgefed.org/ns#Repository",
+ "name": "test software",
+ },
+ from_remd_id=REMD.id,
+ mappings=["GitHubMapping"],
+ )
+ ]
+
+ def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker):
+ """Early abort on non-forge authorities"""
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+
+ remd = attr.evolve(
+ REMD,
+ authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY),
+ )
+
+ results = metadata_indexer.index(remd.id, data=remd)
+
+ assert metadata_indexer.storage.method_calls == []
+ assert results == []
+
+ def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker):
+ """Should be ignored when authority URL does not match the origin"""
+
+ origin = "https://different-domain.example.org/jdoe/myrepo"
+
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.catch_exceptions = False
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+ metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+ tool = metadata_indexer.idx_storage.indexer_configuration_get(
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+ )
+ assert tool is not None
+
+ results = metadata_indexer.index(REMD.id, data=REMD)
+
+ assert metadata_indexer.storage.method_calls == [
+ call.origin_get_by_sha1([b"\x01" * 20])
+ ]
+ assert results == []
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Apr 14, 3:44 AM (14 h, 15 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217068
Attached To
D8060: Add extrinsic metadata indexer
Event Timeline
Log In to Comment