Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
Show All 17 Lines | |||||
from swh.core.config import merge_configs | from swh.core.config import merge_configs | ||||
from swh.core.utils import grouper | from swh.core.utils import grouper | ||||
from swh.indexer.codemeta import merge_documents | from swh.indexer.codemeta import merge_documents | ||||
from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer | from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer | ||||
from swh.indexer.metadata_detector import detect_metadata | from swh.indexer.metadata_detector import detect_metadata | ||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
from swh.indexer.origin_head import OriginHeadIndexer | from swh.indexer.origin_head import OriginHeadIndexer | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 | ||||
from swh.indexer.storage.model import ( | from swh.indexer.storage.model import ( | ||||
ContentMetadataRow, | ContentMetadataRow, | ||||
OriginIntrinsicMetadataRow, | OriginIntrinsicMetadataRow, | ||||
RevisionIntrinsicMetadataRow, | RevisionIntrinsicMetadataRow, | ||||
) | ) | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.model import Revision | from swh.model.model import Revision, Sha1Git | ||||
REVISION_GET_BATCH_SIZE = 10 | REVISION_GET_BATCH_SIZE = 10 | ||||
ORIGIN_GET_BATCH_SIZE = 10 | ORIGIN_GET_BATCH_SIZE = 10 | ||||
T1 = TypeVar("T1") | T1 = TypeVar("T1") | ||||
T2 = TypeVar("T2") | T2 = TypeVar("T2") | ||||
def call_with_batches( | def call_with_batches( | ||||
f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int, | f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int, | ||||
) -> Iterator[T2]: | ) -> Iterator[T2]: | ||||
"""Calls a function with batches of args, and concatenates the results. | """Calls a function with batches of args, and concatenates the results. | ||||
""" | """ | ||||
groups = grouper(args, batch_size) | groups = grouper(args, batch_size) | ||||
for group in groups: | for group in groups: | ||||
yield from f(list(group)) | yield from f(list(group)) | ||||
class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): | class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): | ||||
ardumont: For my understanding, so now the generic TId, TData and TResult are now formalized into… | |||||
Done · Inline Actions · vlorentz: yes | |||||
"""Content-level indexer | """Content-level indexer | ||||
This indexer is in charge of: | This indexer is in charge of: | ||||
- filtering out content already indexed in content_metadata | - filtering out content already indexed in content_metadata | ||||
- reading content from objstorage with the content's id sha1 | - reading content from objstorage with the content's id sha1 | ||||
- computing metadata by given context | - computing metadata by given context | ||||
- using the metadata_dictionary as the 'swh-metadata-translator' tool | - using the metadata_dictionary as the 'swh-metadata-translator' tool | ||||
- store result in content_metadata table | - store result in content_metadata table | ||||
""" | """ | ||||
def filter(self, ids): | def filter(self, ids): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
""" | """ | ||||
yield from self.idx_storage.content_metadata_missing( | yield from self.idx_storage.content_metadata_missing( | ||||
({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) | ({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) | ||||
) | ) | ||||
def index( | def index( | ||||
self, id, data: Optional[bytes] = None, log_suffix="unknown revision", **kwargs | self, | ||||
id: Sha1, | |||||
data: Optional[bytes] = None, | |||||
log_suffix="unknown revision", | |||||
**kwargs, | |||||
) -> List[ContentMetadataRow]: | ) -> List[ContentMetadataRow]: | ||||
"""Index sha1s' content and store result. | """Index sha1s' content and store result. | ||||
Args: | Args: | ||||
id (bytes): content's identifier | id (bytes): content's identifier | ||||
data (bytes): raw content in bytes | data (bytes): raw content in bytes | ||||
Returns: | Returns: | ||||
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines | def filter(self, sha1_gits): | ||||
""" | """ | ||||
yield from self.idx_storage.revision_intrinsic_metadata_missing( | yield from self.idx_storage.revision_intrinsic_metadata_missing( | ||||
( | ( | ||||
{"id": sha1_git, "indexer_configuration_id": self.tool["id"],} | {"id": sha1_git, "indexer_configuration_id": self.tool["id"],} | ||||
for sha1_git in sha1_gits | for sha1_git in sha1_gits | ||||
) | ) | ||||
) | ) | ||||
def index(self, id, data=None, **kwargs) -> List[RevisionIntrinsicMetadataRow]: | def index( | ||||
self, id: Sha1Git, data: Optional[Revision], **kwargs | |||||
) -> List[RevisionIntrinsicMetadataRow]: | |||||
"""Index rev by processing it and organizing result. | """Index rev by processing it and organizing result. | ||||
use metadata_detector to iterate on filenames | use metadata_detector to iterate on filenames | ||||
- if one filename detected -> sends file to content indexer | - if one filename detected -> sends file to content indexer | ||||
- if multiple file detected -> translation needed at revision level | - if multiple file detected -> translation needed at revision level | ||||
Args: | Args: | ||||
rev: revision model object from storage | id: sha1_git of the revision | ||||
data: revision model object from storage | |||||
Returns: | Returns: | ||||
dict: dictionary representing a revision_intrinsic_metadata, with | dict: dictionary representing a revision_intrinsic_metadata, with | ||||
keys: | keys: | ||||
- id (str): rev's identifier (sha1_git) | - id (str): rev's identifier (sha1_git) | ||||
- indexer_configuration_id (bytes): tool used | - indexer_configuration_id (bytes): tool used | ||||
- metadata: dict of retrieved metadata | - metadata: dict of retrieved metadata | ||||
""" | """ | ||||
rev = id | rev = data | ||||
assert isinstance(rev, Revision) | assert isinstance(rev, Revision) | ||||
assert data is None | |||||
try: | try: | ||||
root_dir = rev.directory | root_dir = rev.directory | ||||
dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) | dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) | ||||
if [entry["type"] for entry in dir_ls] == ["dir"]: | if [entry["type"] for entry in dir_ls] == ["dir"]: | ||||
# If the root is just a single directory, recurse into it | # If the root is just a single directory, recurse into it | ||||
# eg. PyPI packages, GNU tarballs | # eg. PyPI packages, GNU tarballs | ||||
subdir = dir_ls[0]["target"] | subdir = dir_ls[0]["target"] | ||||
▲ Show 20 Lines • Show All 142 Lines • ▼ Show 20 Lines | ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]: | ||||
assert len(head_revs) == len(head_rev_ids) | assert len(head_revs) == len(head_rev_ids) | ||||
results = [] | results = [] | ||||
for (origin, rev) in zip(origins_with_head, head_revs): | for (origin, rev) in zip(origins_with_head, head_revs): | ||||
if not rev: | if not rev: | ||||
self.log.warning("Missing head revision of origin %r", origin.url) | self.log.warning("Missing head revision of origin %r", origin.url) | ||||
continue | continue | ||||
for rev_metadata in self.revision_metadata_indexer.index(rev): | for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev): | ||||
# There is at most one rev_metadata | # There is at most one rev_metadata | ||||
orig_metadata = OriginIntrinsicMetadataRow( | orig_metadata = OriginIntrinsicMetadataRow( | ||||
from_revision=rev_metadata.id, | from_revision=rev_metadata.id, | ||||
id=origin.url, | id=origin.url, | ||||
metadata=rev_metadata.metadata, | metadata=rev_metadata.metadata, | ||||
mappings=rev_metadata.mappings, | mappings=rev_metadata.mappings, | ||||
indexer_configuration_id=rev_metadata.indexer_configuration_id, | indexer_configuration_id=rev_metadata.indexer_configuration_id, | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines |
For my understanding: the generic TId, TData and TResult are now formalized as follows, respectively:
TId and TData are fixed by ContentIndexer;
TResult is fixed by this declaration.
Did I get this right?
.oO(That's neat)