diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -18,6 +18,7 @@ List, Optional, Set, + Tuple, TypeVar, Union, ) @@ -31,7 +32,7 @@ from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage from swh.indexer.storage.interface import IndexerStorageInterface from swh.model import hashutil -from swh.model.model import Origin, Revision, Sha1Git +from swh.model.model import Directory, Origin, Sha1Git from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.factory import get_objstorage from swh.scheduler import CONFIG as SWH_CONFIG @@ -40,7 +41,7 @@ class ObjectsDict(TypedDict, total=False): - revision: List[Dict] + directory: List[Dict] origin: List[Dict] origin_visit_status: List[Dict] @@ -109,7 +110,7 @@ content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level - classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, + classes: :class:`ContentIndexer`, :class:`DirectoryIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: @@ -583,11 +584,11 @@ return results -class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]): +class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult]): """An object type indexer, inherits from the :class:`BaseIndexer` and - implements Revision indexing using the run method + implements Directory indexing using the run method - Note: the :class:`RevisionIndexer` is not an instantiable object. + Note: the :class:`DirectoryIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. 
@@ -597,7 +598,7 @@ def run(self, ids: List[Sha1Git], **kwargs) -> Dict: """Given a list of sha1_gits: - - retrieve revisions from storage + - retrieve directories from storage - execute the indexing computations - store the results @@ -612,36 +613,37 @@ ) del kwargs["policy_update"] - revision_ids = [ + directory_ids = [ hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids ] - revisions = [] - for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)): - if not rev: - # TODO: call self.index() with rev=None? - self.log.warning( - "Revision %s not found in storage", hashutil.hash_to_hex(rev_id) - ) - continue - revisions.append(rev.to_dict()) - return self.process_journal_objects({"revision": revisions}) + return self._process_directories([(dir_id, None) for dir_id in directory_ids]) def process_journal_objects(self, objects: ObjectsDict) -> Dict: """Worker function for ``JournalClient``. Expects ``objects`` to have a single - key, ``"revision"``.""" - assert set(objects) == {"revision"} + key, ``"directory"``.""" + assert set(objects) == {"directory"} + return self._process_directories( + [(dir_["id"], Directory.from_dict(dir_)) for dir_ in objects["directory"]] + ) + + def _process_directories( + self, + directories: Union[List[Tuple[Sha1Git, Directory]], List[Tuple[Sha1Git, None]]], + ) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} results = [] - for rev in objects["revision"]: + # TODO: fetch raw_manifest when useful? 
+ + for (dir_id, dir_) in directories: try: - results.extend(self.index(rev["id"], Revision.from_dict(rev))) + results.extend(self.index(dir_id, dir_)) except Exception: if not self.catch_exceptions: raise - self.log.exception("Problem when processing revision") + self.log.exception("Problem when processing directory") sentry_sdk.capture_exception() summary["status"] = "failed" diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -21,18 +21,18 @@ from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents -from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer +from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from swh.model import hashutil -from swh.model.model import Origin, Revision, Sha1Git +from swh.model.model import Directory, Origin, Sha1Git from swh.model.swhids import ObjectType REVISION_GET_BATCH_SIZE = 10 @@ -83,7 +83,7 @@ self, id: Sha1, data: Optional[bytes] = None, - log_suffix="unknown revision", + log_suffix="unknown directory", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. 
@@ -145,18 +145,18 @@ } -class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]): - """Revision-level indexer +class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): + """Directory-level indexer This indexer is in charge of: - - filtering revisions already indexed in revision_intrinsic_metadata table + - filtering directories already indexed in directory_intrinsic_metadata table with defined computation tool - - retrieve all entry_files in root directory + - retrieve all entry_files in directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - - store the results for revision + - store the results for directory """ @@ -166,7 +166,7 @@ def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones.""" - yield from self.idx_storage.revision_intrinsic_metadata_missing( + yield from self.idx_storage.directory_intrinsic_metadata_missing( ( { "id": sha1_git, @@ -177,51 +177,52 @@ ) def index( - self, id: Sha1Git, data: Optional[Revision], **kwargs - ) -> List[RevisionIntrinsicMetadataRow]: - """Index rev by processing it and organizing result. + self, id: Sha1Git, data: Optional[Directory] = None, **kwargs + ) -> List[DirectoryIntrinsicMetadataRow]: + """Index directory by processing it and organizing result. 
use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - - if multiple file detected -> translation needed at revision level + - if multiple file detected -> translation needed at directory level Args: - id: sha1_git of the revision - data: revision model object from storage + id: sha1_git of the directory + data: directory model object from storage Returns: - dict: dictionary representing a revision_intrinsic_metadata, with + dict: dictionary representing a directory_intrinsic_metadata, with keys: - - id (str): rev's identifier (sha1_git) + - id: directory's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ - rev = data - assert isinstance(rev, Revision) + if data is None: + dir_ = list(self.storage.directory_ls(id, recursive=False)) + else: + assert isinstance(data, Directory) + dir_ = data.to_dict() try: - root_dir = rev.directory - dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) - if [entry["type"] for entry in dir_ls] == ["dir"]: + if [entry["type"] for entry in dir_] == ["dir"]: # If the root is just a single directory, recurse into it # eg. 
PyPI packages, GNU tarballs - subdir = dir_ls[0]["target"] - dir_ls = list(self.storage.directory_ls(subdir, recursive=False)) - files = [entry for entry in dir_ls if entry["type"] == "file"] + subdir = dir_[0]["target"] + dir_ = list(self.storage.directory_ls(subdir, recursive=False)) + files = [entry for entry in dir_ if entry["type"] == "file"] detected_files = detect_metadata(files) - (mappings, metadata) = self.translate_revision_intrinsic_metadata( + (mappings, metadata) = self.translate_directory_intrinsic_metadata( detected_files, - log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id), + log_suffix="directory=%s" % hashutil.hash_to_hex(id), ) except Exception as e: - self.log.exception("Problem when indexing rev: %r", e) + self.log.exception("Problem when indexing dir: %r", e) sentry_sdk.capture_exception() return [ - RevisionIntrinsicMetadataRow( - id=rev.id, + DirectoryIntrinsicMetadataRow( + id=id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, @@ -229,7 +230,7 @@ ] def persist_index_computations( - self, results: List[RevisionIntrinsicMetadataRow] + self, results: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage. 
@@ -242,10 +243,10 @@ """ # TODO: add functions in storage to keep data in - # revision_intrinsic_metadata - return self.idx_storage.revision_intrinsic_metadata_add(results) + # directory_intrinsic_metadata + return self.idx_storage.directory_intrinsic_metadata_add(results) - def translate_revision_intrinsic_metadata( + def translate_directory_intrinsic_metadata( self, detected_files: Dict[str, List[Any]], log_suffix: str ) -> Tuple[List[Any], Any]: """ @@ -316,17 +317,17 @@ class OriginMetadataIndexer( - OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]] + OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) - self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) + self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( self, origins: List[Origin], check_origin_known: bool = True, **kwargs - ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]: + ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] origins_with_head = [] @@ -365,39 +366,41 @@ self.log.warning("Missing head revision of origin %r", origin.url) continue - for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev): - # There is at most one rev_metadata + for dir_metadata in self.directory_metadata_indexer.index(rev.directory): + # There is at most one dir_metadata orig_metadata = OriginIntrinsicMetadataRow( - from_revision=rev_metadata.id, + from_directory=dir_metadata.id, id=origin.url, - metadata=rev_metadata.metadata, - mappings=rev_metadata.mappings, - indexer_configuration_id=rev_metadata.indexer_configuration_id, + metadata=dir_metadata.metadata, + mappings=dir_metadata.mappings, + indexer_configuration_id=dir_metadata.indexer_configuration_id, ) - results.append((orig_metadata, rev_metadata)) + 
results.append((orig_metadata, dir_metadata)) return results def persist_index_computations( self, - results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]], + results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: - # Deduplicate revisions - rev_metadata: List[RevisionIntrinsicMetadataRow] = [] + # Deduplicate directories + dir_metadata: List[DirectoryIntrinsicMetadataRow] = [] orig_metadata: List[OriginIntrinsicMetadataRow] = [] summary: Dict = {} - for (orig_item, rev_item) in results: - assert rev_item.metadata == orig_item.metadata - if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}): + for (orig_item, dir_item) in results: + assert dir_item.metadata == orig_item.metadata + if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets - if rev_item not in rev_metadata: - rev_metadata.append(rev_item) + if dir_item not in dir_metadata: + dir_metadata.append(dir_item) if orig_item not in orig_metadata: orig_metadata.append(orig_item) - if rev_metadata: - summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata) - summary.update(summary_rev) + if dir_metadata: + summary_dir = self.idx_storage.directory_intrinsic_metadata_add( + dir_metadata + ) + summary.update(summary_dir) if orig_metadata: summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata) summary.update(summary_ori) diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql --- a/swh/indexer/sql/30-schema.sql +++ b/swh/indexer/sql/30-schema.sql @@ -99,34 +99,34 @@ comment on column content_metadata.metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; --- The table revision_intrinsic_metadata provides a minimal set of intrinsic +-- The table directory_intrinsic_metadata provides a minimal set of intrinsic -- metadata 
detected with the detection tool (indexer_configuration_id) and -- aggregated from the content_metadata translation. -create table revision_intrinsic_metadata( +create table directory_intrinsic_metadata( id sha1_git not null, metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); -comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; -comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; -comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; -comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; -comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; +comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory'; +comment on column directory_intrinsic_metadata.id is 'sha1_git of directory'; +comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; +comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; +comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. 
pkg-info, npm)'; create table origin_intrinsic_metadata( id text not null, -- origin url metadata jsonb, indexer_configuration_id bigint not null, - from_revision sha1_git not null, + from_directory sha1_git not null, metadata_tsvector tsvector, mappings text array not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; comment on column origin_intrinsic_metadata.id is 'url of the origin'; -comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; +comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; -comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; +comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.'; comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql --- a/swh/indexer/sql/50-func.sql +++ b/swh/indexer/sql/50-func.sql @@ -273,25 +273,25 @@ -- end content_metadata functions --- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata, +-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata, -- overwriting duplicates. -- -- If filtering duplicates is in order, the call to --- swh_revision_intrinsic_metadata_missing must take place before calling this +-- swh_directory_intrinsic_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to --- tmp_revision_intrinsic_metadata, 2. call this function -create or replace function swh_revision_intrinsic_metadata_add() +-- tmp_directory_intrinsic_metadata, 2. 
call this function +create or replace function swh_directory_intrinsic_metadata_add() returns bigint language plpgsql as $$ declare res bigint; begin - insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) select id, metadata, mappings, indexer_configuration_id - from tmp_revision_intrinsic_metadata tcm + from tmp_directory_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -302,19 +302,19 @@ end $$; -comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata'; +comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata'; --- create a temporary table for retrieving revision_intrinsic_metadata -create or replace function swh_mktemp_revision_intrinsic_metadata() +-- create a temporary table for retrieving directory_intrinsic_metadata +create or replace function swh_mktemp_directory_intrinsic_metadata() returns void language sql as $$ - create temporary table if not exists tmp_revision_intrinsic_metadata ( - like revision_intrinsic_metadata including defaults + create temporary table if not exists tmp_directory_intrinsic_metadata ( + like directory_intrinsic_metadata including defaults ) on commit delete rows; $$; -comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata'; +comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata'; -- create a temporary table for retrieving origin_intrinsic_metadata create or replace function swh_mktemp_origin_intrinsic_metadata() @@ -380,8 +380,8 @@ begin perform swh_origin_intrinsic_metadata_compute_tsvector(); - insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select id, 
metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(id, indexer_configuration_id) @@ -389,7 +389,7 @@ metadata = excluded.metadata, metadata_tsvector = excluded.metadata_tsvector, mappings = excluded.mappings, - from_revision = excluded.from_revision; + from_directory = excluded.from_directory; get diagnostics res = ROW_COUNT; return res; diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql --- a/swh/indexer/sql/60-indexes.sql +++ b/swh/indexer/sql/60-indexes.sql @@ -25,12 +25,12 @@ alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; --- revision_intrinsic_metadata -create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id); -alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey; +-- directory_intrinsic_metadata +create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id); +alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey; -alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey; +alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey 
foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey; -- content_mimetype create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); diff --git a/swh/indexer/sql/upgrades/134.sql b/swh/indexer/sql/upgrades/134.sql new file mode 100644 --- /dev/null +++ b/swh/indexer/sql/upgrades/134.sql @@ -0,0 +1,145 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 133 +-- to_version: 134 +-- description: replace revision_intrinsic_metadata with directory_intrinsic_metadata +-- and origin_intrinsic_metadata.from_revision with origin_intrinsic_metadata.from_directory +-- This migration works by dropping both tables and reindexing from scratch. + +insert into dbversion(version, release, description) + values(134, now(), 'Work In Progress'); + +drop table origin_intrinsic_metadata; +drop table revision_intrinsic_metadata; +drop function swh_revision_intrinsic_metadata_add; +drop function swh_mktemp_revision_intrinsic_metadata; + + +create table directory_intrinsic_metadata( + id sha1_git not null, + metadata jsonb not null, + indexer_configuration_id bigint not null, + mappings text array not null +); + +comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory'; +comment on column directory_intrinsic_metadata.id is 'sha1_git of directory'; +comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; +comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; +comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. 
pkg-info, npm)'; + +create table origin_intrinsic_metadata( + id text not null, -- origin url + metadata jsonb, + indexer_configuration_id bigint not null, + from_directory sha1_git not null, + metadata_tsvector tsvector, + mappings text array not null +); + +comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; +comment on column origin_intrinsic_metadata.id is 'url of the origin'; +comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory'; +comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; +comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.'; +comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; + +-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata, +-- overwriting duplicates. +-- +-- If filtering duplicates is in order, the call to +-- swh_directory_intrinsic_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp_directory_intrinsic_metadata(), 1. COPY to +-- tmp_directory_intrinsic_metadata, 2.
call this function +create or replace function swh_directory_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_directory_intrinsic_metadata tcm + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata'; + +-- create a temporary table for retrieving directory_intrinsic_metadata +create or replace function swh_mktemp_directory_intrinsic_metadata() + returns void + language sql +as $$ + create temporary table if not exists tmp_directory_intrinsic_metadata ( + like directory_intrinsic_metadata including defaults + ) on commit delete rows; +$$; + +comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata'; + +-- create a temporary table for retrieving origin_intrinsic_metadata +create or replace function swh_mktemp_origin_intrinsic_metadata() + returns void + language sql +as $$ + create temporary table if not exists tmp_origin_intrinsic_metadata ( + like origin_intrinsic_metadata including defaults + ) on commit delete rows; +$$; + +comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; + +-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, +-- overwriting duplicates. +-- +-- If filtering duplicates is in order, the call to +-- swh_origin_intrinsic_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp_origin_intrinsic_metadata(), 1. COPY to +-- tmp_origin_intrinsic_metadata, 2.
call this function +create or replace function swh_origin_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); + + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_directory, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_directory = excluded.from_directory; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata'; + + + +-- directory_intrinsic_metadata +create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id); +alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey; + +alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey; + diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -30,8 +30,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from .writer import JournalWriter @@ -522,52 +522,52 @@ @timed @db_transaction() - def revision_intrinsic_metadata_missing( + def directory_intrinsic_metadata_missing( self, metadata: Iterable[Dict], db=None, cur=None ) -> 
List[Tuple[Sha1, int]]: return [ obj[0] - for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur) + for obj in db.directory_intrinsic_metadata_missing_from_list(metadata, cur) ] @timed @db_transaction() - def revision_intrinsic_metadata_get( + def directory_intrinsic_metadata_get( self, ids: Iterable[Sha1], db=None, cur=None - ) -> List[RevisionIntrinsicMetadataRow]: + ) -> List[DirectoryIntrinsicMetadataRow]: return [ - RevisionIntrinsicMetadataRow.from_dict( + DirectoryIntrinsicMetadataRow.from_dict( converters.db_to_metadata( - dict(zip(db.revision_intrinsic_metadata_cols, c)) + dict(zip(db.directory_intrinsic_metadata_cols, c)) ) ) - for c in db.revision_intrinsic_metadata_get_from_list(ids, cur) + for c in db.directory_intrinsic_metadata_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() - def revision_intrinsic_metadata_add( + def directory_intrinsic_metadata_add( self, - metadata: List[RevisionIntrinsicMetadataRow], + metadata: List[DirectoryIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("revision_intrinsic_metadata", metadata) + self.journal_writer.write_additions("directory_intrinsic_metadata", metadata) - db.mktemp_revision_intrinsic_metadata(cur) + db.mktemp_directory_intrinsic_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], - "tmp_revision_intrinsic_metadata", + "tmp_directory_intrinsic_metadata", ["id", "metadata", "mappings", "indexer_configuration_id"], cur, ) - count = db.revision_intrinsic_metadata_add_from_temp(cur) + count = db.directory_intrinsic_metadata_add_from_temp(cur) return { - "revision_intrinsic_metadata:add": count, + "directory_intrinsic_metadata:add": count, } @timed @@ -602,7 +602,13 @@ db.copy_to( [m.to_dict() for m in metadata], "tmp_origin_intrinsic_metadata", - ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"], + [ + "id", + "metadata", + 
"indexer_configuration_id", + "from_directory", + "mappings", + ], cur, ) count = db.origin_intrinsic_metadata_add_from_temp(cur) diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -16,7 +16,7 @@ """Proxy to the SWH Indexer DB, with wrappers around stored procedures""" content_mimetype_hash_keys = ["id", "indexer_configuration_id"] - current_version = 133 + current_version = 134 def _missing_from_list( self, table: str, data: Iterable[Dict], hash_keys: List[str], cur=None @@ -350,18 +350,18 @@ "content_metadata", ids, self.content_metadata_cols, cur=cur ) - revision_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"] + directory_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"] - def revision_intrinsic_metadata_missing_from_list(self, metadata, cur=None): + def directory_intrinsic_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata.""" yield from self._missing_from_list( - "revision_intrinsic_metadata", + "directory_intrinsic_metadata", metadata, - self.revision_intrinsic_metadata_hash_keys, + self.directory_intrinsic_metadata_hash_keys, cur=cur, ) - revision_intrinsic_metadata_cols = [ + directory_intrinsic_metadata_cols = [ "id", "metadata", "mappings", @@ -371,27 +371,27 @@ "tool_configuration", ] - @stored_procedure("swh_mktemp_revision_intrinsic_metadata") - def mktemp_revision_intrinsic_metadata(self, cur=None): + @stored_procedure("swh_mktemp_directory_intrinsic_metadata") + def mktemp_directory_intrinsic_metadata(self, cur=None): pass - def revision_intrinsic_metadata_add_from_temp(self, cur=None): + def directory_intrinsic_metadata_add_from_temp(self, cur=None): cur = self._cursor(cur) - cur.execute("select * from swh_revision_intrinsic_metadata_add()") + cur.execute("select * from swh_directory_intrinsic_metadata_add()") return cur.fetchone()[0] - def revision_intrinsic_metadata_get_from_list(self, ids, cur=None): + 
def directory_intrinsic_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( - "revision_intrinsic_metadata", + "directory_intrinsic_metadata", ids, - self.revision_intrinsic_metadata_cols, + self.directory_intrinsic_metadata_cols, cur=cur, ) origin_intrinsic_metadata_cols = [ "id", "metadata", - "from_revision", + "from_directory", "mappings", "tool_id", "tool_name", diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -38,8 +38,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from .writer import JournalWriter @@ -250,8 +250,8 @@ self._content_ctags = SubStorage(ContentCtagsRow, *args) self._licenses = SubStorage(ContentLicenseRow, *args) self._content_metadata = SubStorage(ContentMetadataRow, *args) - self._revision_intrinsic_metadata = SubStorage( - RevisionIntrinsicMetadataRow, *args + self._directory_intrinsic_metadata = SubStorage( + DirectoryIntrinsicMetadataRow, *args ) self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args) @@ -369,21 +369,21 @@ added = self._content_metadata.add(metadata) return {"content_metadata:add": added} - def revision_intrinsic_metadata_missing( + def directory_intrinsic_metadata_missing( self, metadata: Iterable[Dict] ) -> List[Tuple[Sha1, int]]: - return self._revision_intrinsic_metadata.missing(metadata) + return self._directory_intrinsic_metadata.missing(metadata) - def revision_intrinsic_metadata_get( + def directory_intrinsic_metadata_get( self, ids: Iterable[Sha1] - ) -> List[RevisionIntrinsicMetadataRow]: - return self._revision_intrinsic_metadata.get(ids) + ) -> List[DirectoryIntrinsicMetadataRow]: + return self._directory_intrinsic_metadata.get(ids) - def revision_intrinsic_metadata_add( - self, metadata: List[RevisionIntrinsicMetadataRow] + def 
directory_intrinsic_metadata_add( + self, metadata: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: - added = self._revision_intrinsic_metadata.add(metadata) - return {"revision_intrinsic_metadata:add": added} + added = self._directory_intrinsic_metadata.add(metadata) + return {"directory_intrinsic_metadata:add": added} def origin_intrinsic_metadata_get( self, urls: Iterable[str] diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py --- a/swh/indexer/storage/interface.py +++ b/swh/indexer/storage/interface.py @@ -15,8 +15,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) TResult = TypeVar("TResult") @@ -341,8 +341,8 @@ """ ... - @remote_api_endpoint("revision_intrinsic_metadata/missing") - def revision_intrinsic_metadata_missing( + @remote_api_endpoint("directory_intrinsic_metadata/missing") + def directory_intrinsic_metadata_missing( self, metadata: Iterable[Dict] ) -> List[Tuple[Sha1, int]]: """List metadata missing from storage. @@ -350,7 +350,7 @@ Args: metadata (iterable): dictionaries with keys: - - **id** (bytes): sha1_git revision identifier + - **id** (bytes): sha1_git directory identifier - **indexer_configuration_id** (int): tool used to compute the results @@ -360,11 +360,11 @@ """ ... - @remote_api_endpoint("revision_intrinsic_metadata") - def revision_intrinsic_metadata_get( + @remote_api_endpoint("directory_intrinsic_metadata") + def directory_intrinsic_metadata_get( self, ids: Iterable[Sha1] - ) -> List[RevisionIntrinsicMetadataRow]: - """Retrieve revision metadata per id. + ) -> List[DirectoryIntrinsicMetadataRow]: + """Retrieve directory metadata per id. Args: ids (iterable): sha1 checksums @@ -375,10 +375,10 @@ """ ... 
- @remote_api_endpoint("revision_intrinsic_metadata/add") - def revision_intrinsic_metadata_add( + @remote_api_endpoint("directory_intrinsic_metadata/add") + def directory_intrinsic_metadata_add( self, - metadata: List[RevisionIntrinsicMetadataRow], + metadata: List[DirectoryIntrinsicMetadataRow], ) -> Dict[str, int]: """Add metadata not present in storage. diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py --- a/swh/indexer/storage/model.py +++ b/swh/indexer/storage/model.py @@ -120,8 +120,8 @@ @attr.s -class RevisionIntrinsicMetadataRow(BaseRow): - object_type: Final = "revision_intrinsic_metadata" +class DirectoryIntrinsicMetadataRow(BaseRow): + object_type: Final = "directory_intrinsic_metadata" id = attr.ib(type=Sha1Git) metadata = attr.ib(type=Dict[str, Any]) @@ -134,5 +134,5 @@ id = attr.ib(type=str) metadata = attr.ib(type=Dict[str, Any]) - from_revision = attr.ib(type=Sha1Git) + from_directory = attr.ib(type=Sha1Git) mappings = attr.ib(type=List[str]) diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -23,7 +23,7 @@ TASK_NAMES: List[Tuple[str, str]] = [ # (scheduler-task-type, task-class-test-name) - ("index-revision-metadata", "revision_intrinsic_metadata"), + ("index-directory-metadata", "directory_intrinsic_metadata"), ("index-origin-metadata", "origin_intrinsic_metadata"), ] diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py --- a/swh/indexer/tests/storage/conftest.py +++ b/swh/indexer/tests/storage/conftest.py @@ -41,9 +41,9 @@ data.tools = tools data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689") data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7") - data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238") - data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321") - data.revision_id_3 = 
hash_to_bytes("7026b7c1a2af56521e9587659012345678904320") + data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238") + data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321") + data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320") data.origin_url_1 = "file:///dev/0/zero" # 44434341 data.origin_url_2 = "file:///dev/1/one" # 44434342 data.origin_url_3 = "file:///dev/2/two" # 54974445 diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -19,8 +19,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from swh.model.hashutil import hash_to_bytes @@ -289,37 +289,37 @@ etype = self.endpoint_type tool = data.tools[self.tool_name] - data_rev1 = self.row_class.from_dict( + data_dir1 = self.row_class.from_dict( { - "id": data.revision_id_2, + "id": data.directory_id_2, **self.example_data[0], "indexer_configuration_id": tool["id"], } ) - data_rev2 = self.row_class.from_dict( + data_dir2 = self.row_class.from_dict( { - "id": data.revision_id_2, + "id": data.directory_id_2, **self.example_data[1], "indexer_configuration_id": tool["id"], } ) # when - summary = endpoint(storage, etype, "add")([data_rev1]) + summary = endpoint(storage, etype, "add")([data_dir1]) assert summary == expected_summary(1, etype) with pytest.raises(DuplicateId): - endpoint(storage, etype, "add")([data_rev2, data_rev2]) + endpoint(storage, etype, "add")([data_dir2, data_dir2]) # then actual_data = list( - endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1]) + endpoint(storage, etype, "get")([data.directory_id_2, data.directory_id_1]) ) expected_data = [ self.row_class.from_dict( - {"id": data.revision_id_2, **self.example_data[0], "tool": tool} + {"id": 
data.directory_id_2, **self.example_data[0], "tool": tool} ) ] assert actual_data == expected_data @@ -806,11 +806,11 @@ row_class = ContentMetadataRow -class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester): - """Test Indexer Storage revision_intrinsic_metadata related methods""" +class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester): + """Test Indexer Storage directory_intrinsic_metadata related methods""" tool_name = "swh-metadata-detector" - endpoint_type = "revision_intrinsic_metadata" + endpoint_type = "directory_intrinsic_metadata" example_data = [ { "metadata": { @@ -830,7 +830,7 @@ "mappings": ["mapping2"], }, ] - row_class = RevisionIntrinsicMetadataRow + row_class = DirectoryIntrinsicMetadataRow class TestIndexerStorageContentFossologyLicense(StorageETypeTester): @@ -1102,8 +1102,8 @@ "version": None, "name": None, } - metadata_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata, mappings=["mapping1"], indexer_configuration_id=tool_id, @@ -1113,11 +1113,11 @@ metadata=metadata, indexer_configuration_id=tool_id, mappings=["mapping1"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata_rev]) + storage.directory_intrinsic_metadata_add([metadata_dir]) storage.origin_intrinsic_metadata_add([metadata_origin]) # then @@ -1130,7 +1130,7 @@ id=data.origin_url_1, metadata=metadata, tool=data.tools["swh-metadata-detector"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, mappings=["mapping1"], ) ] @@ -1156,8 +1156,8 @@ "version": None, "name": None, } - metadata_rev_v1 = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir_v1 = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata_v1, mappings=[], indexer_configuration_id=tool_id, @@ -1167,11 +1167,11 @@ metadata=metadata_v1.copy(), 
indexer_configuration_id=tool_id, mappings=[], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # given - storage.revision_intrinsic_metadata_add([metadata_rev_v1]) + storage.directory_intrinsic_metadata_add([metadata_dir_v1]) storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when @@ -1185,7 +1185,7 @@ id=data.origin_url_1, metadata=metadata_v1, tool=data.tools["swh-metadata-detector"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, mappings=[], ) ] @@ -1199,16 +1199,16 @@ "author": "MG", } ) - metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2) + metadata_dir_v2 = attr.evolve(metadata_dir_v1, metadata=metadata_v2) metadata_origin_v2 = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata_v2.copy(), indexer_configuration_id=tool_id, mappings=["npm"], - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) - storage.revision_intrinsic_metadata_add([metadata_rev_v2]) + storage.directory_intrinsic_metadata_add([metadata_dir_v2]) storage.origin_intrinsic_metadata_add([metadata_origin_v2]) actual_metadata = list( @@ -1220,7 +1220,7 @@ id=data.origin_url_1, metadata=metadata_v2, tool=data.tools["swh-metadata-detector"], - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, mappings=["npm"], ) ] @@ -1252,8 +1252,8 @@ "mappings": [], } - metadata_rev_v1 = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir_v1 = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata={ "version": None, "name": None, @@ -1265,7 +1265,7 @@ data_v1 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, indexer_configuration_id=tool_id, **example_data1, ) @@ -1274,7 +1274,7 @@ data_v2 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, indexer_configuration_id=tool_id, **example_data2, ) @@ -1287,7 +1287,7 
@@ data_v2b = list(reversed(data_v2[0:-1])) # given - storage.revision_intrinsic_metadata_add([metadata_rev_v1]) + storage.directory_intrinsic_metadata_add([metadata_dir_v1]) storage.origin_intrinsic_metadata_add(data_v1) # when @@ -1296,7 +1296,7 @@ expected_data_v1 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, tool=data.tools["swh-metadata-detector"], **example_data1, ) @@ -1326,7 +1326,7 @@ expected_data_v2 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, tool=data.tools["swh-metadata-detector"], **example_data2, ) @@ -1351,8 +1351,8 @@ "developmentStatus": None, "name": None, } - metadata_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata, mappings=["mapping1"], indexer_configuration_id=tool_id, @@ -1362,11 +1362,11 @@ metadata=metadata, indexer_configuration_id=tool_id, mappings=["mapping1"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata_rev]) + storage.directory_intrinsic_metadata_add([metadata_dir]) with pytest.raises(DuplicateId): storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin]) @@ -1381,8 +1381,8 @@ metadata1 = { "author": "John Doe", } - metadata1_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_1, + metadata1_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, @@ -1392,13 +1392,13 @@ metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) metadata2 = { "author": "Jane Doe", } - metadata2_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata2_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, 
metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, @@ -1408,13 +1408,13 @@ metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata1_rev]) + storage.directory_intrinsic_metadata_add([metadata1_dir]) storage.origin_intrinsic_metadata_add([metadata1_origin]) - storage.revision_intrinsic_metadata_add([metadata2_rev]) + storage.directory_intrinsic_metadata_add([metadata2_dir]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then @@ -1444,8 +1444,8 @@ "Jane Doe", ] } - metadata1_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_1, + metadata1_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, @@ -1455,7 +1455,7 @@ metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) metadata2 = { "author": [ @@ -1463,8 +1463,8 @@ "Jane Doe", ] } - metadata2_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata2_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, @@ -1474,13 +1474,13 @@ metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata1_rev]) + storage.directory_intrinsic_metadata_add([metadata1_dir]) storage.origin_intrinsic_metadata_add([metadata1_origin]) - storage.revision_intrinsic_metadata_add([metadata2_rev]) + storage.directory_intrinsic_metadata_add([metadata2_dir]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then @@ -1508,8 +1508,8 @@ "@context": "foo", "author": "John Doe", } - metadata1_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_1, + metadata1_dir = 
DirectoryIntrinsicMetadataRow( + id=data.directory_id_1, metadata=metadata1, mappings=["npm"], indexer_configuration_id=tool1_id, @@ -1519,14 +1519,14 @@ metadata=metadata1, mappings=["npm"], indexer_configuration_id=tool1_id, - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) metadata2 = { "@context": "foo", "author": "Jane Doe", } - metadata2_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata2_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata2, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, @@ -1536,13 +1536,13 @@ metadata=metadata2, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) metadata3 = { "@context": "foo", } - metadata3_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_3, + metadata3_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_3, metadata=metadata3, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, @@ -1552,14 +1552,14 @@ metadata=metadata3, mappings=["pkg-info"], indexer_configuration_id=tool2_id, - from_revision=data.revision_id_3, + from_directory=data.directory_id_3, ) - storage.revision_intrinsic_metadata_add([metadata1_rev]) + storage.directory_intrinsic_metadata_add([metadata1_dir]) storage.origin_intrinsic_metadata_add([metadata1_origin]) - storage.revision_intrinsic_metadata_add([metadata2_rev]) + storage.directory_intrinsic_metadata_add([metadata2_dir]) storage.origin_intrinsic_metadata_add([metadata2_origin]) - storage.revision_intrinsic_metadata_add([metadata3_rev]) + storage.directory_intrinsic_metadata_add([metadata3_dir]) storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer( @@ -1685,7 +1685,7 @@ }, mappings=["npm", "gemspec"], tool=tool2, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) ], next_page_token=None, diff --git 
a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py --- a/swh/indexer/tests/tasks.py +++ b/swh/indexer/tests/tasks.py @@ -1,12 +1,12 @@ from celery import current_app as app -from swh.indexer.metadata import OriginMetadataIndexer, RevisionMetadataIndexer +from swh.indexer.metadata import DirectoryMetadataIndexer, OriginMetadataIndexer from .test_metadata import ContentMetadataTestIndexer from .utils import BASE_TEST_CONFIG -class RevisionMetadataTestIndexer(RevisionMetadataIndexer): +class DirectoryMetadataTestIndexer(DirectoryMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ @@ -29,12 +29,12 @@ return {**BASE_TEST_CONFIG, "tools": []} def _prepare_sub_indexers(self): - self.revision_metadata_indexer = RevisionMetadataTestIndexer() + self.directory_metadata_indexer = DirectoryMetadataTestIndexer() @app.task -def revision_intrinsic_metadata(*args, **kwargs): - indexer = RevisionMetadataTestIndexer() +def directory_intrinsic_metadata(*args, **kwargs): + indexer = DirectoryMetadataTestIndexer() indexer.run(*args, **kwargs) print("REV RESULT=", indexer.results) diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -16,14 +16,14 @@ from swh.indexer.cli import indexer_cli_group from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from swh.journal.writer import get_journal_writer from swh.model.hashutil import hash_to_bytes from swh.model.model import OriginVisitStatus -from .utils import REVISION +from .utils import DIRECTORY2, REVISION def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]: @@ -40,15 +40,15 @@ origin_metadata = [ OriginIntrinsicMetadataRow( id="file://dev/%04d" % origin_id, - 
from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)), + from_directory=hash_to_bytes("abcd{:0>36}".format(origin_id)), indexer_configuration_id=tools[origin_id % 2]["id"], metadata={"name": "origin %d" % origin_id}, mappings=["mapping%d" % (origin_id % 10)], ) for origin_id in range(nb_rows) ] - revision_metadata = [ - RevisionIntrinsicMetadataRow( + directory_metadata = [ + DirectoryIntrinsicMetadataRow( id=hash_to_bytes("abcd{:0>36}".format(origin_id)), indexer_configuration_id=tools[origin_id % 2]["id"], metadata={"name": "origin %d" % origin_id}, @@ -57,7 +57,7 @@ for origin_id in range(nb_rows) ] - idx_storage.revision_intrinsic_metadata_add(revision_metadata) + idx_storage.directory_intrinsic_metadata_add(directory_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool["id"] for tool in tools] @@ -605,10 +605,10 @@ ) mocker.patch( - "swh.indexer.metadata.RevisionMetadataIndexer.index", + "swh.indexer.metadata.DirectoryMetadataIndexer.index", return_value=[ - RevisionIntrinsicMetadataRow( - id=REVISION.id, + DirectoryIntrinsicMetadataRow( + id=DIRECTORY2.id, indexer_configuration_id=1, mappings=["cff"], metadata={"foo": "bar"}, @@ -645,7 +645,7 @@ expected_results = [ OriginIntrinsicMetadataRow( id=status.origin, - from_revision=REVISION.id, + from_directory=DIRECTORY2.id, tool={"id": 1, **swh_indexer_config["tools"]}, mappings=["cff"], metadata={"foo": "bar"}, diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py --- a/swh/indexer/tests/test_indexer.py +++ b/swh/indexer/tests/test_indexer.py @@ -11,13 +11,13 @@ from swh.indexer.indexer import ( ContentIndexer, ContentPartitionIndexer, + DirectoryIndexer, OriginIndexer, - RevisionIndexer, ) from swh.indexer.storage import PagedResult, Sha1 from swh.model.model import Content -from .utils import BASE_TEST_CONFIG, REVISION +from .utils import BASE_TEST_CONFIG, DIRECTORY2 class _TestException(Exception): @@ -49,7 +49,7 @@ pass -class 
CrashingRevisionIndexer(CrashingIndexerMixin, RevisionIndexer): +class CrashingDirectoryIndexer(CrashingIndexerMixin, DirectoryIndexer): pass @@ -86,14 +86,14 @@ indexer.run([b"foo"]) -def test_revision_indexer_catch_exceptions(): - indexer = CrashingRevisionIndexer(config=BASE_TEST_CONFIG) +def test_directory_indexer_catch_exceptions(): + indexer = CrashingDirectoryIndexer(config=BASE_TEST_CONFIG) indexer.storage = Mock() - indexer.storage.revision_get.return_value = [REVISION] + indexer.storage.directory_get.return_value = [DIRECTORY2] assert indexer.run([b"foo"]) == {"status": "failed"} - assert indexer.process_journal_objects({"revision": [REVISION.to_dict()]}) == { + assert indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) == { "status": "failed" } @@ -103,7 +103,7 @@ indexer.run([b"foo"]) with pytest.raises(_TestException): - indexer.process_journal_objects({"revision": [REVISION.to_dict()]}) + indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) def test_origin_indexer_catch_exceptions(): diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -10,16 +10,16 @@ import pytest from swh.indexer.codemeta import CODEMETA_TERMS -from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer +from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping -from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow -from swh.indexer.tests.utils import DIRECTORY2, REVISION +from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow 
+from swh.indexer.tests.utils import DIRECTORY2 from swh.model.hashutil import hash_to_bytes -from swh.model.model import Directory, DirectoryEntry, Revision +from swh.model.model import Directory, DirectoryEntry from .utils import ( BASE_TEST_CONFIG, @@ -43,10 +43,10 @@ """ def parse_config_file(self, *args, **kwargs): - assert False, "should not be called; the rev indexer configures it." + assert False, "should not be called; the dir indexer configures it." -REVISION_METADATA_CONFIG = { +DIRECTORY_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } @@ -1154,8 +1154,8 @@ parts.append(b"end\n") self.gemspec_mapping.translate(b"".join(parts)) - def test_revision_metadata_indexer(self): - metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) + def test_directory_metadata_indexer(self): + metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) @@ -1163,8 +1163,7 @@ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None - rev = REVISION - assert rev.directory == DIRECTORY2.id + dir_ = DIRECTORY2 metadata_indexer.idx_storage.content_metadata_add( [ @@ -1176,15 +1175,17 @@ ] ) - metadata_indexer.run([rev.id]) + metadata_indexer.run([dir_.id]) results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id]) + metadata_indexer.idx_storage.directory_intrinsic_metadata_get( + [DIRECTORY2.id] + ) ) expected_results = [ - RevisionIntrinsicMetadataRow( - id=rev.id, + DirectoryIntrinsicMetadataRow( + id=dir_.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], @@ -1197,33 +1198,27 @@ # then assert results == expected_results - def test_revision_metadata_indexer_single_root_dir(self): - metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) + def test_directory_metadata_indexer_single_root_dir(self): + metadata_indexer = 
DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root - # of the revision - rev = REVISION - assert rev.directory == DIRECTORY2.id + # of the directory + dir_ = DIRECTORY2 - directory = Directory( + new_dir = Directory( entries=( DirectoryEntry( name=b"foobar-1.0.0", type="dir", - target=rev.directory, + target=dir_.id, perms=16384, ), ), ) - assert directory.id is not None - metadata_indexer.storage.directory_add([directory]) - - new_rev_dict = {**rev.to_dict(), "directory": directory.id} - new_rev_dict.pop("id") - new_rev = Revision.from_dict(new_rev_dict) - metadata_indexer.storage.revision_add([new_rev]) + assert new_dir.id is not None + metadata_indexer.storage.directory_add([new_dir]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} @@ -1240,15 +1235,15 @@ ] ) - metadata_indexer.run([new_rev.id]) + metadata_indexer.run([new_dir.id]) results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id]) + metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id]) ) expected_results = [ - RevisionIntrinsicMetadataRow( - id=new_rev.id, + DirectoryIntrinsicMetadataRow( + id=new_dir.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -11,14 +11,14 @@ from swh.indexer.metadata import OriginMetadataIndexer from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from swh.model.model import Origin from swh.storage.interface import StorageInterface 
from .test_metadata import TRANSLATOR_TOOL -from .utils import REVISION, YARN_PARSER_METADATA +from .utils import DIRECTORY2, YARN_PARSER_METADATA @pytest.fixture @@ -41,9 +41,9 @@ tool = swh_indexer_config["tools"] - rev_id = REVISION.id - rev_metadata = RevisionIntrinsicMetadataRow( - id=rev_id, + dir_id = DIRECTORY2.id + dir_metadata = DirectoryIntrinsicMetadataRow( + id=dir_id, tool=tool, metadata=YARN_PARSER_METADATA, mappings=["npm"], @@ -51,16 +51,16 @@ origin_metadata = OriginIntrinsicMetadataRow( id=origin, tool=tool, - from_revision=rev_id, + from_directory=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) - rev_results = list(idx_storage.revision_intrinsic_metadata_get([rev_id])) - for rev_result in rev_results: - assert rev_result.tool - del rev_result.tool["id"] - assert rev_results == [rev_metadata] + dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id])) + for dir_result in dir_results: + assert dir_result.tool + del dir_result.tool["id"] + assert dir_results == [dir_metadata] orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin])) for orig_result in orig_results: @@ -82,10 +82,10 @@ indexer.run(["https://github.com/librariesio/yarn-parser"] * 2) origin = "https://github.com/librariesio/yarn-parser" - rev_id = REVISION.id + dir_id = DIRECTORY2.id - rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - assert len(rev_results) == 1 + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert len(dir_results) == 1 orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert len(orig_results) == 1 @@ -121,15 +121,15 @@ indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.run([origin1, origin2]) - rev_id = REVISION.id + dir_id = DIRECTORY2.id - rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - assert rev_results == [ - RevisionIntrinsicMetadataRow( - id=rev_id, + dir_results 
= list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert dir_results == [ + DirectoryIntrinsicMetadataRow( + id=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], - tool=rev_results[0].tool, + tool=dir_results[0].tool, ) ] @@ -140,7 +140,7 @@ assert orig_results == [ OriginIntrinsicMetadataRow( id=origin2, - from_revision=rev_id, + from_directory=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], tool=orig_results[0].tool, @@ -148,7 +148,7 @@ ] -def test_origin_metadata_indexer_duplicate_revision( +def test_origin_metadata_indexer_duplicate_directory( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, @@ -162,10 +162,10 @@ origin2 = "https://github.com/librariesio/yarn-parser.git" indexer.run([origin1, origin2]) - rev_id = REVISION.id + dir_id = DIRECTORY2.id - rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - assert len(rev_results) == 1 + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert len(dir_results) == 1 orig_results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) @@ -185,10 +185,10 @@ with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): indexer.run([origin]) - rev_id = REVISION.id + dir_id = DIRECTORY2.id - rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - assert rev_results == [] + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] @@ -204,16 +204,16 @@ indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" with patch( - "swh.indexer.metadata.RevisionMetadataIndexer" - ".translate_revision_intrinsic_metadata", + "swh.indexer.metadata.DirectoryMetadataIndexer" + 
".translate_directory_intrinsic_metadata", return_value=(["npm"], {"@context": "foo"}), ): indexer.run([origin]) - rev_id = REVISION.id + dir_id = DIRECTORY2.id - rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - assert rev_results == [] + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] @@ -229,16 +229,16 @@ indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" with patch( - "swh.indexer.metadata.RevisionMetadataIndexer" - ".translate_revision_intrinsic_metadata", + "swh.indexer.metadata.DirectoryMetadataIndexer" + ".translate_directory_intrinsic_metadata", return_value=None, ): indexer.run([origin]) - rev_id = REVISION.id + dir_id = DIRECTORY2.id - rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - assert rev_results == [] + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == []