Page MenuHomeSoftware Heritage

D7937.id28590.diff
No OneTemporary

D7937.id28590.diff

diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -18,6 +18,7 @@
List,
Optional,
Set,
+ Tuple,
TypeVar,
Union,
)
@@ -31,7 +32,7 @@
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.model import hashutil
-from swh.model.model import Origin, Revision, Sha1Git
+from swh.model.model import Directory, Origin, Sha1Git
from swh.objstorage.exc import ObjNotFoundError
from swh.objstorage.factory import get_objstorage
from swh.scheduler import CONFIG as SWH_CONFIG
@@ -40,7 +41,7 @@
class ObjectsDict(TypedDict, total=False):
- revision: List[Dict]
+ directory: List[Dict]
origin: List[Dict]
origin_visit_status: List[Dict]
@@ -109,7 +110,7 @@
content, sha1_git for revision, directory, release, and id for origin
To implement a new concrete indexer, inherit from the object level
- classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
+ classes: :class:`ContentIndexer`, :class:`DirectoryIndexer`,
:class:`OriginIndexer`.
Then you need to implement the following functions:
@@ -583,11 +584,11 @@
return results
-class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]):
+class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult]):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements Revision indexing using the run method
+ implements Directory indexing using the run method
- Note: the :class:`RevisionIndexer` is not an instantiable object.
+ Note: the :class:`DirectoryIndexer` is not an instantiable object.
To use it in another context one should inherit from this class
and override the methods mentioned in the :class:`BaseIndexer`
class.
@@ -597,7 +598,7 @@
def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
"""Given a list of sha1_gits:
- - retrieve revisions from storage
+ - retrieve directories from storage
- execute the indexing computations
- store the results
@@ -612,36 +613,37 @@
)
del kwargs["policy_update"]
- revision_ids = [
+ directory_ids = [
hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
]
- revisions = []
- for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)):
- if not rev:
- # TODO: call self.index() with rev=None?
- self.log.warning(
- "Revision %s not found in storage", hashutil.hash_to_hex(rev_id)
- )
- continue
- revisions.append(rev.to_dict())
- return self.process_journal_objects({"revision": revisions})
+ return self._process_directories([(dir_id, None) for dir_id in directory_ids])
def process_journal_objects(self, objects: ObjectsDict) -> Dict:
"""Worker function for ``JournalClient``. Expects ``objects`` to have a single
- key, ``"revision"``."""
- assert set(objects) == {"revision"}
+ key, ``"directory"``."""
+ assert set(objects) == {"directory"}
+ return self._process_directories(
+ [(dir_["id"], Directory.from_dict(dir_)) for dir_ in objects["directory"]]
+ )
+
+ def _process_directories(
+ self,
+ directories: Union[List[Tuple[Sha1Git, Directory]], List[Tuple[Sha1Git, None]]],
+ ) -> Dict:
summary: Dict[str, Any] = {"status": "uneventful"}
results = []
- for rev in objects["revision"]:
+ # TODO: fetch raw_manifest when useful?
+
+ for (dir_id, dir_) in directories:
try:
- results.extend(self.index(rev["id"], Revision.from_dict(rev)))
+ results.extend(self.index(dir_id, dir_))
except Exception:
if not self.catch_exceptions:
raise
- self.log.exception("Problem when processing revision")
+ self.log.exception("Problem when processing directory")
sentry_sdk.capture_exception()
summary["status"] = "failed"
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -21,18 +21,18 @@
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer
+from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.origin_head import get_head_swhid
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
from swh.indexer.storage.model import (
ContentMetadataRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model import hashutil
-from swh.model.model import Origin, Revision, Sha1Git
+from swh.model.model import Directory, Origin, Sha1Git
from swh.model.swhids import ObjectType
REVISION_GET_BATCH_SIZE = 10
@@ -83,7 +83,7 @@
self,
id: Sha1,
data: Optional[bytes] = None,
- log_suffix="unknown revision",
+ log_suffix="unknown directory",
**kwargs,
) -> List[ContentMetadataRow]:
"""Index sha1s' content and store result.
@@ -145,18 +145,18 @@
}
-class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]):
- """Revision-level indexer
+class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
+ """Directory-level indexer
This indexer is in charge of:
- - filtering revisions already indexed in revision_intrinsic_metadata table
+ - filtering directories already indexed in directory_intrinsic_metadata table
with defined computation tool
- - retrieve all entry_files in root directory
+ - retrieve all entry_files in directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- - store the results for revision
+ - store the results for directory
"""
@@ -166,7 +166,7 @@
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones."""
- yield from self.idx_storage.revision_intrinsic_metadata_missing(
+ yield from self.idx_storage.directory_intrinsic_metadata_missing(
(
{
"id": sha1_git,
@@ -177,51 +177,52 @@
)
def index(
- self, id: Sha1Git, data: Optional[Revision], **kwargs
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Index rev by processing it and organizing result.
+ self, id: Sha1Git, data: Optional[Directory] = None, **kwargs
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Index directory by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- - if multiple file detected -> translation needed at revision level
+ - if multiple files detected -> translation needed at directory level
Args:
- id: sha1_git of the revision
- data: revision model object from storage
+ id: sha1_git of the directory
+ data: directory model object from storage
Returns:
- dict: dictionary representing a revision_intrinsic_metadata, with
+ list: single directory_intrinsic_metadata row for this directory, with
keys:
- - id (str): rev's identifier (sha1_git)
+ - id (bytes): directory's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata
"""
- rev = data
- assert isinstance(rev, Revision)
+ # Entries are always listed through storage: data.to_dict() is one dict
+ # describing the whole directory, so iterating it yields key names, and
+ # entry["type"] below would raise TypeError instead of reading an entry.
+ assert data is None or isinstance(data, Directory)
+ dir_ = list(self.storage.directory_ls(id, recursive=False))
try:
- root_dir = rev.directory
- dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
- if [entry["type"] for entry in dir_ls] == ["dir"]:
+ if [entry["type"] for entry in dir_] == ["dir"]:
# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs
- subdir = dir_ls[0]["target"]
- dir_ls = list(self.storage.directory_ls(subdir, recursive=False))
- files = [entry for entry in dir_ls if entry["type"] == "file"]
+ subdir = dir_[0]["target"]
+ dir_ = list(self.storage.directory_ls(subdir, recursive=False))
+ files = [entry for entry in dir_ if entry["type"] == "file"]
detected_files = detect_metadata(files)
- (mappings, metadata) = self.translate_revision_intrinsic_metadata(
+ (mappings, metadata) = self.translate_directory_intrinsic_metadata(
detected_files,
- log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id),
+ log_suffix="directory=%s" % hashutil.hash_to_hex(id),
)
except Exception as e:
- self.log.exception("Problem when indexing rev: %r", e)
+ self.log.exception("Problem when indexing dir: %r", e)
sentry_sdk.capture_exception()
return [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=id,
indexer_configuration_id=self.tool["id"],
mappings=mappings,
metadata=metadata,
@@ -229,7 +230,7 @@
]
def persist_index_computations(
- self, results: List[RevisionIntrinsicMetadataRow]
+ self, results: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -242,10 +243,10 @@
"""
# TODO: add functions in storage to keep data in
- # revision_intrinsic_metadata
- return self.idx_storage.revision_intrinsic_metadata_add(results)
+ # directory_intrinsic_metadata
+ return self.idx_storage.directory_intrinsic_metadata_add(results)
- def translate_revision_intrinsic_metadata(
+ def translate_directory_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
) -> Tuple[List[Any], Any]:
"""
@@ -316,17 +317,17 @@
class OriginMetadataIndexer(
- OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]
+ OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]
):
USE_TOOLS = False
def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
- self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
+ self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config)
def index_list(
self, origins: List[Origin], check_origin_known: bool = True, **kwargs
- ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]:
+ ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]:
head_rev_ids = []
origins_with_head = []
@@ -365,39 +366,41 @@
self.log.warning("Missing head revision of origin %r", origin.url)
continue
- for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev):
- # There is at most one rev_metadata
+ for dir_metadata in self.directory_metadata_indexer.index(rev.directory):
+ # There is at most one dir_metadata
orig_metadata = OriginIntrinsicMetadataRow(
- from_revision=rev_metadata.id,
+ from_directory=dir_metadata.id,
id=origin.url,
- metadata=rev_metadata.metadata,
- mappings=rev_metadata.mappings,
- indexer_configuration_id=rev_metadata.indexer_configuration_id,
+ metadata=dir_metadata.metadata,
+ mappings=dir_metadata.mappings,
+ indexer_configuration_id=dir_metadata.indexer_configuration_id,
)
- results.append((orig_metadata, rev_metadata))
+ results.append((orig_metadata, dir_metadata))
return results
def persist_index_computations(
self,
- results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
+ results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
) -> Dict[str, int]:
- # Deduplicate revisions
- rev_metadata: List[RevisionIntrinsicMetadataRow] = []
+ # Deduplicate directories
+ dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
summary: Dict = {}
- for (orig_item, rev_item) in results:
- assert rev_item.metadata == orig_item.metadata
- if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}):
+ for (orig_item, dir_item) in results:
+ assert dir_item.metadata == orig_item.metadata
+ if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
- if rev_item not in rev_metadata:
- rev_metadata.append(rev_item)
+ if dir_item not in dir_metadata:
+ dir_metadata.append(dir_item)
if orig_item not in orig_metadata:
orig_metadata.append(orig_item)
- if rev_metadata:
- summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
- summary.update(summary_rev)
+ if dir_metadata:
+ summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
+ dir_metadata
+ )
+ summary.update(summary_dir)
if orig_metadata:
summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -99,34 +99,34 @@
comment on column content_metadata.metadata is 'result of translation with defined format';
comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
--- The table revision_intrinsic_metadata provides a minimal set of intrinsic
+-- The table directory_intrinsic_metadata provides a minimal set of intrinsic
-- metadata detected with the detection tool (indexer_configuration_id) and
-- aggregated from the content_metadata translation.
-create table revision_intrinsic_metadata(
+create table directory_intrinsic_metadata(
id sha1_git not null,
metadata jsonb not null,
indexer_configuration_id bigint not null,
mappings text array not null
);
-comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision';
-comment on column revision_intrinsic_metadata.id is 'sha1_git of revision';
-comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
-comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
-comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory';
+comment on column directory_intrinsic_metadata.id is 'sha1_git of directory';
+comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
+comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
+comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
create table origin_intrinsic_metadata(
id text not null, -- origin url
metadata jsonb,
indexer_configuration_id bigint not null,
- from_revision sha1_git not null,
+ from_directory sha1_git not null,
metadata_tsvector tsvector,
mappings text array not null
);
comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
comment on column origin_intrinsic_metadata.id is 'url of the origin';
-comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision';
+comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory';
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
-comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';
+comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -273,25 +273,25 @@
-- end content_metadata functions
--- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
+-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata,
-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
--- swh_revision_intrinsic_metadata_missing must take place before calling this
+-- swh_directory_intrinsic_metadata_missing must take place before calling this
-- function.
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
--- tmp_revision_intrinsic_metadata, 2. call this function
-create or replace function swh_revision_intrinsic_metadata_add()
+-- tmp_directory_intrinsic_metadata, 2. call this function
+create or replace function swh_directory_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
+ from tmp_directory_intrinsic_metadata tcm
on conflict(id, indexer_configuration_id)
do update set
metadata = excluded.metadata,
@@ -302,19 +302,19 @@
end
$$;
-comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
+comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata';
--- create a temporary table for retrieving revision_intrinsic_metadata
-create or replace function swh_mktemp_revision_intrinsic_metadata()
+-- create a temporary table for bulk-adding directory_intrinsic_metadata
+create or replace function swh_mktemp_directory_intrinsic_metadata()
returns void
language sql
as $$
- create temporary table if not exists tmp_revision_intrinsic_metadata (
- like revision_intrinsic_metadata including defaults
+ create temporary table if not exists tmp_directory_intrinsic_metadata (
+ like directory_intrinsic_metadata including defaults
) on commit delete rows;
$$;
-comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata';
+comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata';
-- create a temporary table for retrieving origin_intrinsic_metadata
create or replace function swh_mktemp_origin_intrinsic_metadata()
@@ -380,8 +380,8 @@
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_directory,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -389,7 +389,7 @@
metadata = excluded.metadata,
metadata_tsvector = excluded.metadata_tsvector,
mappings = excluded.mappings,
- from_revision = excluded.from_revision;
+ from_directory = excluded.from_directory;
get diagnostics res = ROW_COUNT;
return res;
diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql
--- a/swh/indexer/sql/60-indexes.sql
+++ b/swh/indexer/sql/60-indexes.sql
@@ -25,12 +25,12 @@
alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
--- revision_intrinsic_metadata
-create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id);
-alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey;
+-- directory_intrinsic_metadata
+create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id);
+alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey;
-alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
-alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey;
+alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey;
-- content_mimetype
create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id);
diff --git a/swh/indexer/sql/upgrades/134.sql b/swh/indexer/sql/upgrades/134.sql
new file mode 100644
--- /dev/null
+++ b/swh/indexer/sql/upgrades/134.sql
@@ -0,0 +1,18 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 133
+-- to_version: 134
+-- description: replace revision_intrinsic_metadata with directory_intrinsic_metadata
+-- and origin_intrinsic_metadata.from_revision with origin_intrinsic_metadata.from_directory
+-- This migration works by dropping both tables and reindexing from scratch.
+
+insert into dbversion(version, release, description)
+ values(134, now(), 'Work In Progress');
+
+drop table origin_intrinsic_metadata;
+drop table revision_intrinsic_metadata;
+drop function swh_revision_intrinsic_metadata_add;
+drop function swh_mktemp_revision_intrinsic_metadata;
+
+\ir '../30-schema.sql'
+\ir '../50-func.sql'
+\ir '../60-indexes.sql'
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -30,8 +30,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -522,52 +522,52 @@
@timed
@db_transaction()
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [
obj[0]
- for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur)
+ for obj in db.directory_intrinsic_metadata_missing_from_list(metadata, cur)
]
@timed
@db_transaction()
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1], db=None, cur=None
- ) -> List[RevisionIntrinsicMetadataRow]:
+ ) -> List[DirectoryIntrinsicMetadataRow]:
return [
- RevisionIntrinsicMetadataRow.from_dict(
+ DirectoryIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
- dict(zip(db.revision_intrinsic_metadata_cols, c))
+ dict(zip(db.directory_intrinsic_metadata_cols, c))
)
)
- for c in db.revision_intrinsic_metadata_get_from_list(ids, cur)
+ for c in db.directory_intrinsic_metadata_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
- def revision_intrinsic_metadata_add(
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
db=None,
cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
- self.journal_writer.write_additions("revision_intrinsic_metadata", metadata)
+ self.journal_writer.write_additions("directory_intrinsic_metadata", metadata)
- db.mktemp_revision_intrinsic_metadata(cur)
+ db.mktemp_directory_intrinsic_metadata(cur)
db.copy_to(
[m.to_dict() for m in metadata],
- "tmp_revision_intrinsic_metadata",
+ "tmp_directory_intrinsic_metadata",
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
- count = db.revision_intrinsic_metadata_add_from_temp(cur)
+ count = db.directory_intrinsic_metadata_add_from_temp(cur)
return {
- "revision_intrinsic_metadata:add": count,
+ "directory_intrinsic_metadata:add": count,
}
@timed
@@ -602,7 +602,13 @@
db.copy_to(
[m.to_dict() for m in metadata],
"tmp_origin_intrinsic_metadata",
- ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
+ [
+ "id",
+ "metadata",
+ "indexer_configuration_id",
+ "from_directory",
+ "mappings",
+ ],
cur,
)
count = db.origin_intrinsic_metadata_add_from_temp(cur)
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -350,18 +350,18 @@
"content_metadata", ids, self.content_metadata_cols, cur=cur
)
- revision_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
+ directory_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
- def revision_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
+ def directory_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata."""
yield from self._missing_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
metadata,
- self.revision_intrinsic_metadata_hash_keys,
+ self.directory_intrinsic_metadata_hash_keys,
cur=cur,
)
- revision_intrinsic_metadata_cols = [
+ directory_intrinsic_metadata_cols = [
"id",
"metadata",
"mappings",
@@ -371,27 +371,27 @@
"tool_configuration",
]
- @stored_procedure("swh_mktemp_revision_intrinsic_metadata")
- def mktemp_revision_intrinsic_metadata(self, cur=None):
+ @stored_procedure("swh_mktemp_directory_intrinsic_metadata")
+ def mktemp_directory_intrinsic_metadata(self, cur=None):
pass
- def revision_intrinsic_metadata_add_from_temp(self, cur=None):
+ def directory_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_revision_intrinsic_metadata_add()")
+ cur.execute("select * from swh_directory_intrinsic_metadata_add()")
return cur.fetchone()[0]
- def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
+ def directory_intrinsic_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
ids,
- self.revision_intrinsic_metadata_cols,
+ self.directory_intrinsic_metadata_cols,
cur=cur,
)
origin_intrinsic_metadata_cols = [
"id",
"metadata",
- "from_revision",
+ "from_directory",
"mappings",
"tool_id",
"tool_name",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -38,8 +38,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -250,8 +250,8 @@
self._content_ctags = SubStorage(ContentCtagsRow, *args)
self._licenses = SubStorage(ContentLicenseRow, *args)
self._content_metadata = SubStorage(ContentMetadataRow, *args)
- self._revision_intrinsic_metadata = SubStorage(
- RevisionIntrinsicMetadataRow, *args
+ self._directory_intrinsic_metadata = SubStorage(
+ DirectoryIntrinsicMetadataRow, *args
)
self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args)
@@ -369,21 +369,21 @@
added = self._content_metadata.add(metadata)
return {"content_metadata:add": added}
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
- return self._revision_intrinsic_metadata.missing(metadata)
+ return self._directory_intrinsic_metadata.missing(metadata)
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- return self._revision_intrinsic_metadata.get(ids)
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ return self._directory_intrinsic_metadata.get(ids)
- def revision_intrinsic_metadata_add(
- self, metadata: List[RevisionIntrinsicMetadataRow]
+ def directory_intrinsic_metadata_add(
+ self, metadata: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._revision_intrinsic_metadata.add(metadata)
- return {"revision_intrinsic_metadata:add": added}
+ added = self._directory_intrinsic_metadata.add(metadata)
+ return {"directory_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_get(
self, urls: Iterable[str]
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -15,8 +15,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
TResult = TypeVar("TResult")
@@ -341,8 +341,8 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/missing")
- def revision_intrinsic_metadata_missing(
+ @remote_api_endpoint("directory_intrinsic_metadata/missing")
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
"""List metadata missing from storage.
@@ -350,7 +350,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id** (bytes): sha1_git revision identifier
+ - **id** (bytes): sha1_git directory identifier
- **indexer_configuration_id** (int): tool used to compute
the results
@@ -360,11 +360,11 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata")
- def revision_intrinsic_metadata_get(
+ @remote_api_endpoint("directory_intrinsic_metadata")
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Retrieve revision metadata per id.
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Retrieve directory metadata per id.
Args:
ids (iterable): sha1 checksums
@@ -375,10 +375,10 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/add")
- def revision_intrinsic_metadata_add(
+ @remote_api_endpoint("directory_intrinsic_metadata/add")
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
) -> Dict[str, int]:
"""Add metadata not present in storage.
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -120,8 +120,8 @@
@attr.s
-class RevisionIntrinsicMetadataRow(BaseRow):
- object_type: Final = "revision_intrinsic_metadata"
+class DirectoryIntrinsicMetadataRow(BaseRow):
+ object_type: Final = "directory_intrinsic_metadata"
id = attr.ib(type=Sha1Git)
metadata = attr.ib(type=Dict[str, Any])
@@ -134,5 +134,5 @@
id = attr.ib(type=str)
metadata = attr.ib(type=Dict[str, Any])
- from_revision = attr.ib(type=Sha1Git)
+ from_directory = attr.ib(type=Sha1Git)
mappings = attr.ib(type=List[str])
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -23,7 +23,7 @@
TASK_NAMES: List[Tuple[str, str]] = [
# (scheduler-task-type, task-class-test-name)
- ("index-revision-metadata", "revision_intrinsic_metadata"),
+ ("index-directory-metadata", "directory_intrinsic_metadata"),
("index-origin-metadata", "origin_intrinsic_metadata"),
]
diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py
--- a/swh/indexer/tests/storage/conftest.py
+++ b/swh/indexer/tests/storage/conftest.py
@@ -41,9 +41,9 @@
data.tools = tools
data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
- data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
- data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
- data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
+ data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
+ data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
+ data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
data.origin_url_1 = "file:///dev/0/zero" # 44434341
data.origin_url_2 = "file:///dev/1/one" # 44434342
data.origin_url_3 = "file:///dev/2/two" # 54974445
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -19,8 +19,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.hashutil import hash_to_bytes
@@ -289,37 +289,37 @@
etype = self.endpoint_type
tool = data.tools[self.tool_name]
- data_rev1 = self.row_class.from_dict(
+ data_dir1 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
)
- data_rev2 = self.row_class.from_dict(
+ data_dir2 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[1],
"indexer_configuration_id": tool["id"],
}
)
# when
- summary = endpoint(storage, etype, "add")([data_rev1])
+ summary = endpoint(storage, etype, "add")([data_dir1])
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
- endpoint(storage, etype, "add")([data_rev2, data_rev2])
+ endpoint(storage, etype, "add")([data_dir2, data_dir2])
# then
actual_data = list(
- endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1])
+ endpoint(storage, etype, "get")([data.directory_id_2, data.directory_id_1])
)
expected_data = [
self.row_class.from_dict(
- {"id": data.revision_id_2, **self.example_data[0], "tool": tool}
+ {"id": data.directory_id_2, **self.example_data[0], "tool": tool}
)
]
assert actual_data == expected_data
@@ -806,11 +806,11 @@
row_class = ContentMetadataRow
-class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester):
- """Test Indexer Storage revision_intrinsic_metadata related methods"""
+class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester):
+ """Test Indexer Storage directory_intrinsic_metadata related methods"""
tool_name = "swh-metadata-detector"
- endpoint_type = "revision_intrinsic_metadata"
+ endpoint_type = "directory_intrinsic_metadata"
example_data = [
{
"metadata": {
@@ -830,7 +830,7 @@
"mappings": ["mapping2"],
},
]
- row_class = RevisionIntrinsicMetadataRow
+ row_class = DirectoryIntrinsicMetadataRow
class TestIndexerStorageContentFossologyLicense(StorageETypeTester):
@@ -1102,8 +1102,8 @@
"version": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1113,11 +1113,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
storage.origin_intrinsic_metadata_add([metadata_origin])
# then
@@ -1130,7 +1130,7 @@
id=data.origin_url_1,
metadata=metadata,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=["mapping1"],
)
]
@@ -1156,8 +1156,8 @@
"version": None,
"name": None,
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata_v1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1167,11 +1167,11 @@
metadata=metadata_v1.copy(),
indexer_configuration_id=tool_id,
mappings=[],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add([metadata_origin_v1])
# when
@@ -1185,7 +1185,7 @@
id=data.origin_url_1,
metadata=metadata_v1,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=[],
)
]
@@ -1199,16 +1199,16 @@
"author": "MG",
}
)
- metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
+ metadata_dir_v2 = attr.evolve(metadata_dir_v1, metadata=metadata_v2)
metadata_origin_v2 = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v2.copy(),
indexer_configuration_id=tool_id,
mappings=["npm"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
- storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v2])
storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
@@ -1220,7 +1220,7 @@
id=data.origin_url_1,
metadata=metadata_v2,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
mappings=["npm"],
)
]
@@ -1252,8 +1252,8 @@
"mappings": [],
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata={
"version": None,
"name": None,
@@ -1265,7 +1265,7 @@
data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data1,
)
@@ -1274,7 +1274,7 @@
data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data2,
)
@@ -1287,7 +1287,7 @@
data_v2b = list(reversed(data_v2[0:-1]))
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add(data_v1)
# when
@@ -1296,7 +1296,7 @@
expected_data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data1,
)
@@ -1326,7 +1326,7 @@
expected_data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data2,
)
@@ -1351,8 +1351,8 @@
"developmentStatus": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1362,11 +1362,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
with pytest.raises(DuplicateId):
storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin])
@@ -1381,8 +1381,8 @@
metadata1 = {
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1392,13 +1392,13 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1408,13 +1408,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1444,8 +1444,8 @@
"Jane Doe",
]
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1455,7 +1455,7 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": [
@@ -1463,8 +1463,8 @@
"Jane Doe",
]
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1474,13 +1474,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1508,8 +1508,8 @@
"@context": "foo",
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
@@ -1519,14 +1519,14 @@
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"@context": "foo",
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1536,13 +1536,13 @@
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
metadata3 = {
"@context": "foo",
}
- metadata3_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_3,
+ metadata3_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_3,
metadata=metadata3,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1552,14 +1552,14 @@
metadata=metadata3,
mappings=["pkg-info"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_3,
+ from_directory=data.directory_id_3,
)
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
- storage.revision_intrinsic_metadata_add([metadata3_rev])
+ storage.directory_intrinsic_metadata_add([metadata3_dir])
storage.origin_intrinsic_metadata_add([metadata3_origin])
def test_origin_intrinsic_metadata_search_by_producer(
@@ -1685,7 +1685,7 @@
},
mappings=["npm", "gemspec"],
tool=tool2,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
],
next_page_token=None,
diff --git a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py
--- a/swh/indexer/tests/tasks.py
+++ b/swh/indexer/tests/tasks.py
@@ -1,12 +1,12 @@
from celery import current_app as app
-from swh.indexer.metadata import OriginMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import DirectoryMetadataIndexer, OriginMetadataIndexer
from .test_metadata import ContentMetadataTestIndexer
from .utils import BASE_TEST_CONFIG
-class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
+class DirectoryMetadataTestIndexer(DirectoryMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
@@ -29,12 +29,12 @@
return {**BASE_TEST_CONFIG, "tools": []}
def _prepare_sub_indexers(self):
- self.revision_metadata_indexer = RevisionMetadataTestIndexer()
+ self.directory_metadata_indexer = DirectoryMetadataTestIndexer()
@app.task
-def revision_intrinsic_metadata(*args, **kwargs):
- indexer = RevisionMetadataTestIndexer()
+def directory_intrinsic_metadata(*args, **kwargs):
+ indexer = DirectoryMetadataTestIndexer()
indexer.run(*args, **kwargs)
print("REV RESULT=", indexer.results)
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -16,14 +16,14 @@
from swh.indexer.cli import indexer_cli_group
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.journal.writer import get_journal_writer
from swh.model.hashutil import hash_to_bytes
from swh.model.model import OriginVisitStatus
-from .utils import REVISION
+from .utils import DIRECTORY2, REVISION
def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
@@ -40,15 +40,15 @@
origin_metadata = [
OriginIntrinsicMetadataRow(
id="file://dev/%04d" % origin_id,
- from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)),
+ from_directory=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
mappings=["mapping%d" % (origin_id % 10)],
)
for origin_id in range(nb_rows)
]
- revision_metadata = [
- RevisionIntrinsicMetadataRow(
+ directory_metadata = [
+ DirectoryIntrinsicMetadataRow(
id=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
@@ -57,7 +57,7 @@
for origin_id in range(nb_rows)
]
- idx_storage.revision_intrinsic_metadata_add(revision_metadata)
+ idx_storage.directory_intrinsic_metadata_add(directory_metadata)
idx_storage.origin_intrinsic_metadata_add(origin_metadata)
return [tool["id"] for tool in tools]
@@ -605,10 +605,10 @@
)
mocker.patch(
- "swh.indexer.metadata.RevisionMetadataIndexer.index",
+ "swh.indexer.metadata.DirectoryMetadataIndexer.index",
return_value=[
- RevisionIntrinsicMetadataRow(
- id=REVISION.id,
+ DirectoryIntrinsicMetadataRow(
+ id=DIRECTORY2.id,
indexer_configuration_id=1,
mappings=["cff"],
metadata={"foo": "bar"},
@@ -645,7 +645,7 @@
expected_results = [
OriginIntrinsicMetadataRow(
id=status.origin,
- from_revision=REVISION.id,
+ from_directory=DIRECTORY2.id,
tool={"id": 1, **swh_indexer_config["tools"]},
mappings=["cff"],
metadata={"foo": "bar"},
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -11,13 +11,13 @@
from swh.indexer.indexer import (
ContentIndexer,
ContentPartitionIndexer,
+ DirectoryIndexer,
OriginIndexer,
- RevisionIndexer,
)
from swh.indexer.storage import PagedResult, Sha1
from swh.model.model import Content
-from .utils import BASE_TEST_CONFIG, REVISION
+from .utils import BASE_TEST_CONFIG, DIRECTORY2
class _TestException(Exception):
@@ -49,7 +49,7 @@
pass
-class CrashingRevisionIndexer(CrashingIndexerMixin, RevisionIndexer):
+class CrashingDirectoryIndexer(CrashingIndexerMixin, DirectoryIndexer):
pass
@@ -86,14 +86,14 @@
indexer.run([b"foo"])
-def test_revision_indexer_catch_exceptions():
- indexer = CrashingRevisionIndexer(config=BASE_TEST_CONFIG)
+def test_directory_indexer_catch_exceptions():
+ indexer = CrashingDirectoryIndexer(config=BASE_TEST_CONFIG)
indexer.storage = Mock()
- indexer.storage.revision_get.return_value = [REVISION]
+ indexer.storage.directory_get.return_value = [DIRECTORY2]
assert indexer.run([b"foo"]) == {"status": "failed"}
- assert indexer.process_journal_objects({"revision": [REVISION.to_dict()]}) == {
+ assert indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) == {
"status": "failed"
}
@@ -103,7 +103,7 @@
indexer.run([b"foo"])
with pytest.raises(_TestException):
- indexer.process_journal_objects({"revision": [REVISION.to_dict()]})
+ indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]})
def test_origin_indexer_catch_exceptions():
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -10,16 +10,16 @@
import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
-from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
-from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow
-from swh.indexer.tests.utils import DIRECTORY2, REVISION
+from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
+from swh.indexer.tests.utils import DIRECTORY2
from swh.model.hashutil import hash_to_bytes
-from swh.model.model import Directory, DirectoryEntry, Revision
+from swh.model.model import Directory, DirectoryEntry
from .utils import (
BASE_TEST_CONFIG,
@@ -43,10 +43,10 @@
"""
def parse_config_file(self, *args, **kwargs):
- assert False, "should not be called; the rev indexer configures it."
+ assert False, "should not be called; the dir indexer configures it."
-REVISION_METADATA_CONFIG = {
+DIRECTORY_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
@@ -1154,8 +1154,8 @@
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
- def test_revision_metadata_indexer(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
@@ -1163,8 +1163,7 @@
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ dir_ = DIRECTORY2
metadata_indexer.idx_storage.content_metadata_add(
[
@@ -1176,15 +1175,17 @@
]
)
- metadata_indexer.run([rev.id])
+ metadata_indexer.run([dir_.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get(
+ [DIRECTORY2.id]
+ )
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=dir_.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -1197,33 +1198,27 @@
# then
assert results == expected_results
- def test_revision_metadata_indexer_single_root_dir(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer_single_root_dir(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory, that is the only directory at the root
- # of the revision
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ # of the indexed directory
+ dir_ = DIRECTORY2
- directory = Directory(
+ new_dir = Directory(
entries=(
DirectoryEntry(
name=b"foobar-1.0.0",
type="dir",
- target=rev.directory,
+ target=dir_.id,
perms=16384,
),
),
)
- assert directory.id is not None
- metadata_indexer.storage.directory_add([directory])
-
- new_rev_dict = {**rev.to_dict(), "directory": directory.id}
- new_rev_dict.pop("id")
- new_rev = Revision.from_dict(new_rev_dict)
- metadata_indexer.storage.revision_add([new_rev])
+ assert new_dir.id is not None
+ metadata_indexer.storage.directory_add([new_dir])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
@@ -1240,15 +1235,15 @@
]
)
- metadata_indexer.run([new_rev.id])
+ metadata_indexer.run([new_dir.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=new_rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=new_dir.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -11,14 +11,14 @@
from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.model import Origin
from swh.storage.interface import StorageInterface
from .test_metadata import TRANSLATOR_TOOL
-from .utils import REVISION, YARN_PARSER_METADATA
+from .utils import DIRECTORY2, YARN_PARSER_METADATA
@pytest.fixture
@@ -41,9 +41,9 @@
tool = swh_indexer_config["tools"]
- rev_id = REVISION.id
- rev_metadata = RevisionIntrinsicMetadataRow(
- id=rev_id,
+ dir_id = DIRECTORY2.id
+ rev_metadata = DirectoryIntrinsicMetadataRow(
+ id=dir_id,
tool=tool,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -51,12 +51,12 @@
origin_metadata = OriginIntrinsicMetadataRow(
id=origin,
tool=tool,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
- rev_results = list(idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
for rev_result in rev_results:
assert rev_result.tool
del rev_result.tool["id"]
@@ -82,9 +82,9 @@
indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert len(rev_results) == 1
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
@@ -121,12 +121,12 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == [
- RevisionIntrinsicMetadataRow(
- id=rev_id,
+ DirectoryIntrinsicMetadataRow(
+ id=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=rev_results[0].tool,
@@ -140,7 +140,7 @@
assert orig_results == [
OriginIntrinsicMetadataRow(
id=origin2,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=orig_results[0].tool,
@@ -148,7 +148,7 @@
]
-def test_origin_metadata_indexer_duplicate_revision(
+def test_origin_metadata_indexer_duplicate_directory(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
@@ -162,9 +162,9 @@
origin2 = "https://github.com/librariesio/yarn-parser.git"
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert len(rev_results) == 1
orig_results = list(
@@ -185,9 +185,9 @@
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
@@ -204,15 +204,15 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=(["npm"], {"@context": "foo"}),
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
@@ -229,15 +229,15 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=None,
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 6:55 AM (10 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3231971

Event Timeline