Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066372
D7937.id28590.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
67 KB
Subscribers
None
D7937.id28590.diff
View Options
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -18,6 +18,7 @@
List,
Optional,
Set,
+ Tuple,
TypeVar,
Union,
)
@@ -31,7 +32,7 @@
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.model import hashutil
-from swh.model.model import Origin, Revision, Sha1Git
+from swh.model.model import Directory, Origin, Sha1Git
from swh.objstorage.exc import ObjNotFoundError
from swh.objstorage.factory import get_objstorage
from swh.scheduler import CONFIG as SWH_CONFIG
@@ -40,7 +41,7 @@
class ObjectsDict(TypedDict, total=False):
- revision: List[Dict]
+ directory: List[Dict]
origin: List[Dict]
origin_visit_status: List[Dict]
@@ -109,7 +110,7 @@
content, sha1_git for revision, directory, release, and id for origin
To implement a new concrete indexer, inherit from the object level
- classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
+ classes: :class:`ContentIndexer`, :class:`DirectoryIndexer`,
:class:`OriginIndexer`.
Then you need to implement the following functions:
@@ -583,11 +584,11 @@
return results
-class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]):
+class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult]):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements Revision indexing using the run method
+ implements Directory indexing using the run method
- Note: the :class:`RevisionIndexer` is not an instantiable object.
+ Note: the :class:`DirectoryIndexer` is not an instantiable object.
To use it in another context one should inherit from this class
and override the methods mentioned in the :class:`BaseIndexer`
class.
@@ -597,7 +598,7 @@
def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
"""Given a list of sha1_gits:
- - retrieve revisions from storage
+ - retrieve directories from storage
- execute the indexing computations
- store the results
@@ -612,36 +613,37 @@
)
del kwargs["policy_update"]
- revision_ids = [
+ directory_ids = [
hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
]
- revisions = []
- for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)):
- if not rev:
- # TODO: call self.index() with rev=None?
- self.log.warning(
- "Revision %s not found in storage", hashutil.hash_to_hex(rev_id)
- )
- continue
- revisions.append(rev.to_dict())
- return self.process_journal_objects({"revision": revisions})
+ return self._process_directories([(dir_id, None) for dir_id in directory_ids])
def process_journal_objects(self, objects: ObjectsDict) -> Dict:
"""Worker function for ``JournalClient``. Expects ``objects`` to have a single
- key, ``"revision"``."""
- assert set(objects) == {"revision"}
+ key, ``"directory"``."""
+ assert set(objects) == {"directory"}
+ return self._process_directories(
+ [(dir_["id"], Directory.from_dict(dir_)) for dir_ in objects["directory"]]
+ )
+
+ def _process_directories(
+ self,
+ directories: Union[List[Tuple[Sha1Git, Directory]], List[Tuple[Sha1Git, None]]],
+ ) -> Dict:
summary: Dict[str, Any] = {"status": "uneventful"}
results = []
- for rev in objects["revision"]:
+ # TODO: fetch raw_manifest when useful?
+
+ for (dir_id, dir_) in directories:
try:
- results.extend(self.index(rev["id"], Revision.from_dict(rev)))
+ results.extend(self.index(dir_id, dir_))
except Exception:
if not self.catch_exceptions:
raise
- self.log.exception("Problem when processing revision")
+ self.log.exception("Problem when processing directory")
sentry_sdk.capture_exception()
summary["status"] = "failed"
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -21,18 +21,18 @@
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer
+from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.origin_head import get_head_swhid
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
from swh.indexer.storage.model import (
ContentMetadataRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model import hashutil
-from swh.model.model import Origin, Revision, Sha1Git
+from swh.model.model import Directory, Origin, Sha1Git
from swh.model.swhids import ObjectType
REVISION_GET_BATCH_SIZE = 10
@@ -83,7 +83,7 @@
self,
id: Sha1,
data: Optional[bytes] = None,
- log_suffix="unknown revision",
+ log_suffix="unknown directory",
**kwargs,
) -> List[ContentMetadataRow]:
"""Index sha1s' content and store result.
@@ -145,18 +145,18 @@
}
-class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]):
- """Revision-level indexer
+class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
+ """Directory-level indexer
This indexer is in charge of:
- - filtering revisions already indexed in revision_intrinsic_metadata table
+ - filtering directories already indexed in directory_intrinsic_metadata table
with defined computation tool
- - retrieve all entry_files in root directory
+ - retrieve all entry_files in directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- - store the results for revision
+ - store the results for directory
"""
@@ -166,7 +166,7 @@
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones."""
- yield from self.idx_storage.revision_intrinsic_metadata_missing(
+ yield from self.idx_storage.directory_intrinsic_metadata_missing(
(
{
"id": sha1_git,
@@ -177,51 +177,52 @@
)
def index(
- self, id: Sha1Git, data: Optional[Revision], **kwargs
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Index rev by processing it and organizing result.
+ self, id: Sha1Git, data: Optional[Directory] = None, **kwargs
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Index directory by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- - if multiple file detected -> translation needed at revision level
+ - if multiple files detected -> translation needed at directory level
Args:
- id: sha1_git of the revision
- data: revision model object from storage
+ id: sha1_git of the directory
+ data: directory model object from storage
Returns:
- dict: dictionary representing a revision_intrinsic_metadata, with
+ dict: dictionary representing a directory_intrinsic_metadata, with
keys:
- - id (str): rev's identifier (sha1_git)
+ - id (bytes): directory's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata
"""
- rev = data
- assert isinstance(rev, Revision)
+ if data is None:
+ dir_ = list(self.storage.directory_ls(id, recursive=False))
+ else:
+ assert isinstance(data, Directory)
+ dir_ = data.to_dict()
try:
- root_dir = rev.directory
- dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
- if [entry["type"] for entry in dir_ls] == ["dir"]:
+ if [entry["type"] for entry in dir_] == ["dir"]:
# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs
- subdir = dir_ls[0]["target"]
- dir_ls = list(self.storage.directory_ls(subdir, recursive=False))
- files = [entry for entry in dir_ls if entry["type"] == "file"]
+ subdir = dir_[0]["target"]
+ dir_ = list(self.storage.directory_ls(subdir, recursive=False))
+ files = [entry for entry in dir_ if entry["type"] == "file"]
detected_files = detect_metadata(files)
- (mappings, metadata) = self.translate_revision_intrinsic_metadata(
+ (mappings, metadata) = self.translate_directory_intrinsic_metadata(
detected_files,
- log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id),
+ log_suffix="directory=%s" % hashutil.hash_to_hex(id),
)
except Exception as e:
- self.log.exception("Problem when indexing rev: %r", e)
+ self.log.exception("Problem when indexing dir: %r", e)
sentry_sdk.capture_exception()
return [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=id,
indexer_configuration_id=self.tool["id"],
mappings=mappings,
metadata=metadata,
@@ -229,7 +230,7 @@
]
def persist_index_computations(
- self, results: List[RevisionIntrinsicMetadataRow]
+ self, results: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -242,10 +243,10 @@
"""
# TODO: add functions in storage to keep data in
- # revision_intrinsic_metadata
- return self.idx_storage.revision_intrinsic_metadata_add(results)
+ # directory_intrinsic_metadata
+ return self.idx_storage.directory_intrinsic_metadata_add(results)
- def translate_revision_intrinsic_metadata(
+ def translate_directory_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
) -> Tuple[List[Any], Any]:
"""
@@ -316,17 +317,17 @@
class OriginMetadataIndexer(
- OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]
+ OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]
):
USE_TOOLS = False
def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
- self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
+ self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config)
def index_list(
self, origins: List[Origin], check_origin_known: bool = True, **kwargs
- ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]:
+ ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]:
head_rev_ids = []
origins_with_head = []
@@ -365,39 +366,41 @@
self.log.warning("Missing head revision of origin %r", origin.url)
continue
- for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev):
- # There is at most one rev_metadata
+ for dir_metadata in self.directory_metadata_indexer.index(rev.directory):
+ # There is at most one dir_metadata
orig_metadata = OriginIntrinsicMetadataRow(
- from_revision=rev_metadata.id,
+ from_directory=dir_metadata.id,
id=origin.url,
- metadata=rev_metadata.metadata,
- mappings=rev_metadata.mappings,
- indexer_configuration_id=rev_metadata.indexer_configuration_id,
+ metadata=dir_metadata.metadata,
+ mappings=dir_metadata.mappings,
+ indexer_configuration_id=dir_metadata.indexer_configuration_id,
)
- results.append((orig_metadata, rev_metadata))
+ results.append((orig_metadata, dir_metadata))
return results
def persist_index_computations(
self,
- results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
+ results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
) -> Dict[str, int]:
- # Deduplicate revisions
- rev_metadata: List[RevisionIntrinsicMetadataRow] = []
+ # Deduplicate directories
+ dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
summary: Dict = {}
- for (orig_item, rev_item) in results:
- assert rev_item.metadata == orig_item.metadata
- if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}):
+ for (orig_item, dir_item) in results:
+ assert dir_item.metadata == orig_item.metadata
+ if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
- if rev_item not in rev_metadata:
- rev_metadata.append(rev_item)
+ if dir_item not in dir_metadata:
+ dir_metadata.append(dir_item)
if orig_item not in orig_metadata:
orig_metadata.append(orig_item)
- if rev_metadata:
- summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
- summary.update(summary_rev)
+ if dir_metadata:
+ summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
+ dir_metadata
+ )
+ summary.update(summary_dir)
if orig_metadata:
summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -99,34 +99,34 @@
comment on column content_metadata.metadata is 'result of translation with defined format';
comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
--- The table revision_intrinsic_metadata provides a minimal set of intrinsic
+-- The table directory_intrinsic_metadata provides a minimal set of intrinsic
-- metadata detected with the detection tool (indexer_configuration_id) and
-- aggregated from the content_metadata translation.
-create table revision_intrinsic_metadata(
+create table directory_intrinsic_metadata(
id sha1_git not null,
metadata jsonb not null,
indexer_configuration_id bigint not null,
mappings text array not null
);
-comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision';
-comment on column revision_intrinsic_metadata.id is 'sha1_git of revision';
-comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
-comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
-comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory';
+comment on column directory_intrinsic_metadata.id is 'sha1_git of directory';
+comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
+comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
+comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
create table origin_intrinsic_metadata(
id text not null, -- origin url
metadata jsonb,
indexer_configuration_id bigint not null,
- from_revision sha1_git not null,
+ from_directory sha1_git not null,
metadata_tsvector tsvector,
mappings text array not null
);
comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
comment on column origin_intrinsic_metadata.id is 'url of the origin';
-comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision';
+comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory';
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
-comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';
+comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -273,25 +273,25 @@
-- end content_metadata functions
--- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
+-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata,
-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
--- swh_revision_intrinsic_metadata_missing must take place before calling this
+-- swh_directory_intrinsic_metadata_missing must take place before calling this
-- function.
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
--- tmp_revision_intrinsic_metadata, 2. call this function
-create or replace function swh_revision_intrinsic_metadata_add()
+-- tmp_directory_intrinsic_metadata, 2. call this function
+create or replace function swh_directory_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
+ from tmp_directory_intrinsic_metadata tcm
on conflict(id, indexer_configuration_id)
do update set
metadata = excluded.metadata,
@@ -302,19 +302,19 @@
end
$$;
-comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
+comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata';
--- create a temporary table for retrieving revision_intrinsic_metadata
-create or replace function swh_mktemp_revision_intrinsic_metadata()
+-- create a temporary table for retrieving directory_intrinsic_metadata
+create or replace function swh_mktemp_directory_intrinsic_metadata()
returns void
language sql
as $$
- create temporary table if not exists tmp_revision_intrinsic_metadata (
- like revision_intrinsic_metadata including defaults
+ create temporary table if not exists tmp_directory_intrinsic_metadata (
+ like directory_intrinsic_metadata including defaults
) on commit delete rows;
$$;
-comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata';
+comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata';
-- create a temporary table for retrieving origin_intrinsic_metadata
create or replace function swh_mktemp_origin_intrinsic_metadata()
@@ -380,8 +380,8 @@
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_directory,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -389,7 +389,7 @@
metadata = excluded.metadata,
metadata_tsvector = excluded.metadata_tsvector,
mappings = excluded.mappings,
- from_revision = excluded.from_revision;
+ from_directory = excluded.from_directory;
get diagnostics res = ROW_COUNT;
return res;
diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql
--- a/swh/indexer/sql/60-indexes.sql
+++ b/swh/indexer/sql/60-indexes.sql
@@ -25,12 +25,12 @@
alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
--- revision_intrinsic_metadata
-create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id);
-alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey;
+-- directory_intrinsic_metadata
+create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id);
+alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey;
-alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
-alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey;
+alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey;
-- content_mimetype
create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id);
diff --git a/swh/indexer/sql/upgrades/134.sql b/swh/indexer/sql/upgrades/134.sql
new file mode 100644
--- /dev/null
+++ b/swh/indexer/sql/upgrades/134.sql
@@ -0,0 +1,18 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 133
+-- to_version: 134
+-- description: replace revision_intrinsic_metadata with directory_intrinsic_metadata
+-- and origin_intrinsic_metadata.from_revision with origin_intrinsic_metadata.from_directory
+-- This migration works by dropping both tables and reindexing from scratch.
+
+insert into dbversion(version, release, description)
+ values(134, now(), 'Work In Progress');
+
+drop table origin_intrinsic_metadata;
+drop table revision_intrinsic_metadata;
+drop function swh_revision_intrinsic_metadata_add;
+drop function swh_mktemp_revision_intrinsic_metadata;
+
+\ir '../30-schema.sql'
+\ir '../50-func.sql'
+\ir '../60-indexes.sql'
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -30,8 +30,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -522,52 +522,52 @@
@timed
@db_transaction()
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [
obj[0]
- for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur)
+ for obj in db.directory_intrinsic_metadata_missing_from_list(metadata, cur)
]
@timed
@db_transaction()
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1], db=None, cur=None
- ) -> List[RevisionIntrinsicMetadataRow]:
+ ) -> List[DirectoryIntrinsicMetadataRow]:
return [
- RevisionIntrinsicMetadataRow.from_dict(
+ DirectoryIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
- dict(zip(db.revision_intrinsic_metadata_cols, c))
+ dict(zip(db.directory_intrinsic_metadata_cols, c))
)
)
- for c in db.revision_intrinsic_metadata_get_from_list(ids, cur)
+ for c in db.directory_intrinsic_metadata_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
- def revision_intrinsic_metadata_add(
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
db=None,
cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
- self.journal_writer.write_additions("revision_intrinsic_metadata", metadata)
+ self.journal_writer.write_additions("directory_intrinsic_metadata", metadata)
- db.mktemp_revision_intrinsic_metadata(cur)
+ db.mktemp_directory_intrinsic_metadata(cur)
db.copy_to(
[m.to_dict() for m in metadata],
- "tmp_revision_intrinsic_metadata",
+ "tmp_directory_intrinsic_metadata",
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
- count = db.revision_intrinsic_metadata_add_from_temp(cur)
+ count = db.directory_intrinsic_metadata_add_from_temp(cur)
return {
- "revision_intrinsic_metadata:add": count,
+ "directory_intrinsic_metadata:add": count,
}
@timed
@@ -602,7 +602,13 @@
db.copy_to(
[m.to_dict() for m in metadata],
"tmp_origin_intrinsic_metadata",
- ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
+ [
+ "id",
+ "metadata",
+ "indexer_configuration_id",
+ "from_directory",
+ "mappings",
+ ],
cur,
)
count = db.origin_intrinsic_metadata_add_from_temp(cur)
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -350,18 +350,18 @@
"content_metadata", ids, self.content_metadata_cols, cur=cur
)
- revision_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
+ directory_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
- def revision_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
+ def directory_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata."""
yield from self._missing_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
metadata,
- self.revision_intrinsic_metadata_hash_keys,
+ self.directory_intrinsic_metadata_hash_keys,
cur=cur,
)
- revision_intrinsic_metadata_cols = [
+ directory_intrinsic_metadata_cols = [
"id",
"metadata",
"mappings",
@@ -371,27 +371,27 @@
"tool_configuration",
]
- @stored_procedure("swh_mktemp_revision_intrinsic_metadata")
- def mktemp_revision_intrinsic_metadata(self, cur=None):
+ @stored_procedure("swh_mktemp_directory_intrinsic_metadata")
+ def mktemp_directory_intrinsic_metadata(self, cur=None):
pass
- def revision_intrinsic_metadata_add_from_temp(self, cur=None):
+ def directory_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_revision_intrinsic_metadata_add()")
+ cur.execute("select * from swh_directory_intrinsic_metadata_add()")
return cur.fetchone()[0]
- def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
+ def directory_intrinsic_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
ids,
- self.revision_intrinsic_metadata_cols,
+ self.directory_intrinsic_metadata_cols,
cur=cur,
)
origin_intrinsic_metadata_cols = [
"id",
"metadata",
- "from_revision",
+ "from_directory",
"mappings",
"tool_id",
"tool_name",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -38,8 +38,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -250,8 +250,8 @@
self._content_ctags = SubStorage(ContentCtagsRow, *args)
self._licenses = SubStorage(ContentLicenseRow, *args)
self._content_metadata = SubStorage(ContentMetadataRow, *args)
- self._revision_intrinsic_metadata = SubStorage(
- RevisionIntrinsicMetadataRow, *args
+ self._directory_intrinsic_metadata = SubStorage(
+ DirectoryIntrinsicMetadataRow, *args
)
self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args)
@@ -369,21 +369,21 @@
added = self._content_metadata.add(metadata)
return {"content_metadata:add": added}
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
- return self._revision_intrinsic_metadata.missing(metadata)
+ return self._directory_intrinsic_metadata.missing(metadata)
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- return self._revision_intrinsic_metadata.get(ids)
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ return self._directory_intrinsic_metadata.get(ids)
- def revision_intrinsic_metadata_add(
- self, metadata: List[RevisionIntrinsicMetadataRow]
+ def directory_intrinsic_metadata_add(
+ self, metadata: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._revision_intrinsic_metadata.add(metadata)
- return {"revision_intrinsic_metadata:add": added}
+ added = self._directory_intrinsic_metadata.add(metadata)
+ return {"directory_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_get(
self, urls: Iterable[str]
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -15,8 +15,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
TResult = TypeVar("TResult")
@@ -341,8 +341,8 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/missing")
- def revision_intrinsic_metadata_missing(
+ @remote_api_endpoint("directory_intrinsic_metadata/missing")
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
"""List metadata missing from storage.
@@ -350,7 +350,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id** (bytes): sha1_git revision identifier
+ - **id** (bytes): sha1_git directory identifier
- **indexer_configuration_id** (int): tool used to compute
the results
@@ -360,11 +360,11 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata")
- def revision_intrinsic_metadata_get(
+ @remote_api_endpoint("directory_intrinsic_metadata")
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Retrieve revision metadata per id.
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Retrieve directory metadata per id.
Args:
ids (iterable): sha1 checksums
@@ -375,10 +375,10 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/add")
- def revision_intrinsic_metadata_add(
+ @remote_api_endpoint("directory_intrinsic_metadata/add")
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
) -> Dict[str, int]:
"""Add metadata not present in storage.
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -120,8 +120,8 @@
@attr.s
-class RevisionIntrinsicMetadataRow(BaseRow):
- object_type: Final = "revision_intrinsic_metadata"
+class DirectoryIntrinsicMetadataRow(BaseRow):
+ object_type: Final = "directory_intrinsic_metadata"
id = attr.ib(type=Sha1Git)
metadata = attr.ib(type=Dict[str, Any])
@@ -134,5 +134,5 @@
id = attr.ib(type=str)
metadata = attr.ib(type=Dict[str, Any])
- from_revision = attr.ib(type=Sha1Git)
+ from_directory = attr.ib(type=Sha1Git)
mappings = attr.ib(type=List[str])
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -23,7 +23,7 @@
TASK_NAMES: List[Tuple[str, str]] = [
# (scheduler-task-type, task-class-test-name)
- ("index-revision-metadata", "revision_intrinsic_metadata"),
+ ("index-directory-metadata", "directory_intrinsic_metadata"),
("index-origin-metadata", "origin_intrinsic_metadata"),
]
diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py
--- a/swh/indexer/tests/storage/conftest.py
+++ b/swh/indexer/tests/storage/conftest.py
@@ -41,9 +41,9 @@
data.tools = tools
data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
- data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
- data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
- data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
+ data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
+ data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
+ data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
data.origin_url_1 = "file:///dev/0/zero" # 44434341
data.origin_url_2 = "file:///dev/1/one" # 44434342
data.origin_url_3 = "file:///dev/2/two" # 54974445
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -19,8 +19,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.hashutil import hash_to_bytes
@@ -289,37 +289,37 @@
etype = self.endpoint_type
tool = data.tools[self.tool_name]
- data_rev1 = self.row_class.from_dict(
+ data_dir1 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
)
- data_rev2 = self.row_class.from_dict(
+ data_dir2 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[1],
"indexer_configuration_id": tool["id"],
}
)
# when
- summary = endpoint(storage, etype, "add")([data_rev1])
+ summary = endpoint(storage, etype, "add")([data_dir1])
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
- endpoint(storage, etype, "add")([data_rev2, data_rev2])
+ endpoint(storage, etype, "add")([data_dir2, data_dir2])
# then
actual_data = list(
- endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1])
+ endpoint(storage, etype, "get")([data.directory_id_2, data.directory_id_1])
)
expected_data = [
self.row_class.from_dict(
- {"id": data.revision_id_2, **self.example_data[0], "tool": tool}
+ {"id": data.directory_id_2, **self.example_data[0], "tool": tool}
)
]
assert actual_data == expected_data
@@ -806,11 +806,11 @@
row_class = ContentMetadataRow
-class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester):
- """Test Indexer Storage revision_intrinsic_metadata related methods"""
+class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester):
+ """Test Indexer Storage directory_intrinsic_metadata related methods"""
tool_name = "swh-metadata-detector"
- endpoint_type = "revision_intrinsic_metadata"
+ endpoint_type = "directory_intrinsic_metadata"
example_data = [
{
"metadata": {
@@ -830,7 +830,7 @@
"mappings": ["mapping2"],
},
]
- row_class = RevisionIntrinsicMetadataRow
+ row_class = DirectoryIntrinsicMetadataRow
class TestIndexerStorageContentFossologyLicense(StorageETypeTester):
@@ -1102,8 +1102,8 @@
"version": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1113,11 +1113,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
storage.origin_intrinsic_metadata_add([metadata_origin])
# then
@@ -1130,7 +1130,7 @@
id=data.origin_url_1,
metadata=metadata,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=["mapping1"],
)
]
@@ -1156,8 +1156,8 @@
"version": None,
"name": None,
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata_v1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1167,11 +1167,11 @@
metadata=metadata_v1.copy(),
indexer_configuration_id=tool_id,
mappings=[],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add([metadata_origin_v1])
# when
@@ -1185,7 +1185,7 @@
id=data.origin_url_1,
metadata=metadata_v1,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=[],
)
]
@@ -1199,16 +1199,16 @@
"author": "MG",
}
)
- metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
+ metadata_dir_v2 = attr.evolve(metadata_dir_v1, metadata=metadata_v2)
metadata_origin_v2 = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v2.copy(),
indexer_configuration_id=tool_id,
mappings=["npm"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
- storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v2])
storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
@@ -1220,7 +1220,7 @@
id=data.origin_url_1,
metadata=metadata_v2,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
mappings=["npm"],
)
]
@@ -1252,8 +1252,8 @@
"mappings": [],
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata={
"version": None,
"name": None,
@@ -1265,7 +1265,7 @@
data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data1,
)
@@ -1274,7 +1274,7 @@
data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data2,
)
@@ -1287,7 +1287,7 @@
data_v2b = list(reversed(data_v2[0:-1]))
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add(data_v1)
# when
@@ -1296,7 +1296,7 @@
expected_data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data1,
)
@@ -1326,7 +1326,7 @@
expected_data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data2,
)
@@ -1351,8 +1351,8 @@
"developmentStatus": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1362,11 +1362,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
with pytest.raises(DuplicateId):
storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin])
@@ -1381,8 +1381,8 @@
metadata1 = {
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1392,13 +1392,13 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1408,13 +1408,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1444,8 +1444,8 @@
"Jane Doe",
]
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1455,7 +1455,7 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": [
@@ -1463,8 +1463,8 @@
"Jane Doe",
]
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1474,13 +1474,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1508,8 +1508,8 @@
"@context": "foo",
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
@@ -1519,14 +1519,14 @@
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"@context": "foo",
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1536,13 +1536,13 @@
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
metadata3 = {
"@context": "foo",
}
- metadata3_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_3,
+ metadata3_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_3,
metadata=metadata3,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1552,14 +1552,14 @@
metadata=metadata3,
mappings=["pkg-info"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_3,
+ from_directory=data.directory_id_3,
)
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
- storage.revision_intrinsic_metadata_add([metadata3_rev])
+ storage.directory_intrinsic_metadata_add([metadata3_dir])
storage.origin_intrinsic_metadata_add([metadata3_origin])
def test_origin_intrinsic_metadata_search_by_producer(
@@ -1685,7 +1685,7 @@
},
mappings=["npm", "gemspec"],
tool=tool2,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
],
next_page_token=None,
diff --git a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py
--- a/swh/indexer/tests/tasks.py
+++ b/swh/indexer/tests/tasks.py
@@ -1,12 +1,12 @@
from celery import current_app as app
-from swh.indexer.metadata import OriginMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import DirectoryMetadataIndexer, OriginMetadataIndexer
from .test_metadata import ContentMetadataTestIndexer
from .utils import BASE_TEST_CONFIG
-class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
+class DirectoryMetadataTestIndexer(DirectoryMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
@@ -29,12 +29,12 @@
return {**BASE_TEST_CONFIG, "tools": []}
def _prepare_sub_indexers(self):
- self.revision_metadata_indexer = RevisionMetadataTestIndexer()
+ self.directory_metadata_indexer = DirectoryMetadataTestIndexer()
@app.task
-def revision_intrinsic_metadata(*args, **kwargs):
- indexer = RevisionMetadataTestIndexer()
+def directory_intrinsic_metadata(*args, **kwargs):
+ indexer = DirectoryMetadataTestIndexer()
indexer.run(*args, **kwargs)
print("REV RESULT=", indexer.results)
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -16,14 +16,14 @@
from swh.indexer.cli import indexer_cli_group
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.journal.writer import get_journal_writer
from swh.model.hashutil import hash_to_bytes
from swh.model.model import OriginVisitStatus
-from .utils import REVISION
+from .utils import DIRECTORY2, REVISION
def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
@@ -40,15 +40,15 @@
origin_metadata = [
OriginIntrinsicMetadataRow(
id="file://dev/%04d" % origin_id,
- from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)),
+ from_directory=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
mappings=["mapping%d" % (origin_id % 10)],
)
for origin_id in range(nb_rows)
]
- revision_metadata = [
- RevisionIntrinsicMetadataRow(
+ directory_metadata = [
+ DirectoryIntrinsicMetadataRow(
id=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
@@ -57,7 +57,7 @@
for origin_id in range(nb_rows)
]
- idx_storage.revision_intrinsic_metadata_add(revision_metadata)
+ idx_storage.directory_intrinsic_metadata_add(directory_metadata)
idx_storage.origin_intrinsic_metadata_add(origin_metadata)
return [tool["id"] for tool in tools]
@@ -605,10 +605,10 @@
)
mocker.patch(
- "swh.indexer.metadata.RevisionMetadataIndexer.index",
+ "swh.indexer.metadata.DirectoryMetadataIndexer.index",
return_value=[
- RevisionIntrinsicMetadataRow(
- id=REVISION.id,
+ DirectoryIntrinsicMetadataRow(
+ id=DIRECTORY2.id,
indexer_configuration_id=1,
mappings=["cff"],
metadata={"foo": "bar"},
@@ -645,7 +645,7 @@
expected_results = [
OriginIntrinsicMetadataRow(
id=status.origin,
- from_revision=REVISION.id,
+ from_directory=DIRECTORY2.id,
tool={"id": 1, **swh_indexer_config["tools"]},
mappings=["cff"],
metadata={"foo": "bar"},
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -11,13 +11,13 @@
from swh.indexer.indexer import (
ContentIndexer,
ContentPartitionIndexer,
+ DirectoryIndexer,
OriginIndexer,
- RevisionIndexer,
)
from swh.indexer.storage import PagedResult, Sha1
from swh.model.model import Content
-from .utils import BASE_TEST_CONFIG, REVISION
+from .utils import BASE_TEST_CONFIG, DIRECTORY2
class _TestException(Exception):
@@ -49,7 +49,7 @@
pass
-class CrashingRevisionIndexer(CrashingIndexerMixin, RevisionIndexer):
+class CrashingDirectoryIndexer(CrashingIndexerMixin, DirectoryIndexer):
pass
@@ -86,14 +86,14 @@
indexer.run([b"foo"])
-def test_revision_indexer_catch_exceptions():
- indexer = CrashingRevisionIndexer(config=BASE_TEST_CONFIG)
+def test_directory_indexer_catch_exceptions():
+ indexer = CrashingDirectoryIndexer(config=BASE_TEST_CONFIG)
indexer.storage = Mock()
- indexer.storage.revision_get.return_value = [REVISION]
+ indexer.storage.directory_get.return_value = [DIRECTORY2]
assert indexer.run([b"foo"]) == {"status": "failed"}
- assert indexer.process_journal_objects({"revision": [REVISION.to_dict()]}) == {
+ assert indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) == {
"status": "failed"
}
@@ -103,7 +103,7 @@
indexer.run([b"foo"])
with pytest.raises(_TestException):
- indexer.process_journal_objects({"revision": [REVISION.to_dict()]})
+ indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]})
def test_origin_indexer_catch_exceptions():
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -10,16 +10,16 @@
import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
-from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
-from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow
-from swh.indexer.tests.utils import DIRECTORY2, REVISION
+from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
+from swh.indexer.tests.utils import DIRECTORY2
from swh.model.hashutil import hash_to_bytes
-from swh.model.model import Directory, DirectoryEntry, Revision
+from swh.model.model import Directory, DirectoryEntry
from .utils import (
BASE_TEST_CONFIG,
@@ -43,10 +43,10 @@
"""
def parse_config_file(self, *args, **kwargs):
- assert False, "should not be called; the rev indexer configures it."
+ assert False, "should not be called; the dir indexer configures it."
-REVISION_METADATA_CONFIG = {
+DIRECTORY_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
@@ -1154,8 +1154,8 @@
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
- def test_revision_metadata_indexer(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
@@ -1163,8 +1163,7 @@
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ dir_ = DIRECTORY2
metadata_indexer.idx_storage.content_metadata_add(
[
@@ -1176,15 +1175,17 @@
]
)
- metadata_indexer.run([rev.id])
+ metadata_indexer.run([dir_.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get(
+ [DIRECTORY2.id]
+ )
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=dir_.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -1197,33 +1198,27 @@
# then
assert results == expected_results
- def test_revision_metadata_indexer_single_root_dir(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer_single_root_dir(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory, that is the only directory at the root
- # of the revision
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ # of the indexed directory
+ dir_ = DIRECTORY2
- directory = Directory(
+ new_dir = Directory(
entries=(
DirectoryEntry(
name=b"foobar-1.0.0",
type="dir",
- target=rev.directory,
+ target=dir_.id,
perms=16384,
),
),
)
- assert directory.id is not None
- metadata_indexer.storage.directory_add([directory])
-
- new_rev_dict = {**rev.to_dict(), "directory": directory.id}
- new_rev_dict.pop("id")
- new_rev = Revision.from_dict(new_rev_dict)
- metadata_indexer.storage.revision_add([new_rev])
+ assert new_dir.id is not None
+ metadata_indexer.storage.directory_add([new_dir])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
@@ -1240,15 +1235,15 @@
]
)
- metadata_indexer.run([new_rev.id])
+ metadata_indexer.run([new_dir.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=new_rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=new_dir.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -11,14 +11,14 @@
from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.model import Origin
from swh.storage.interface import StorageInterface
from .test_metadata import TRANSLATOR_TOOL
-from .utils import REVISION, YARN_PARSER_METADATA
+from .utils import DIRECTORY2, YARN_PARSER_METADATA
@pytest.fixture
@@ -41,9 +41,9 @@
tool = swh_indexer_config["tools"]
- rev_id = REVISION.id
- rev_metadata = RevisionIntrinsicMetadataRow(
- id=rev_id,
+ dir_id = DIRECTORY2.id
+ rev_metadata = DirectoryIntrinsicMetadataRow(
+ id=dir_id,
tool=tool,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -51,12 +51,12 @@
origin_metadata = OriginIntrinsicMetadataRow(
id=origin,
tool=tool,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
- rev_results = list(idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
for rev_result in rev_results:
assert rev_result.tool
del rev_result.tool["id"]
@@ -82,9 +82,9 @@
indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert len(rev_results) == 1
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
@@ -121,12 +121,12 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == [
- RevisionIntrinsicMetadataRow(
- id=rev_id,
+ DirectoryIntrinsicMetadataRow(
+ id=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=rev_results[0].tool,
@@ -140,7 +140,7 @@
assert orig_results == [
OriginIntrinsicMetadataRow(
id=origin2,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=orig_results[0].tool,
@@ -148,7 +148,7 @@
]
-def test_origin_metadata_indexer_duplicate_revision(
+def test_origin_metadata_indexer_duplicate_directory(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
@@ -162,9 +162,9 @@
origin2 = "https://github.com/librariesio/yarn-parser.git"
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert len(rev_results) == 1
orig_results = list(
@@ -185,9 +185,9 @@
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
@@ -204,15 +204,15 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=(["npm"], {"@context": "foo"}),
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
@@ -229,15 +229,15 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=None,
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
+ rev_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert rev_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 6:55 AM (10 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3231971
Attached To
D7937: Replace RevisionMetadataIndexer with DirectoryMetadataIndexer
Event Timeline
Log In to Comment