D8002.id28836.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
-swh.core[db,http] >= 0.14.0
+swh.core[db,http] >= 2.9
swh.model >= 0.0.15
swh.objstorage >= 0.2.2
swh.scheduler >= 0.5.2
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Iterator
+from typing import Callable, Dict, Iterator, List, Optional
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
@@ -213,6 +213,12 @@
@indexer_cli_group.command("journal-client")
+@click.argument(
+ "indexer",
+ type=click.Choice(["origin-intrinsic-metadata", "*"]),
+ required=False
+ # TODO: remove required=False after we stop using it
+)
@click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API")
@click.option(
"--origin-metadata-task-type",
@@ -236,18 +242,27 @@
@click.pass_context
def journal_client(
ctx,
- scheduler_url,
- origin_metadata_task_type,
- brokers,
- prefix,
- group_id,
- stop_after_objects,
+ indexer: Optional[str],
+ scheduler_url: str,
+ origin_metadata_task_type: str,
+ brokers: List[str],
+ prefix: str,
+ group_id: str,
+ stop_after_objects: Optional[int],
):
- """Listens for new objects from the SWH Journal, and schedules tasks
- to run relevant indexers (currently, only origin-intrinsic-metadata)
- on these new objects."""
+ """
+ Listens for new objects from the SWH Journal, and either:
+
+ * runs the indexer with the name passed as argument, if any
+ * schedules tasks to run relevant indexers (currently, only
+ origin-intrinsic-metadata) on these new objects otherwise.
+
+ Passing '*' as indexer name runs all indexers.
+ """
import functools
+ import warnings
+ from swh.indexer.indexer import ObjectsDict
from swh.indexer.journal_client import process_journal_objects
from swh.journal.client import get_journal_client
from swh.scheduler import get_scheduler
@@ -268,22 +283,50 @@
)
stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects")
+ object_types = set()
+ worker_fns: List[Callable[[ObjectsDict], Dict]] = []
+
+ if indexer is None:
+ warnings.warn(
+ "'swh indexer journal-client' with no argument creates scheduler tasks "
+ "to index, rather than index directly.",
+ DeprecationWarning,
+ )
+ object_types.add("origin_visit_status")
+ worker_fns.append(
+ functools.partial(
+ process_journal_objects,
+ scheduler=scheduler,
+ task_names={
+ "origin_metadata": origin_metadata_task_type,
+ },
+ )
+ )
+
+ if indexer in ("origin-intrinsic-metadata", "*"):
+ from swh.indexer.metadata import OriginMetadataIndexer
+
+ object_types.add("origin_visit_status")
+ idx = OriginMetadataIndexer()
+ idx.catch_exceptions = False # don't commit offsets if indexing failed
+ worker_fns.append(idx.process_journal_objects)
+
+ if not worker_fns:
+ raise click.ClickException(f"Unknown indexer: {indexer}")
+
client = get_journal_client(
cls="kafka",
brokers=brokers,
prefix=prefix,
group_id=group_id,
- object_types=["origin_visit_status"],
+ object_types=list(object_types),
stop_after_objects=stop_after_objects,
)
- worker_fn = functools.partial(
- process_journal_objects,
- scheduler=scheduler,
- task_names={
- "origin_metadata": origin_metadata_task_type,
- },
- )
+ def worker_fn(objects: ObjectsDict):
+ for fn in worker_fns:
+ fn(objects)
+
try:
client.process(worker_fn)
except KeyboardInterrupt:
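
Note (not part of the patch): a minimal sketch of the fan-out done by the
worker_fn defined above: every registered worker function receives the same
decoded journal batch. ObjectsDict is simplified here, and the batch contents
are made-up sample data.

    from typing import Callable, Dict, List

    ObjectsDict = Dict[str, List[Dict]]  # simplified stand-in for swh.indexer.indexer.ObjectsDict

    def make_worker_fn(
        worker_fns: List[Callable[[ObjectsDict], Dict]]
    ) -> Callable[[ObjectsDict], None]:
        def worker_fn(objects: ObjectsDict) -> None:
            # every worker sees the same batch; the journal client only commits
            # offsets after worker_fn returns without raising
            for fn in worker_fns:
                fn(objects)
        return worker_fn

    batch = {"origin_visit_status": [{"origin": "https://example.org/repo", "status": "full"}]}
    make_worker_fn([lambda objs: {"status": "eventful"}])(batch)
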
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2021 The Software Heritage developers
+# Copyright (C) 2016-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -18,19 +18,21 @@
List,
Optional,
Set,
+ Tuple,
TypeVar,
Union,
)
import warnings
import sentry_sdk
+from typing_extensions import TypedDict
from swh.core import utils
from swh.core.config import load_from_envvar, merge_configs
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.model import hashutil
-from swh.model.model import Revision, Sha1Git
+from swh.model.model import Directory, Origin, Sha1Git
from swh.objstorage.exc import ObjNotFoundError
from swh.objstorage.factory import get_objstorage
from swh.scheduler import CONFIG as SWH_CONFIG
@@ -38,6 +40,12 @@
from swh.storage.interface import StorageInterface
+class ObjectsDict(TypedDict, total=False):
+ directory: List[Dict]
+ origin: List[Dict]
+ origin_visit_status: List[Dict]
+
+
@contextmanager
def write_to_temp(filename: str, data: bytes, working_directory: str) -> Iterator[str]:
"""Write the sha1's content in a temporary file.
@@ -102,7 +110,7 @@
content, sha1_git for revision, directory, release, and id for origin
To implement a new concrete indexer, inherit from the object level
- classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
+ classes: :class:`ContentIndexer`, :class:`DirectoryIndexer`,
:class:`OriginIndexer`.
Then you need to implement the following functions:
@@ -526,9 +534,29 @@
DeprecationWarning,
)
del kwargs["policy_update"]
+
+ origins = [{"url": url} for url in origin_urls]
+
+ return self.process_journal_objects({"origin": origins})
+
+ def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+ """Worker function for ``JournalClient``. Expects ``objects`` to have a single
+ key, either ``"origin"`` or ``"origin_visit_status"``."""
+ origins = [
+ Origin(url=status["origin"])
+ for status in objects.get("origin_visit_status", [])
+ if status["status"] == "full"
+ ] + [Origin(url=origin["url"]) for origin in objects.get("origin", [])]
+
summary: Dict[str, Any] = {"status": "uneventful"}
try:
- results = self.index_list(origin_urls, **kwargs)
+ results = self.index_list(
+ origins,
+ check_origin_known=False,
+ # no need to check they exist, as we just received either an origin or
+ # visit status, which cannot be created by swh-storage unless the origin
+ # already exists
+ )
except Exception:
if not self.catch_exceptions:
raise
@@ -544,23 +572,23 @@
summary.update(summary_persist)
return summary
- def index_list(self, origin_urls: List[str], **kwargs) -> List[TResult]:
+ def index_list(self, origins: List[Origin], **kwargs) -> List[TResult]:
results = []
- for origin_url in origin_urls:
+ for origin in origins:
try:
- results.extend(self.index(origin_url, **kwargs))
+ results.extend(self.index(origin.url, **kwargs))
except Exception:
- self.log.exception("Problem when processing origin %s", origin_url)
+ self.log.exception("Problem when processing origin %s", origin.url)
sentry_sdk.capture_exception()
raise
return results
-class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]):
+class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult]):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements Revision indexing using the run method
+ implements Directory indexing using the run method
- Note: the :class:`RevisionIndexer` is not an instantiable object.
+ Note: the :class:`DirectoryIndexer` is not an instantiable object.
To use it in another context one should inherit from this class
and override the methods mentioned in the :class:`BaseIndexer`
class.
@@ -570,7 +598,7 @@
def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
"""Given a list of sha1_gits:
- - retrieve revisions from storage
+ - retrieve directories from storage
- execute the indexing computations
- store the results
@@ -584,28 +612,40 @@
DeprecationWarning,
)
del kwargs["policy_update"]
- summary: Dict[str, Any] = {"status": "uneventful"}
- results = []
- revision_ids = [
+ directory_ids = [
hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
]
- for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)):
- if not rev:
- # TODO: call self.index() with rev=None?
- self.log.warning(
- "Revision %s not found in storage", hashutil.hash_to_hex(rev_id)
- )
- continue
+
+ return self._process_directories([(dir_id, None) for dir_id in directory_ids])
+
+ def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+ """Worker function for ``JournalClient``. Expects ``objects`` to have a single
+ key, ``"directory"``."""
+ assert set(objects) == {"directory"}
+ return self._process_directories(
+ [(dir_["id"], Directory.from_dict(dir_)) for dir_ in objects["directory"]]
+ )
+
+ def _process_directories(
+ self,
+ directories: Union[List[Tuple[Sha1Git, Directory]], List[Tuple[Sha1Git, None]]],
+ ) -> Dict:
+
+ summary: Dict[str, Any] = {"status": "uneventful"}
+ results = []
+
+ # TODO: fetch raw_manifest when useful?
+
+ for (dir_id, dir_) in directories:
try:
- results.extend(self.index(rev_id, rev))
+ results.extend(self.index(dir_id, dir_))
except Exception:
if not self.catch_exceptions:
raise
- self.log.exception("Problem when processing revision")
+ self.log.exception("Problem when processing directory")
sentry_sdk.capture_exception()
summary["status"] = "failed"
- return summary
summary_persist = self.persist_index_computations(results)
if summary_persist:
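
Note (not part of the patch): a hedged illustration of the ObjectsDict batches
the new process_journal_objects methods consume. Key and field names follow the
TypedDict and code above; the values are made-up sample data.

    # Batch handled by OriginIndexer.process_journal_objects:
    objects = {
        "origin_visit_status": [
            {"origin": "https://example.org/user/repo", "status": "full"},
            {"origin": "https://example.org/user/other", "status": "partial"},  # skipped: not "full"
        ],
        "origin": [
            {"url": "https://example.org/user/third"},
        ],
    }
    # It is converted to a list of Origin objects and passed to
    # index_list(origins, check_origin_known=False).
    #
    # A batch handled by DirectoryIndexer.process_journal_objects has a single
    # "directory" key, each entry being convertible with Directory.from_dict().
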
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -21,20 +21,24 @@
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer
+from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
-from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.origin_head import get_head_swhid
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
from swh.indexer.storage.model import (
ContentMetadataRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model import hashutil
-from swh.model.model import Revision, Sha1Git
+from swh.model.model import Directory
+from swh.model.model import ObjectType as ModelObjectType
+from swh.model.model import Origin, Sha1Git
+from swh.model.swhids import CoreSWHID, ObjectType
REVISION_GET_BATCH_SIZE = 10
+RELEASE_GET_BATCH_SIZE = 10
ORIGIN_GET_BATCH_SIZE = 10
@@ -82,7 +86,7 @@
self,
id: Sha1,
data: Optional[bytes] = None,
- log_suffix="unknown revision",
+ log_suffix="unknown directory",
**kwargs,
) -> List[ContentMetadataRow]:
"""Index sha1s' content and store result.
@@ -144,18 +148,18 @@
}
-class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]):
- """Revision-level indexer
+class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
+ """Directory-level indexer
This indexer is in charge of:
- - filtering revisions already indexed in revision_intrinsic_metadata table
+ - filtering directories already indexed in directory_intrinsic_metadata table
with defined computation tool
- - retrieve all entry_files in root directory
+ - retrieve all entry_files in directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- - store the results for revision
+ - store the results for directory
"""
@@ -165,7 +169,7 @@
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones."""
- yield from self.idx_storage.revision_intrinsic_metadata_missing(
+ yield from self.idx_storage.directory_intrinsic_metadata_missing(
(
{
"id": sha1_git,
@@ -176,51 +180,52 @@
)
def index(
- self, id: Sha1Git, data: Optional[Revision], **kwargs
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Index rev by processing it and organizing result.
+ self, id: Sha1Git, data: Optional[Directory] = None, **kwargs
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Index directory by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- - if multiple file detected -> translation needed at revision level
+ - if multiple files detected -> translation needed at directory level
Args:
- id: sha1_git of the revision
- data: revision model object from storage
+ id: sha1_git of the directory
+ data: directory model object from storage
Returns:
- dict: dictionary representing a revision_intrinsic_metadata, with
+ dict: dictionary representing a directory_intrinsic_metadata, with
keys:
- - id (str): rev's identifier (sha1_git)
+ - id: directory's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata
"""
- rev = data
- assert isinstance(rev, Revision)
+ if data is None:
+ dir_ = list(self.storage.directory_ls(id, recursive=False))
+ else:
+ assert isinstance(data, Directory)
+ dir_ = data.to_dict()
try:
- root_dir = rev.directory
- dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
- if [entry["type"] for entry in dir_ls] == ["dir"]:
+ if [entry["type"] for entry in dir_] == ["dir"]:
# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs
- subdir = dir_ls[0]["target"]
- dir_ls = list(self.storage.directory_ls(subdir, recursive=False))
- files = [entry for entry in dir_ls if entry["type"] == "file"]
+ subdir = dir_[0]["target"]
+ dir_ = list(self.storage.directory_ls(subdir, recursive=False))
+ files = [entry for entry in dir_ if entry["type"] == "file"]
detected_files = detect_metadata(files)
- (mappings, metadata) = self.translate_revision_intrinsic_metadata(
+ (mappings, metadata) = self.translate_directory_intrinsic_metadata(
detected_files,
- log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id),
+ log_suffix="directory=%s" % hashutil.hash_to_hex(id),
)
except Exception as e:
- self.log.exception("Problem when indexing rev: %r", e)
+ self.log.exception("Problem when indexing dir: %r", e)
sentry_sdk.capture_exception()
return [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=id,
indexer_configuration_id=self.tool["id"],
mappings=mappings,
metadata=metadata,
@@ -228,7 +233,7 @@
]
def persist_index_computations(
- self, results: List[RevisionIntrinsicMetadataRow]
+ self, results: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -241,10 +246,10 @@
"""
# TODO: add functions in storage to keep data in
- # revision_intrinsic_metadata
- return self.idx_storage.revision_intrinsic_metadata_add(results)
+ # directory_intrinsic_metadata
+ return self.idx_storage.directory_intrinsic_metadata_add(results)
- def translate_revision_intrinsic_metadata(
+ def translate_directory_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
) -> Tuple[List[Any], Any]:
"""
@@ -315,81 +320,129 @@
class OriginMetadataIndexer(
- OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]
+ OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]
):
USE_TOOLS = False
def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
- self.origin_head_indexer = OriginHeadIndexer(config=config)
- self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
+ self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config)
def index_list(
- self, origin_urls: List[str], **kwargs
- ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]:
+ self, origins: List[Origin], check_origin_known: bool = True, **kwargs
+ ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]:
head_rev_ids = []
- origins_with_head = []
- origins = list(
- call_with_batches(
- self.storage.origin_get,
- origin_urls,
- ORIGIN_GET_BATCH_SIZE,
+ head_rel_ids = []
+ origin_heads: Dict[Origin, CoreSWHID] = {}
+
+ # Filter out origins not in the storage
+ if check_origin_known:
+ known_origins = list(
+ call_with_batches(
+ self.storage.origin_get,
+ [origin.url for origin in origins],
+ ORIGIN_GET_BATCH_SIZE,
+ )
)
- )
- for origin in origins:
+ else:
+ known_origins = list(origins)
+
+ for origin in known_origins:
if origin is None:
continue
- head_results = self.origin_head_indexer.index(origin.url)
- if head_results:
- (head_result,) = head_results
- origins_with_head.append(origin)
- head_rev_ids.append(head_result["revision_id"])
-
- head_revs = list(
- call_with_batches(
- self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
+ head_swhid = get_head_swhid(self.storage, origin.url)
+ if head_swhid:
+ origin_heads[origin] = head_swhid
+ if head_swhid.object_type == ObjectType.REVISION:
+ head_rev_ids.append(head_swhid.object_id)
+ elif head_swhid.object_type == ObjectType.RELEASE:
+ head_rel_ids.append(head_swhid.object_id)
+ else:
+ assert False, head_swhid
+
+ head_revs = dict(
+ zip(
+ head_rev_ids,
+ call_with_batches(
+ self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
+ ),
+ )
+ )
+ head_rels = dict(
+ zip(
+ head_rel_ids,
+ call_with_batches(
+ self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE
+ ),
)
)
- assert len(head_revs) == len(head_rev_ids)
results = []
- for (origin, rev) in zip(origins_with_head, head_revs):
- if not rev:
- self.log.warning("Missing head revision of origin %r", origin.url)
- continue
-
- for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev):
- # There is at most one rev_metadata
+ for (origin, head_swhid) in origin_heads.items():
+ if head_swhid.object_type == ObjectType.REVISION:
+ rev = head_revs[head_swhid.object_id]
+ if not rev:
+ self.log.warning(
+ "Missing head object %s of origin %r", head_swhid, origin.url
+ )
+ continue
+ directory_id = rev.directory
+ elif head_swhid.object_type == ObjectType.RELEASE:
+ rel = head_rels[head_swhid.object_id]
+ if not rel:
+ self.log.warning(
+ "Missing head object %s of origin %r", head_swhid, origin.url
+ )
+ continue
+ if rel.target_type != ModelObjectType.DIRECTORY:
+ # TODO
+ self.log.warning(
+ "Head release %s of %r has unexpected target type %s",
+ head_swhid,
+ origin.url,
+ rel.target_type,
+ )
+ continue
+ assert rel.target, rel
+ directory_id = rel.target
+ else:
+ assert False, head_swhid
+
+ for dir_metadata in self.directory_metadata_indexer.index(directory_id):
+ # There is at most one dir_metadata
orig_metadata = OriginIntrinsicMetadataRow(
- from_revision=rev_metadata.id,
+ from_directory=dir_metadata.id,
id=origin.url,
- metadata=rev_metadata.metadata,
- mappings=rev_metadata.mappings,
- indexer_configuration_id=rev_metadata.indexer_configuration_id,
+ metadata=dir_metadata.metadata,
+ mappings=dir_metadata.mappings,
+ indexer_configuration_id=dir_metadata.indexer_configuration_id,
)
- results.append((orig_metadata, rev_metadata))
+ results.append((orig_metadata, dir_metadata))
+
return results
def persist_index_computations(
self,
- results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
+ results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
) -> Dict[str, int]:
- # Deduplicate revisions
- rev_metadata: List[RevisionIntrinsicMetadataRow] = []
+ # Deduplicate directories
+ dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
summary: Dict = {}
- for (orig_item, rev_item) in results:
- assert rev_item.metadata == orig_item.metadata
- if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}):
+ for (orig_item, dir_item) in results:
+ assert dir_item.metadata == orig_item.metadata
+ if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
- if rev_item not in rev_metadata:
- rev_metadata.append(rev_item)
+ if dir_item not in dir_metadata:
+ dir_metadata.append(dir_item)
if orig_item not in orig_metadata:
orig_metadata.append(orig_item)
- if rev_metadata:
- summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
- summary.update(summary_rev)
+ if dir_metadata:
+ summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
+ dir_metadata
+ )
+ summary.update(summary_dir)
if orig_metadata:
summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
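
Note (not part of the patch): a hedged sketch of the dispatch on the head SWHID
type used above: revision heads contribute their root directory, release heads
only when they target a directory (other target types are logged and skipped).
The SWHID value below is made up.

    from swh.model.swhids import CoreSWHID, ObjectType

    head_swhid = CoreSWHID.from_string("swh:1:rev:" + "00" * 20)
    if head_swhid.object_type == ObjectType.REVISION:
        print("look up the revision, then index its root directory")
    elif head_swhid.object_type == ObjectType.RELEASE:
        print("look up the release; only direct directory targets are indexed")
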
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -6,10 +6,12 @@
from .base import DictMapping, SingleFileMapping
-yaml.SafeLoader.yaml_implicit_resolvers = {
- k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
- for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
-}
+
+class SafeLoader(yaml.SafeLoader):
+ yaml_implicit_resolvers = {
+ k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
+ for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
+ }
class CffMapping(DictMapping, SingleFileMapping):
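
Note (not part of the patch): the subclass above keeps the existing behaviour
(CITATION.cff dates are loaded as plain strings) without mutating the global
yaml.SafeLoader. A standalone sketch of the difference, assuming PyYAML:

    import yaml

    class SafeLoader(yaml.SafeLoader):
        # same resolver filtering as in the class added above
        yaml_implicit_resolvers = {
            k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
            for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
        }

    doc = "date-released: 2021-11-15\n"
    print(yaml.load(doc, Loader=SafeLoader))  # {'date-released': '2021-11-15'}
    print(yaml.safe_load(doc))                # {'date-released': datetime.date(2021, 11, 15)}
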
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -133,6 +133,73 @@
author[SCHEMA_URI + "url"] = {"@id": url}
return {"@list": [author]}
+ def normalize_description(self, description):
+ r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
+ mistake that causes issues in the database because of null bytes in JSON.
+
+ >>> NpmMapping().normalize_description("foo bar")
+ 'foo bar'
+ >>> NpmMapping().normalize_description(
+ ... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
+ ... )
+ 'foo bar'
+ >>> NpmMapping().normalize_description(
+ ... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
+ ... )
+ 'foo bar'
+ >>> NpmMapping().normalize_description(
+ ... # invalid UTF-16 and meaningless UTF-8:
+ ... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
+ ... ) is None
+ True
+ >>> NpmMapping().normalize_description(
+ ... # ditto (ut looks like little-endian at first)
+ ... "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
+ ... ) is None
+ True
+ >>> NpmMapping().normalize_description(None) is None
+ True
+ """
+ if description is None:
+ return None
+ # XXX: if this function ever needs to support more cases, consider
+ # switching to https://pypi.org/project/ftfy/ instead of adding more hacks
+ if description.startswith("\ufffd\ufffd") and "\x00" in description:
+ # 2 unicode replacement characters followed by '# ' encoded as UTF-16
+ # is a common mistake, which indicates a README.md was saved as UTF-16,
+ # and some NPM tool opened it as UTF-8 and used the first line as
+ # description.
+
+ description_bytes = description.encode()
+
+ # Strip the two unicode replacement characters
+ assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd")
+ description_bytes = description_bytes[6:]
+
+ # If the following attempts fail to recover the description, discard it
+ # entirely because the current indexer storage backend (postgresql) cannot
+ # store zero bytes in JSON columns.
+ description = None
+
+ if not description_bytes.startswith(b"\x00"):
+ # try UTF-16 little-endian (the most common) first
+ try:
+ description = description_bytes.decode("utf-16le")
+ except UnicodeDecodeError:
+ pass
+ if description is None:
+ # if it fails, try UTF-16 big-endian
+ try:
+ description = description_bytes.decode("utf-16be")
+ except UnicodeDecodeError:
+ pass
+
+ if description:
+ if description.startswith("# "):
+ description = description[2:]
+ return description.rstrip()
+ return description
+
def normalize_license(self, s):
"""https://docs.npmjs.com/files/package.json#license
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -1,159 +1,120 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import logging
import re
-from typing import Any, Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
-import click
-
-from swh.indexer.indexer import OriginIndexer
from swh.model.model import SnapshotBranch, TargetType
+from swh.model.swhids import CoreSWHID, ObjectType
from swh.storage.algos.origin import origin_get_latest_visit_status
from swh.storage.algos.snapshot import snapshot_get_all_branches
-class OriginHeadIndexer(OriginIndexer[Dict]):
- """Origin-level indexer.
-
- This indexer is in charge of looking up the revision that acts as the
- "head" of an origin.
-
- In git, this is usually the commit pointed to by the 'master' branch."""
-
- USE_TOOLS = False
-
- def persist_index_computations(self, results: Any) -> Dict[str, int]:
- """Do nothing. The indexer's results are not persistent, they
- should only be piped to another indexer."""
- return {}
-
- # Dispatch
-
- def index(self, id: str, data: None = None, **kwargs) -> List[Dict]:
- origin_url = id
- visit_status = origin_get_latest_visit_status(
- self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True
- )
- if not visit_status:
- return []
- assert visit_status.snapshot is not None
- snapshot = snapshot_get_all_branches(self.storage, visit_status.snapshot)
- if snapshot is None:
- return []
- method = getattr(
- self, "_try_get_%s_head" % visit_status.type, self._try_get_head_generic
- )
-
- rev_id = method(snapshot.branches) # type: ignore
- if rev_id is not None:
- return [
- {
- "origin_url": origin_url,
- "revision_id": rev_id,
- }
- ]
-
- # could not find a head revision
- return []
-
- # Tarballs
-
- _archive_filename_re = re.compile(
- rb"^"
- rb"(?P<pkgname>.*)[-_]"
- rb"(?P<version>[0-9]+(\.[0-9])*)"
- rb"(?P<preversion>[-+][a-zA-Z0-9.~]+?)?"
- rb"(?P<extension>(\.[a-zA-Z0-9]+)+)"
- rb"$"
+def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]:
+ """Returns the SWHID of the head revision or release of an origin"""
+ visit_status = origin_get_latest_visit_status(
+ storage, origin_url, allowed_statuses=["full"], require_snapshot=True
)
+ if not visit_status:
+ return None
+ assert visit_status.snapshot is not None
+ snapshot = snapshot_get_all_branches(storage, visit_status.snapshot)
+ if snapshot is None:
+ return None
+
+ if visit_status.type == "ftp":
+ return _try_get_ftp_head(dict(snapshot.branches))
+ else:
+ return _try_get_head_generic(dict(snapshot.branches))
+
+
+_archive_filename_re = re.compile(
+ rb"^"
+ rb"(?P<pkgname>.*)[-_]"
+ rb"(?P<version>[0-9]+(\.[0-9])*)"
+ rb"(?P<preversion>[-+][a-zA-Z0-9.~]+?)?"
+ rb"(?P<extension>(\.[a-zA-Z0-9]+)+)"
+ rb"$"
+)
- @classmethod
- def _parse_version(cls: Any, filename: bytes) -> Tuple[Union[float, int], ...]:
- """Extracts the release version from an archive filename,
- to get an ordering whose maximum is likely to be the last
- version of the software
-
- >>> OriginHeadIndexer._parse_version(b'foo')
- (-inf,)
- >>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
- (-inf,)
- >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
- (0, 0, 1, 0)
- >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
- (0, 0, 1, -1, 'beta2')
- >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
- (0, 0, 1, 1, 'foobar')
- """
- res = cls._archive_filename_re.match(filename)
- if res is None:
- return (float("-infinity"),)
- version = [int(n) for n in res.group("version").decode().split(".")]
- if res.group("preversion") is None:
- version.append(0)
+
+def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]:
+ """Extracts the release version from an archive filename,
+ to get an ordering whose maximum is likely to be the last
+ version of the software
+
+ >>> _parse_version(b'foo')
+ (-inf,)
+ >>> _parse_version(b'foo.tar.gz')
+ (-inf,)
+ >>> _parse_version(b'gnu-hello-0.0.1.tar.gz')
+ (0, 0, 1, 0)
+ >>> _parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
+ (0, 0, 1, -1, 'beta2')
+ >>> _parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
+ (0, 0, 1, 1, 'foobar')
+ """
+ res = _archive_filename_re.match(filename)
+ if res is None:
+ return (float("-infinity"),)
+ version: List[Union[float, int, str]] = [
+ int(n) for n in res.group("version").decode().split(".")
+ ]
+ if res.group("preversion") is None:
+ version.append(0)
+ else:
+ preversion = res.group("preversion").decode()
+ if preversion.startswith("-"):
+ version.append(-1)
+ version.append(preversion[1:])
+ elif preversion.startswith("+"):
+ version.append(1)
+ version.append(preversion[1:])
else:
- preversion = res.group("preversion").decode()
- if preversion.startswith("-"):
- version.append(-1)
- version.append(preversion[1:])
- elif preversion.startswith("+"):
- version.append(1)
- version.append(preversion[1:])
- else:
- assert False, res.group("preversion")
- return tuple(version)
-
- def _try_get_ftp_head(self, branches: Dict[bytes, SnapshotBranch]) -> Any:
- archive_names = list(branches)
- max_archive_name = max(archive_names, key=self._parse_version)
- r = self._try_resolve_target(branches, max_archive_name)
- return r
-
- # Generic
-
- def _try_get_head_generic(self, branches: Dict[bytes, SnapshotBranch]) -> Any:
- # Works on 'deposit', 'pypi', and VCSs.
- return self._try_resolve_target(branches, b"HEAD") or self._try_resolve_target(
- branches, b"master"
- )
-
- def _try_resolve_target(
- self, branches: Dict[bytes, SnapshotBranch], branch_name: bytes
- ) -> Any:
- try:
- branch = branches[branch_name]
- if branch is None:
- return None
- while branch.target_type == TargetType.ALIAS:
- branch = branches[branch.target]
- if branch is None:
- return None
-
- if branch.target_type == TargetType.REVISION:
- return branch.target
- elif branch.target_type == TargetType.CONTENT:
- return None # TODO
- elif branch.target_type == TargetType.DIRECTORY:
- return None # TODO
- elif branch.target_type == TargetType.RELEASE:
- return None # TODO
- else:
- assert False, branch
- except KeyError:
- return None
+ assert False, res.group("preversion")
+ return tuple(version)
-@click.command()
-@click.option(
- "--origins", "-i", help='Origins to lookup, in the "type+url" format', multiple=True
-)
-def main(origins: List[str]) -> None:
- rev_metadata_indexer = OriginHeadIndexer()
- rev_metadata_indexer.run(origins)
+def _try_get_ftp_head(
+ branches: Dict[bytes, Optional[SnapshotBranch]]
+) -> Optional[CoreSWHID]:
+ archive_names = list(branches)
+ max_archive_name = max(archive_names, key=_parse_version)
+ return _try_resolve_target(branches, max_archive_name)
-if __name__ == "__main__":
- logging.basicConfig(level=logging.INFO)
- main()
+def _try_get_head_generic(
+ branches: Dict[bytes, Optional[SnapshotBranch]]
+) -> Optional[CoreSWHID]:
+ # Works on 'deposit', 'pypi', and VCSs.
+ return _try_resolve_target(branches, b"HEAD") or _try_resolve_target(
+ branches, b"master"
+ )
+
+
+def _try_resolve_target(
+ branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes
+) -> Optional[CoreSWHID]:
+ try:
+ branch = branches[branch_name]
+ if branch is None:
+ return None
+ while branch.target_type == TargetType.ALIAS:
+ branch = branches[branch.target]
+ if branch is None:
+ return None
+
+ if branch.target_type == TargetType.REVISION:
+ return CoreSWHID(object_type=ObjectType.REVISION, object_id=branch.target)
+ elif branch.target_type == TargetType.CONTENT:
+ return None # TODO
+ elif branch.target_type == TargetType.DIRECTORY:
+ return None # TODO
+ elif branch.target_type == TargetType.RELEASE:
+ return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target)
+ else:
+ assert False, branch
+ except KeyError:
+ return None
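
Note (not part of the patch): a hedged usage sketch of the module-level helpers
extracted above. _try_get_ftp_head picks the branch name that _parse_version
orders highest, so the newest-looking tarball becomes the head; the filenames
below reuse the values from the doctest.

    from swh.indexer.origin_head import _parse_version  # private helper defined above

    names = [b"gnu-hello-0.0.1-beta2.tar.gz", b"gnu-hello-0.0.1.tar.gz"]
    print(max(names, key=_parse_version))  # b'gnu-hello-0.0.1.tar.gz': the final release beats the beta
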
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -99,34 +99,34 @@
comment on column content_metadata.metadata is 'result of translation with defined format';
comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
--- The table revision_intrinsic_metadata provides a minimal set of intrinsic
+-- The table directory_intrinsic_metadata provides a minimal set of intrinsic
-- metadata detected with the detection tool (indexer_configuration_id) and
-- aggregated from the content_metadata translation.
-create table revision_intrinsic_metadata(
+create table directory_intrinsic_metadata(
id sha1_git not null,
metadata jsonb not null,
indexer_configuration_id bigint not null,
mappings text array not null
);
-comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision';
-comment on column revision_intrinsic_metadata.id is 'sha1_git of revision';
-comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
-comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
-comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory';
+comment on column directory_intrinsic_metadata.id is 'sha1_git of directory';
+comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
+comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
+comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
create table origin_intrinsic_metadata(
id text not null, -- origin url
metadata jsonb,
indexer_configuration_id bigint not null,
- from_revision sha1_git not null,
+ from_directory sha1_git not null,
metadata_tsvector tsvector,
mappings text array not null
);
comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
comment on column origin_intrinsic_metadata.id is 'url of the origin';
-comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision';
+comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory';
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
-comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';
+comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -273,25 +273,25 @@
-- end content_metadata functions
--- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
+-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata,
-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
--- swh_revision_intrinsic_metadata_missing must take place before calling this
+-- swh_directory_intrinsic_metadata_missing must take place before calling this
-- function.
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
--- tmp_revision_intrinsic_metadata, 2. call this function
-create or replace function swh_revision_intrinsic_metadata_add()
+-- tmp_directory_intrinsic_metadata, 2. call this function
+create or replace function swh_directory_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
+ from tmp_directory_intrinsic_metadata tcm
on conflict(id, indexer_configuration_id)
do update set
metadata = excluded.metadata,
@@ -302,19 +302,19 @@
end
$$;
-comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
+comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata';
--- create a temporary table for retrieving revision_intrinsic_metadata
-create or replace function swh_mktemp_revision_intrinsic_metadata()
+-- create a temporary table for retrieving directory_intrinsic_metadata
+create or replace function swh_mktemp_directory_intrinsic_metadata()
returns void
language sql
as $$
- create temporary table if not exists tmp_revision_intrinsic_metadata (
- like revision_intrinsic_metadata including defaults
+ create temporary table if not exists tmp_directory_intrinsic_metadata (
+ like directory_intrinsic_metadata including defaults
) on commit delete rows;
$$;
-comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata';
+comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata';
-- create a temporary table for retrieving origin_intrinsic_metadata
create or replace function swh_mktemp_origin_intrinsic_metadata()
@@ -380,8 +380,8 @@
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_directory,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -389,7 +389,7 @@
metadata = excluded.metadata,
metadata_tsvector = excluded.metadata_tsvector,
mappings = excluded.mappings,
- from_revision = excluded.from_revision;
+ from_directory = excluded.from_directory;
get diagnostics res = ROW_COUNT;
return res;
diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql
--- a/swh/indexer/sql/60-indexes.sql
+++ b/swh/indexer/sql/60-indexes.sql
@@ -25,12 +25,12 @@
alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
--- revision_intrinsic_metadata
-create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id);
-alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey;
+-- directory_intrinsic_metadata
+create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id);
+alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey;
-alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
-alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey;
+alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey;
-- content_mimetype
create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id);
diff --git a/swh/indexer/sql/upgrades/134.sql b/swh/indexer/sql/upgrades/134.sql
new file mode 100644
--- /dev/null
+++ b/swh/indexer/sql/upgrades/134.sql
@@ -0,0 +1,154 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 133
+-- to_version: 134
+-- description: replace revision_intrinsic_metadata with directory_intrinsic_metadata
+-- and origin_intrinsic_metadata.from_revision with origin_intrinsic_metadata.from_directory
+-- This migration works by dropping both tables and reindexing from scratch.
+
+insert into dbversion(version, release, description)
+ values(134, now(), 'Work In Progress');
+
+drop table origin_intrinsic_metadata;
+drop table revision_intrinsic_metadata;
+drop function swh_revision_intrinsic_metadata_add;
+drop function swh_mktemp_revision_intrinsic_metadata;
+
+
+create table directory_intrinsic_metadata(
+ id sha1_git not null,
+ metadata jsonb not null,
+ indexer_configuration_id bigint not null,
+ mappings text array not null
+);
+
+comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory';
+comment on column directory_intrinsic_metadata.id is 'sha1_git of directory';
+comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
+comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
+comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+
+create table origin_intrinsic_metadata(
+ id text not null, -- origin url
+ metadata jsonb,
+ indexer_configuration_id bigint not null,
+ from_directory sha1_git not null,
+ metadata_tsvector tsvector,
+ mappings text array not null
+);
+
+comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
+comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory';
+comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
+comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+
+-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_directory_intrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
+-- tmp_directory_intrinsic_metadata, 2. call this function
+create or replace function swh_directory_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_directory_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata';
+
+-- create a temporary table for retrieving directory_intrinsic_metadata
+create or replace function swh_mktemp_directory_intrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_directory_intrinsic_metadata (
+ like directory_intrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata';
+
+-- create a temporary table for retrieving origin_intrinsic_metadata
+create or replace function swh_mktemp_origin_intrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_origin_intrinsic_metadata (
+ like origin_intrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata';
+
+-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_intrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
+-- tmp_origin_intrinsic_metadata, 2. call this function
+create or replace function swh_origin_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_intrinsic_metadata_compute_tsvector();
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_directory,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_directory = excluded.from_directory;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
+
+
+
+-- directory_intrinsic_metadata
+create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id);
+alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey;
+
+alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey;
+
+-- origin_intrinsic_metadata
+create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id);
+alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey;
+
+alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -30,8 +30,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -120,7 +120,9 @@
class IndexerStorage:
- """SWH Indexer Storage"""
+ """SWH Indexer Storage Datastore"""
+
+ current_version = 134
def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None):
"""
@@ -152,10 +154,6 @@
if db is not self._db:
db.put_conn()
- @db_transaction()
- def get_current_version(self, *, db=None, cur=None):
- return db.current_version
-
@timed
@db_transaction()
def check_config(self, *, check_write, db=None, cur=None):
@@ -522,52 +520,52 @@
@timed
@db_transaction()
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [
obj[0]
- for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur)
+ for obj in db.directory_intrinsic_metadata_missing_from_list(metadata, cur)
]
@timed
@db_transaction()
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1], db=None, cur=None
- ) -> List[RevisionIntrinsicMetadataRow]:
+ ) -> List[DirectoryIntrinsicMetadataRow]:
return [
- RevisionIntrinsicMetadataRow.from_dict(
+ DirectoryIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
- dict(zip(db.revision_intrinsic_metadata_cols, c))
+ dict(zip(db.directory_intrinsic_metadata_cols, c))
)
)
- for c in db.revision_intrinsic_metadata_get_from_list(ids, cur)
+ for c in db.directory_intrinsic_metadata_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
- def revision_intrinsic_metadata_add(
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
db=None,
cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
- self.journal_writer.write_additions("revision_intrinsic_metadata", metadata)
+ self.journal_writer.write_additions("directory_intrinsic_metadata", metadata)
- db.mktemp_revision_intrinsic_metadata(cur)
+ db.mktemp_directory_intrinsic_metadata(cur)
db.copy_to(
[m.to_dict() for m in metadata],
- "tmp_revision_intrinsic_metadata",
+ "tmp_directory_intrinsic_metadata",
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
- count = db.revision_intrinsic_metadata_add_from_temp(cur)
+ count = db.directory_intrinsic_metadata_add_from_temp(cur)
return {
- "revision_intrinsic_metadata:add": count,
+ "directory_intrinsic_metadata:add": count,
}
@timed
@@ -602,7 +600,13 @@
db.copy_to(
[m.to_dict() for m in metadata],
"tmp_origin_intrinsic_metadata",
- ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
+ [
+ "id",
+ "metadata",
+ "indexer_configuration_id",
+ "from_directory",
+ "mappings",
+ ],
cur,
)
count = db.origin_intrinsic_metadata_add_from_temp(cur)
diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py
--- a/swh/indexer/storage/api/server.py
+++ b/swh/indexer/storage/api/server.py
@@ -86,10 +86,10 @@
if type == "local":
vcfg = cfg["indexer_storage"]
cls = vcfg.get("cls")
- if cls != "local":
+ if cls not in ("local", "postgresql"):
raise ValueError(
"The indexer_storage backend can only be started with a "
- "'local' configuration"
+ "'postgresql' configuration"
)
if not vcfg.get("db"):
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -16,7 +16,6 @@
"""Proxy to the SWH Indexer DB, with wrappers around stored procedures"""
content_mimetype_hash_keys = ["id", "indexer_configuration_id"]
- current_version = 133
def _missing_from_list(
self, table: str, data: Iterable[Dict], hash_keys: List[str], cur=None
@@ -350,18 +349,18 @@
"content_metadata", ids, self.content_metadata_cols, cur=cur
)
- revision_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
+ directory_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
- def revision_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
+ def directory_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata."""
yield from self._missing_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
metadata,
- self.revision_intrinsic_metadata_hash_keys,
+ self.directory_intrinsic_metadata_hash_keys,
cur=cur,
)
- revision_intrinsic_metadata_cols = [
+ directory_intrinsic_metadata_cols = [
"id",
"metadata",
"mappings",
@@ -371,27 +370,27 @@
"tool_configuration",
]
- @stored_procedure("swh_mktemp_revision_intrinsic_metadata")
- def mktemp_revision_intrinsic_metadata(self, cur=None):
+ @stored_procedure("swh_mktemp_directory_intrinsic_metadata")
+ def mktemp_directory_intrinsic_metadata(self, cur=None):
pass
- def revision_intrinsic_metadata_add_from_temp(self, cur=None):
+ def directory_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_revision_intrinsic_metadata_add()")
+ cur.execute("select * from swh_directory_intrinsic_metadata_add()")
return cur.fetchone()[0]
- def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
+ def directory_intrinsic_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
ids,
- self.revision_intrinsic_metadata_cols,
+ self.directory_intrinsic_metadata_cols,
cur=cur,
)
origin_intrinsic_metadata_cols = [
"id",
"metadata",
- "from_revision",
+ "from_directory",
"mappings",
"tool_id",
"tool_name",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -38,8 +38,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -250,8 +250,8 @@
self._content_ctags = SubStorage(ContentCtagsRow, *args)
self._licenses = SubStorage(ContentLicenseRow, *args)
self._content_metadata = SubStorage(ContentMetadataRow, *args)
- self._revision_intrinsic_metadata = SubStorage(
- RevisionIntrinsicMetadataRow, *args
+ self._directory_intrinsic_metadata = SubStorage(
+ DirectoryIntrinsicMetadataRow, *args
)
self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args)
@@ -369,21 +369,21 @@
added = self._content_metadata.add(metadata)
return {"content_metadata:add": added}
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
- return self._revision_intrinsic_metadata.missing(metadata)
+ return self._directory_intrinsic_metadata.missing(metadata)
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- return self._revision_intrinsic_metadata.get(ids)
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ return self._directory_intrinsic_metadata.get(ids)
- def revision_intrinsic_metadata_add(
- self, metadata: List[RevisionIntrinsicMetadataRow]
+ def directory_intrinsic_metadata_add(
+ self, metadata: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._revision_intrinsic_metadata.add(metadata)
- return {"revision_intrinsic_metadata:add": added}
+ added = self._directory_intrinsic_metadata.add(metadata)
+ return {"directory_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_get(
self, urls: Iterable[str]
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -15,8 +15,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
TResult = TypeVar("TResult")
@@ -341,8 +341,8 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/missing")
- def revision_intrinsic_metadata_missing(
+ @remote_api_endpoint("directory_intrinsic_metadata/missing")
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
"""List metadata missing from storage.
@@ -350,7 +350,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id** (bytes): sha1_git revision identifier
+ - **id** (bytes): sha1_git directory identifier
- **indexer_configuration_id** (int): tool used to compute
the results
@@ -360,11 +360,11 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata")
- def revision_intrinsic_metadata_get(
+ @remote_api_endpoint("directory_intrinsic_metadata")
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Retrieve revision metadata per id.
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Retrieve directory metadata per id.
Args:
ids (iterable): sha1 checksums
@@ -375,10 +375,10 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/add")
- def revision_intrinsic_metadata_add(
+ @remote_api_endpoint("directory_intrinsic_metadata/add")
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
) -> Dict[str, int]:
"""Add metadata not present in storage.
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -120,8 +120,8 @@
@attr.s
-class RevisionIntrinsicMetadataRow(BaseRow):
- object_type: Final = "revision_intrinsic_metadata"
+class DirectoryIntrinsicMetadataRow(BaseRow):
+ object_type: Final = "directory_intrinsic_metadata"
id = attr.ib(type=Sha1Git)
metadata = attr.ib(type=Dict[str, Any])
@@ -134,5 +134,5 @@
id = attr.ib(type=str)
metadata = attr.ib(type=Dict[str, Any])
- from_revision = attr.ib(type=Sha1Git)
+ from_directory = attr.ib(type=Sha1Git)
mappings = attr.ib(type=List[str])
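
As a point of reference, the renamed row classes are constructed the same way the tests below do it. This is only a sketch: the tool id, directory hash and origin URL are placeholder values, not data carried by this diff.

    from swh.indexer.storage.model import (
        DirectoryIntrinsicMetadataRow,
        OriginIntrinsicMetadataRow,
    )
    from swh.model.hashutil import hash_to_bytes

    tool_id = 1  # placeholder indexer_configuration id
    directory_id = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")

    # Intrinsic metadata is now keyed by a directory id instead of a revision id.
    dir_row = DirectoryIntrinsicMetadataRow(
        id=directory_id,
        metadata={"name": "example"},
        mappings=["npm"],
        indexer_configuration_id=tool_id,
    )

    # Origin-level rows now record their source through 'from_directory'.
    origin_row = OriginIntrinsicMetadataRow(
        id="https://example.org/user/project",  # placeholder origin URL
        metadata={"name": "example"},
        mappings=["npm"],
        indexer_configuration_id=tool_id,
        from_directory=directory_id,
    )
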
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -13,9 +13,8 @@
from pytest_postgresql import factories
import yaml
-from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact
-from swh.indexer.storage import get_indexer_storage
-from swh.indexer.storage.db import Db as IndexerDb
+from swh.core.db.pytest_plugin import initialize_database_for_module
+from swh.indexer.storage import IndexerStorage, get_indexer_storage
from swh.objstorage.factory import get_objstorage
from swh.storage import get_storage
@@ -23,23 +22,22 @@
TASK_NAMES: List[Tuple[str, str]] = [
# (scheduler-task-type, task-class-test-name)
- ("index-revision-metadata", "revision_intrinsic_metadata"),
+ ("index-directory-metadata", "directory_intrinsic_metadata"),
("index-origin-metadata", "origin_intrinsic_metadata"),
]
idx_postgresql_proc = factories.postgresql_proc(
- dbname="indexer_storage",
load=[
partial(
initialize_database_for_module,
modname="indexer",
- version=IndexerDb.current_version,
+ version=IndexerStorage.current_version,
)
],
)
-idx_storage_postgresql = postgresql_fact("idx_postgresql_proc")
+idx_storage_postgresql = factories.postgresql("idx_postgresql_proc")
@pytest.fixture
diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py
--- a/swh/indexer/tests/storage/conftest.py
+++ b/swh/indexer/tests/storage/conftest.py
@@ -41,9 +41,9 @@
data.tools = tools
data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
- data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
- data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
- data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
+ data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
+ data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
+ data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
data.origin_url_1 = "file:///dev/0/zero" # 44434341
data.origin_url_2 = "file:///dev/1/one" # 44434342
data.origin_url_3 = "file:///dev/2/two" # 54974445
diff --git a/swh/indexer/tests/storage/test_server.py b/swh/indexer/tests/storage/test_server.py
--- a/swh/indexer/tests/storage/test_server.py
+++ b/swh/indexer/tests/storage/test_server.py
@@ -57,13 +57,13 @@
def test_load_and_check_config_remote_config_local_type_raise(
class_storage, tmpdir
) -> None:
- """Any other configuration than 'local' (the default) is rejected"""
+ """Any other configuration than 'postgresql' (the default) is rejected"""
assert class_storage != "local"
incompatible_config = {"indexer_storage": {"cls": class_storage}}
config_path = prepare_config_file(tmpdir, incompatible_config)
expected_error = (
- "The indexer_storage backend can only be started with a 'local' "
+ "The indexer_storage backend can only be started with a 'postgresql' "
"configuration"
)
with pytest.raises(ValueError, match=expected_error):
@@ -82,8 +82,8 @@
def test_load_and_check_config_local_incomplete_configuration(tmpdir) -> None:
- """Incomplete 'local' configuration should raise"""
- config = {"indexer_storage": {"cls": "local"}}
+ """Incomplete 'postgresql' configuration should raise"""
+ config = {"indexer_storage": {"cls": "postgresql"}}
expected_error = "Invalid configuration; missing 'db' config entry"
config_path = prepare_config_file(tmpdir, config)
@@ -95,10 +95,10 @@
"""'Complete 'local' configuration is fine"""
config = {
"indexer_storage": {
- "cls": "local",
+ "cls": "postgresql",
"db": "db",
}
}
config_path = prepare_config_file(tmpdir, config)
- cfg = load_and_check_config(config_path, type="local")
+ cfg = load_and_check_config(config_path, type="postgresql")
assert cfg == config
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -19,8 +19,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.hashutil import hash_to_bytes
@@ -289,37 +289,37 @@
etype = self.endpoint_type
tool = data.tools[self.tool_name]
- data_rev1 = self.row_class.from_dict(
+ data_dir1 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
)
- data_rev2 = self.row_class.from_dict(
+ data_dir2 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[1],
"indexer_configuration_id": tool["id"],
}
)
# when
- summary = endpoint(storage, etype, "add")([data_rev1])
+ summary = endpoint(storage, etype, "add")([data_dir1])
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
- endpoint(storage, etype, "add")([data_rev2, data_rev2])
+ endpoint(storage, etype, "add")([data_dir2, data_dir2])
# then
actual_data = list(
- endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1])
+ endpoint(storage, etype, "get")([data.directory_id_2, data.directory_id_1])
)
expected_data = [
self.row_class.from_dict(
- {"id": data.revision_id_2, **self.example_data[0], "tool": tool}
+ {"id": data.directory_id_2, **self.example_data[0], "tool": tool}
)
]
assert actual_data == expected_data
@@ -806,11 +806,11 @@
row_class = ContentMetadataRow
-class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester):
- """Test Indexer Storage revision_intrinsic_metadata related methods"""
+class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester):
+ """Test Indexer Storage directory_intrinsic_metadata related methods"""
tool_name = "swh-metadata-detector"
- endpoint_type = "revision_intrinsic_metadata"
+ endpoint_type = "directory_intrinsic_metadata"
example_data = [
{
"metadata": {
@@ -830,7 +830,7 @@
"mappings": ["mapping2"],
},
]
- row_class = RevisionIntrinsicMetadataRow
+ row_class = DirectoryIntrinsicMetadataRow
class TestIndexerStorageContentFossologyLicense(StorageETypeTester):
@@ -1102,8 +1102,8 @@
"version": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1113,11 +1113,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
storage.origin_intrinsic_metadata_add([metadata_origin])
# then
@@ -1130,7 +1130,7 @@
id=data.origin_url_1,
metadata=metadata,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=["mapping1"],
)
]
@@ -1156,8 +1156,8 @@
"version": None,
"name": None,
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata_v1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1167,11 +1167,11 @@
metadata=metadata_v1.copy(),
indexer_configuration_id=tool_id,
mappings=[],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add([metadata_origin_v1])
# when
@@ -1185,7 +1185,7 @@
id=data.origin_url_1,
metadata=metadata_v1,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=[],
)
]
@@ -1199,16 +1199,16 @@
"author": "MG",
}
)
- metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
+ metadata_dir_v2 = attr.evolve(metadata_dir_v1, metadata=metadata_v2)
metadata_origin_v2 = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v2.copy(),
indexer_configuration_id=tool_id,
mappings=["npm"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
- storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v2])
storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
@@ -1220,7 +1220,7 @@
id=data.origin_url_1,
metadata=metadata_v2,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
mappings=["npm"],
)
]
@@ -1252,8 +1252,8 @@
"mappings": [],
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata={
"version": None,
"name": None,
@@ -1265,7 +1265,7 @@
data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data1,
)
@@ -1274,7 +1274,7 @@
data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data2,
)
@@ -1287,7 +1287,7 @@
data_v2b = list(reversed(data_v2[0:-1]))
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add(data_v1)
# when
@@ -1296,7 +1296,7 @@
expected_data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data1,
)
@@ -1326,7 +1326,7 @@
expected_data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data2,
)
@@ -1351,8 +1351,8 @@
"developmentStatus": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1362,11 +1362,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
with pytest.raises(DuplicateId):
storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin])
@@ -1381,8 +1381,8 @@
metadata1 = {
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1392,13 +1392,13 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1408,13 +1408,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1444,8 +1444,8 @@
"Jane Doe",
]
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1455,7 +1455,7 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": [
@@ -1463,8 +1463,8 @@
"Jane Doe",
]
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1474,13 +1474,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1508,8 +1508,8 @@
"@context": "foo",
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
@@ -1519,14 +1519,14 @@
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"@context": "foo",
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1536,13 +1536,13 @@
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
metadata3 = {
"@context": "foo",
}
- metadata3_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_3,
+ metadata3_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_3,
metadata=metadata3,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1552,14 +1552,14 @@
metadata=metadata3,
mappings=["pkg-info"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_3,
+ from_directory=data.directory_id_3,
)
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
- storage.revision_intrinsic_metadata_add([metadata3_rev])
+ storage.directory_intrinsic_metadata_add([metadata3_dir])
storage.origin_intrinsic_metadata_add([metadata3_origin])
def test_origin_intrinsic_metadata_search_by_producer(
@@ -1685,7 +1685,7 @@
},
mappings=["npm", "gemspec"],
tool=tool2,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
],
next_page_token=None,
diff --git a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py
--- a/swh/indexer/tests/tasks.py
+++ b/swh/indexer/tests/tasks.py
@@ -1,13 +1,12 @@
from celery import current_app as app
-from swh.indexer.metadata import OriginMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import DirectoryMetadataIndexer, OriginMetadataIndexer
from .test_metadata import ContentMetadataTestIndexer
-from .test_origin_head import OriginHeadTestIndexer
from .utils import BASE_TEST_CONFIG
-class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
+class DirectoryMetadataTestIndexer(DirectoryMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
@@ -30,13 +29,12 @@
return {**BASE_TEST_CONFIG, "tools": []}
def _prepare_sub_indexers(self):
- self.origin_head_indexer = OriginHeadTestIndexer()
- self.revision_metadata_indexer = RevisionMetadataTestIndexer()
+ self.directory_metadata_indexer = DirectoryMetadataTestIndexer()
@app.task
-def revision_intrinsic_metadata(*args, **kwargs):
- indexer = RevisionMetadataTestIndexer()
+def directory_intrinsic_metadata(*args, **kwargs):
+ indexer = DirectoryMetadataTestIndexer()
indexer.run(*args, **kwargs)
print("REV RESULT=", indexer.results)
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -16,13 +16,15 @@
from swh.indexer.cli import indexer_cli_group
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.journal.writer import get_journal_writer
from swh.model.hashutil import hash_to_bytes
from swh.model.model import OriginVisitStatus
+from .utils import DIRECTORY2, REVISION
+
def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
tools: List[Dict[str, Any]] = [
@@ -38,15 +40,15 @@
origin_metadata = [
OriginIntrinsicMetadataRow(
id="file://dev/%04d" % origin_id,
- from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)),
+ from_directory=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
mappings=["mapping%d" % (origin_id % 10)],
)
for origin_id in range(nb_rows)
]
- revision_metadata = [
- RevisionIntrinsicMetadataRow(
+ directory_metadata = [
+ DirectoryIntrinsicMetadataRow(
id=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
@@ -55,7 +57,7 @@
for origin_id in range(nb_rows)
]
- idx_storage.revision_intrinsic_metadata_add(revision_metadata)
+ idx_storage.directory_intrinsic_metadata_add(directory_metadata)
idx_storage.origin_intrinsic_metadata_add(origin_metadata)
return [tool["id"] for tool in tools]
@@ -400,7 +402,7 @@
return datetime.datetime.now(tz=datetime.timezone.utc)
-def test_cli_journal_client(
+def test_cli_journal_client_schedule(
cli_runner,
swh_config,
indexer_scheduler,
@@ -523,3 +525,131 @@
],
catch_exceptions=False,
)
+
+
+@pytest.mark.parametrize("indexer_name", ["origin-intrinsic-metadata", "*"])
+def test_cli_journal_client_index(
+ cli_runner,
+ swh_config,
+ kafka_prefix: str,
+ kafka_server,
+ consumer: Consumer,
+ idx_storage,
+ storage,
+ mocker,
+ swh_indexer_config,
+ indexer_name: str,
+):
+ """Test the 'swh indexer journal-client' cli tool."""
+ journal_writer = get_journal_writer(
+ "kafka",
+ brokers=[kafka_server],
+ prefix=kafka_prefix,
+ client_id="test producer",
+ value_sanitizer=lambda object_type, value: value,
+ flush_timeout=3, # fail early if something is going wrong
+ )
+
+ visit_statuses = [
+ OriginVisitStatus(
+ origin="file:///dev/zero",
+ visit=1,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus(
+ origin="file:///dev/foobar",
+ visit=2,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus(
+ origin="file:///tmp/spamegg",
+ visit=3,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus(
+ origin="file:///dev/0002",
+ visit=6,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus( # will be filtered out due to its 'partial' status
+ origin="file:///dev/0000",
+ visit=4,
+ date=now(),
+ status="partial",
+ snapshot=None,
+ ),
+ OriginVisitStatus( # will be filtered out due to its 'ongoing' status
+ origin="file:///dev/0001",
+ visit=5,
+ date=now(),
+ status="ongoing",
+ snapshot=None,
+ ),
+ ]
+
+ journal_writer.write_additions("origin_visit_status", visit_statuses)
+ visit_statuses_full = [vs for vs in visit_statuses if vs.status == "full"]
+ storage.revision_add([REVISION])
+
+ mocker.patch(
+ "swh.indexer.metadata.get_head_swhid",
+ return_value=REVISION.swhid(),
+ )
+
+ mocker.patch(
+ "swh.indexer.metadata.DirectoryMetadataIndexer.index",
+ return_value=[
+ DirectoryIntrinsicMetadataRow(
+ id=DIRECTORY2.id,
+ indexer_configuration_id=1,
+ mappings=["cff"],
+ metadata={"foo": "bar"},
+ )
+ ],
+ )
+ result = cli_runner.invoke(
+ indexer_cli_group,
+ [
+ "-C",
+ swh_config,
+ "journal-client",
+ indexer_name,
+ "--broker",
+ kafka_server,
+ "--prefix",
+ kafka_prefix,
+ "--group-id",
+ "test-consumer",
+ "--stop-after-objects",
+ len(visit_statuses),
+ ],
+ catch_exceptions=False,
+ )
+
+ # Check the output
+ expected_output = "Done.\n"
+ assert result.exit_code == 0, result.output
+ assert result.output == expected_output
+
+ results = idx_storage.origin_intrinsic_metadata_get(
+ [status.origin for status in visit_statuses]
+ )
+ expected_results = [
+ OriginIntrinsicMetadataRow(
+ id=status.origin,
+ from_directory=DIRECTORY2.id,
+ tool={"id": 1, **swh_indexer_config["tools"]},
+ mappings=["cff"],
+ metadata={"foo": "bar"},
+ )
+ for status in sorted(visit_statuses_full, key=lambda r: r.origin)
+ ]
+ assert sorted(results, key=lambda r: r.id) == expected_results
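
For context, this new test drives roughly the following command line; the config path, broker address, group id and object count below are placeholders rather than values from this diff:

    swh indexer -C indexer.yml journal-client origin-intrinsic-metadata \
        --broker kafka:9092 \
        --prefix swh.journal.objects \
        --group-id swh.indexer.metadata \
        --stop-after-objects 6

Passing an indexer name makes the journal client index origin_visit_status messages directly instead of scheduling tasks, which is what the assertions on origin_intrinsic_metadata_get verify.
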
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -11,13 +11,13 @@
from swh.indexer.indexer import (
ContentIndexer,
ContentPartitionIndexer,
+ DirectoryIndexer,
OriginIndexer,
- RevisionIndexer,
)
from swh.indexer.storage import PagedResult, Sha1
from swh.model.model import Content
-from .utils import BASE_TEST_CONFIG
+from .utils import BASE_TEST_CONFIG, DIRECTORY2
class _TestException(Exception):
@@ -49,7 +49,7 @@
pass
-class CrashingRevisionIndexer(CrashingIndexerMixin, RevisionIndexer):
+class CrashingDirectoryIndexer(CrashingIndexerMixin, DirectoryIndexer):
pass
@@ -86,29 +86,43 @@
indexer.run([b"foo"])
-def test_revision_indexer_catch_exceptions():
- indexer = CrashingRevisionIndexer(config=BASE_TEST_CONFIG)
+def test_directory_indexer_catch_exceptions():
+ indexer = CrashingDirectoryIndexer(config=BASE_TEST_CONFIG)
indexer.storage = Mock()
- indexer.storage.revision_get.return_value = ["rev"]
+ indexer.storage.directory_get.return_value = [DIRECTORY2]
assert indexer.run([b"foo"]) == {"status": "failed"}
+ assert indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) == {
+ "status": "failed"
+ }
+
indexer.catch_exceptions = False
with pytest.raises(_TestException):
indexer.run([b"foo"])
+ with pytest.raises(_TestException):
+ indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]})
+
def test_origin_indexer_catch_exceptions():
indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG)
assert indexer.run(["http://example.org"]) == {"status": "failed"}
+ assert indexer.process_journal_objects(
+ {"origin": [{"url": "http://example.org"}]}
+ ) == {"status": "failed"}
+
indexer.catch_exceptions = False
with pytest.raises(_TestException):
indexer.run(["http://example.org"])
+ with pytest.raises(_TestException):
+ indexer.process_journal_objects({"origin": [{"url": "http://example.org"}]})
+
def test_content_partition_indexer_catch_exceptions():
indexer = CrashingContentPartitionIndexer(
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,24 +1,25 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
-import unittest
+import logging
from hypothesis import HealthCheck, given, settings, strategies
+import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
-from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
-from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow
-from swh.indexer.tests.utils import DIRECTORY2, REVISION
+from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
+from swh.indexer.tests.utils import DIRECTORY2
from swh.model.hashutil import hash_to_bytes
-from swh.model.model import Directory, DirectoryEntry, Revision
+from swh.model.model import Directory, DirectoryEntry
from .utils import (
BASE_TEST_CONFIG,
@@ -42,25 +43,21 @@
"""
def parse_config_file(self, *args, **kwargs):
- assert False, "should not be called; the rev indexer configures it."
+ assert False, "should not be called; the dir indexer configures it."
-REVISION_METADATA_CONFIG = {
+DIRECTORY_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
-class Metadata(unittest.TestCase):
+class TestMetadata:
"""
Tests metadata_mock_tool tool for Metadata detection
"""
- def setUp(self):
- """
- shows the entire diff in the results
- """
- self.maxDiff = None
+ def setup_method(self):
self.npm_mapping = MAPPINGS["NpmMapping"]()
self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
self.maven_mapping = MAPPINGS["MavenMapping"]()
@@ -81,7 +78,7 @@
# when
result = self.npm_mapping.translate(content)
# then
- self.assertEqual(declared_metadata, result)
+ assert declared_metadata == result
def test_compute_metadata_cff(self):
"""
@@ -160,7 +157,7 @@
# when
result = self.cff_mapping.translate(content)
# then
- self.assertEqual(expected, result)
+ assert expected == result
def test_compute_metadata_npm(self):
"""
@@ -201,7 +198,7 @@
# when
result = self.npm_mapping.translate(content)
# then
- self.assertEqual(declared_metadata, result)
+ assert declared_metadata == result
def test_index_content_metadata_npm(self):
"""
@@ -275,7 +272,7 @@
del result.tool["id"]
# The assertion below returns False sometimes because of nested lists
- self.assertEqual(expected_results, results)
+ assert expected_results == results
def test_npm_bugs_normalization(self):
# valid dictionary
@@ -287,15 +284,12 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "issueTracker": "https://github.com/owner/project/issues",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "issueTracker": "https://github.com/owner/project/issues",
+ "type": "SoftwareSourceCode",
+ }
# "invalid" dictionary
package_json = b"""{
@@ -305,14 +299,11 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "type": "SoftwareSourceCode",
+ }
# string
package_json = b"""{
@@ -320,15 +311,12 @@
"bugs": "https://github.com/owner/project/issues"
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "issueTracker": "https://github.com/owner/project/issues",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "issueTracker": "https://github.com/owner/project/issues",
+ "type": "SoftwareSourceCode",
+ }
def test_npm_repository_normalization(self):
# normal
@@ -340,15 +328,12 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "codeRepository": "git+https://github.com/npm/cli.git",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "codeRepository": "git+https://github.com/npm/cli.git",
+ "type": "SoftwareSourceCode",
+ }
# missing url
package_json = b"""{
@@ -358,14 +343,11 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "type": "SoftwareSourceCode",
+ }
# github shortcut
package_json = b"""{
@@ -379,7 +361,7 @@
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
- self.assertEqual(result, expected_result)
+ assert result == expected_result
# github shortshortcut
package_json = b"""{
@@ -387,7 +369,7 @@
"repository": "npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(result, expected_result)
+ assert result == expected_result
# gitlab shortcut
package_json = b"""{
@@ -395,52 +377,48 @@
"repository": "gitlab:user/repo"
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "codeRepository": "git+https://gitlab.com/user/repo.git",
+ "type": "SoftwareSourceCode",
+ }
+
+ @pytest.mark.parametrize(
+ "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
+ )
+ def test_detect_metadata_package_json(self, filename):
+ # given
+ df = [
{
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "codeRepository": "git+https://gitlab.com/user/repo.git",
- "type": "SoftwareSourceCode",
+ "sha1_git": b"abc",
+ "name": b"index.js",
+ "target": b"abc",
+ "length": 897,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"bcd",
},
- )
-
- def test_detect_metadata_package_json(self):
- filenames = [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
-
- for filename in filenames:
- with self.subTest(filename=filename):
- # given
- df = [
- {
- "sha1_git": b"abc",
- "name": b"index.js",
- "target": b"abc",
- "length": 897,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"bcd",
- },
- {
- "sha1_git": b"aab",
- "name": filename,
- "target": b"aab",
- "length": 712,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"cde",
- },
- ]
- # when
- results = detect_metadata(df)
+ {
+ "sha1_git": b"aab",
+ "name": filename,
+ "target": b"aab",
+ "length": 712,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"cde",
+ },
+ ]
+ # when
+ results = detect_metadata(df)
- expected_results = {"NpmMapping": [b"cde"]}
- # then
- self.assertEqual(expected_results, results)
+ expected_results = {"NpmMapping": [b"cde"]}
+ # then
+ assert expected_results == results
def test_detect_metadata_codemeta_json_uppercase(self):
# given
@@ -473,7 +451,7 @@
expected_results = {"CodemetaMapping": [b"bcd"]}
# then
- self.assertEqual(expected_results, results)
+ assert expected_results == results
def test_compute_metadata_valid_codemeta(self):
raw_content = b"""{
@@ -580,7 +558,7 @@
"programmingLanguage": "JSON-LD",
}
result = self.codemeta_mapping.translate(raw_content)
- self.assertEqual(result, expected_result)
+ assert result == expected_result
def test_compute_metadata_codemeta_alternate_context(self):
raw_content = b"""{
@@ -594,7 +572,7 @@
"identifier": "CodeMeta",
}
result = self.codemeta_mapping.translate(raw_content)
- self.assertEqual(result, expected_result)
+ assert result == expected_result
def test_compute_metadata_maven(self):
raw_content = b"""
@@ -625,33 +603,27 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
- "codeRepository": (
- "http://repo1.maven.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ "codeRepository": (
+ "http://repo1.maven.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
def test_compute_metadata_maven_empty(self):
raw_content = b"""
<project>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
def test_compute_metadata_maven_almost_empty(self):
raw_content = b"""
@@ -659,81 +631,85 @@
<foo/>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
- def test_compute_metadata_maven_invalid_xml(self):
+ def test_compute_metadata_maven_invalid_xml(self, caplog):
expected_warning = (
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error parsing XML from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error parsing XML from foo",
)
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
<project>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
raw_content = b"""
"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
- def test_compute_metadata_maven_unknown_encoding(self):
+ def test_compute_metadata_maven_unknown_encoding(self, caplog):
expected_warning = (
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error detecting XML encoding from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error detecting XML encoding from foo",
)
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""<?xml version="1.0" encoding="foo"?>
<project>
</project>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
raw_content = b"""<?xml version="1.0" encoding="UTF-7"?>
<project>
</project>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
- def test_compute_metadata_maven_invalid_encoding(self):
+ def test_compute_metadata_maven_invalid_encoding(self, caplog):
expected_warning = [
# libexpat1 <= 2.2.10-2+deb11u1
[
(
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error unidecoding XML from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error unidecoding XML from foo",
)
],
# libexpat1 >= 2.2.10-2+deb11u2
[
(
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error parsing XML from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error parsing XML from foo",
)
],
]
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""<?xml version="1.0" encoding="UTF-8"?>
<foo\xe5ct>
</foo>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertIn(cm.output, expected_warning)
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples in expected_warning
+ assert result is None
def test_compute_metadata_maven_minimal(self):
raw_content = b"""
@@ -745,19 +721,16 @@
<version>1.2.3</version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
def test_compute_metadata_maven_empty_nodes(self):
raw_content = b"""
@@ -771,19 +744,16 @@
</repositories>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -794,18 +764,15 @@
<version></version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -816,18 +783,15 @@
<version>1.2.3</version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -840,19 +804,16 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -860,14 +821,11 @@
<version>1.2.3</version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "version": "1.2.3",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "version": "1.2.3",
+ }
def test_compute_metadata_maven_invalid_licenses(self):
raw_content = b"""
@@ -882,19 +840,16 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
def test_compute_metadata_maven_multiple(self):
"""Tests when there are multiple code repos and licenses."""
@@ -936,24 +891,21 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "license": [
- "https://www.apache.org/licenses/LICENSE-2.0.txt",
- "https://opensource.org/licenses/MIT",
- ],
- "codeRepository": [
- "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
- "http://example.org/maven2/com/mycompany/app/my-app",
- ],
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "license": [
+ "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ "https://opensource.org/licenses/MIT",
+ ],
+ "codeRepository": [
+ "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
+ "http://example.org/maven2/com/mycompany/app/my-app",
+ ],
+ }
def test_compute_metadata_pkginfo(self):
raw_content = b"""\
@@ -987,40 +939,33 @@
Provides-Extra: testing
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertCountEqual(
- result["description"],
- [
- "Software Heritage core utilities", # note the comma here
- "swh-core\n"
- "========\n"
- "\n"
- "core library for swh's modules:\n"
- "- config parser\n"
- "- hash computations\n"
- "- serialization\n"
- "- logging mechanism\n"
- "",
- ],
- result,
- )
+ assert result["description"] == [
+ "Software Heritage core utilities", # note the comma here
+ "swh-core\n"
+ "========\n"
+ "\n"
+ "core library for swh's modules:\n"
+ "- config parser\n"
+ "- hash computations\n"
+ "- serialization\n"
+ "- logging mechanism\n"
+ "",
+ ], result
del result["description"]
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "url": "https://forge.softwareheritage.org/diffusion/DCORE/",
- "name": "swh.core",
- "author": [
- {
- "type": "Person",
- "name": "Software Heritage developers",
- "email": "swh-devel@inria.fr",
- }
- ],
- "version": "0.0.49",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "url": "https://forge.softwareheritage.org/diffusion/DCORE/",
+ "name": "swh.core",
+ "author": [
+ {
+ "type": "Person",
+ "name": "Software Heritage developers",
+ "email": "swh-devel@inria.fr",
+ }
+ ],
+ "version": "0.0.49",
+ }
def test_compute_metadata_pkginfo_utf8(self):
raw_content = b"""\
@@ -1031,15 +976,12 @@
Hydrology N\xc2\xb083
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "snowpyt",
- "description": "foo\nHydrology N°83",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "snowpyt",
+ "description": "foo\nHydrology N°83",
+ }
def test_compute_metadata_pkginfo_keywords(self):
raw_content = b"""\
@@ -1048,15 +990,12 @@
Keywords: foo bar baz
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "foo",
- "keywords": ["foo", "bar", "baz"],
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "foo",
+ "keywords": ["foo", "bar", "baz"],
+ }
def test_compute_metadata_pkginfo_license(self):
raw_content = b"""\
@@ -1065,15 +1004,12 @@
License: MIT
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "foo",
- "license": "MIT",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "foo",
+ "license": "MIT",
+ }
def test_gemspec_base(self):
raw_content = b"""
@@ -1090,23 +1026,20 @@
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertCountEqual(
- result.pop("description"),
- ["This is an example!", "Much longer explanation of the example!"],
- )
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [{"type": "Person", "name": "Ruby Coder"}],
- "name": "example",
- "license": "https://spdx.org/licenses/MIT",
- "codeRepository": "https://rubygems.org/gems/example",
- "email": "rubycoder@example.com",
- "version": "0.1.0",
- },
- )
+ assert set(result.pop("description")) == {
+ "This is an example!",
+ "Much longer explanation of the example!",
+ }
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"type": "Person", "name": "Ruby Coder"}],
+ "name": "example",
+ "license": "https://spdx.org/licenses/MIT",
+ "codeRepository": "https://rubygems.org/gems/example",
+ "email": "rubycoder@example.com",
+ "version": "0.1.0",
+ }
def test_gemspec_two_author_fields(self):
raw_content = b"""
@@ -1115,20 +1048,20 @@
s.author = "Ruby Coder2"
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertCountEqual(
- result.pop("author"),
+ assert result.pop("author") in (
[
{"type": "Person", "name": "Ruby Coder1"},
{"type": "Person", "name": "Ruby Coder2"},
],
+ [
+ {"type": "Person", "name": "Ruby Coder2"},
+ {"type": "Person", "name": "Ruby Coder1"},
+ ],
)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
def test_gemspec_invalid_author(self):
raw_content = b"""
@@ -1136,38 +1069,29 @@
s.author = ["Ruby Coder"]
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
raw_content = b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [{"type": "Person", "name": "Ruby Coder1"}],
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"type": "Person", "name": "Ruby Coder1"}],
+ }
def test_gemspec_alternative_header(self):
raw_content = b"""
@@ -1179,15 +1103,12 @@
}
"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "rb-system-with-aliases",
- "description": "execute system commands with aliases",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "rb-system-with-aliases",
+ "description": "execute system commands with aliases",
+ }
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(NpmMapping.mapping)))
@@ -1233,8 +1154,8 @@
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
- def test_revision_metadata_indexer(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
@@ -1242,8 +1163,7 @@
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ dir_ = DIRECTORY2
metadata_indexer.idx_storage.content_metadata_add(
[
@@ -1255,15 +1175,17 @@
]
)
- metadata_indexer.run([rev.id])
+ metadata_indexer.run([dir_.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get(
+ [DIRECTORY2.id]
+ )
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=dir_.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -1274,35 +1196,29 @@
del result.tool["id"]
# then
- self.assertEqual(results, expected_results)
+ assert results == expected_results
- def test_revision_metadata_indexer_single_root_dir(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer_single_root_dir(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory, that is the only directory at the root
- # of the revision
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ # of the directory
+ dir_ = DIRECTORY2
- directory = Directory(
+ new_dir = Directory(
entries=(
DirectoryEntry(
name=b"foobar-1.0.0",
type="dir",
- target=rev.directory,
+ target=dir_.id,
perms=16384,
),
),
)
- assert directory.id is not None
- metadata_indexer.storage.directory_add([directory])
-
- new_rev_dict = {**rev.to_dict(), "directory": directory.id}
- new_rev_dict.pop("id")
- new_rev = Revision.from_dict(new_rev_dict)
- metadata_indexer.storage.revision_add([new_rev])
+ assert new_dir.id is not None
+ metadata_indexer.storage.directory_add([new_dir])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
@@ -1319,15 +1235,15 @@
]
)
- metadata_indexer.run([new_rev.id])
+ metadata_indexer.run([new_dir.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=new_rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=new_dir.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -1338,4 +1254,4 @@
del result.tool["id"]
# then
- self.assertEqual(results, expected_results)
+ assert results == expected_results
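
Condensed, the flow these updated tests exercise looks like the sketch below; DIRECTORY_METADATA_CONFIG and DIRECTORY2 are the test fixtures defined in this file and in tests/utils.py, and the storage must already contain the directory, so this is illustrative rather than a standalone program.

    from swh.indexer.metadata import DirectoryMetadataIndexer
    from swh.indexer.tests.test_metadata import DIRECTORY_METADATA_CONFIG
    from swh.indexer.tests.utils import DIRECTORY2

    # Index a directory (previously: a revision) and read the result back
    # from the indexer storage.
    indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
    indexer.run([DIRECTORY2.id])

    rows = indexer.idx_storage.directory_intrinsic_metadata_get([DIRECTORY2.id])
    for row in rows:
        print(row.id.hex(), row.mappings, row.metadata)
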
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -1,15 +1,13 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import copy
from datetime import datetime, timezone
-import unittest
import pytest
-from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.origin_head import get_head_swhid
from swh.indexer.tests.utils import fill_storage
from swh.model.model import (
Origin,
@@ -19,37 +17,9 @@
SnapshotBranch,
TargetType,
)
+from swh.model.swhids import CoreSWHID
from swh.storage.utils import now
-
-@pytest.fixture
-def swh_indexer_config(swh_indexer_config):
- config = copy.deepcopy(swh_indexer_config)
- config.update(
- {
- "tools": {
- "name": "origin-metadata",
- "version": "0.0.1",
- "configuration": {},
- },
- "tasks": {
- "revision_intrinsic_metadata": None,
- "origin_intrinsic_metadata": None,
- },
- }
- )
- return config
-
-
-class OriginHeadTestIndexer(OriginHeadIndexer):
- """Specific indexer whose configuration is enough to satisfy the
- indexing tests.
- """
-
- def persist_index_computations(self, results):
- self.results = results
-
-
SAMPLE_SNAPSHOT = Snapshot(
branches={
b"foo": None,
@@ -61,156 +31,127 @@
)
-class OriginHead(unittest.TestCase):
- @pytest.fixture(autouse=True)
- def init(self, swh_config):
- super().setUp()
- self.indexer = OriginHeadTestIndexer()
- self.indexer.catch_exceptions = False
- fill_storage(self.indexer.storage)
-
- def test_git(self):
- origin_url = "https://github.com/SoftwareHeritage/swh-storage"
- self.indexer.run([origin_url])
- rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
-
- def test_git_partial_snapshot(self):
- """Checks partial snapshots are ignored."""
- origin_url = "https://github.com/SoftwareHeritage/swh-core"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- visit = self.indexer.storage.origin_visit_add(
- [
- OriginVisit(
- origin=origin_url,
- date=datetime(2019, 2, 27, tzinfo=timezone.utc),
- type="git",
- )
- ]
- )[0]
- self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
- visit_status = OriginVisitStatus(
- origin=origin_url,
- visit=visit.visit,
- date=now(),
- status="partial",
- snapshot=SAMPLE_SNAPSHOT.id,
- )
- self.indexer.storage.origin_visit_status_add([visit_status])
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_vcs_missing_snapshot(self):
- origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_pypi_missing_branch(self):
- origin_url = "https://pypi.org/project/abcdef/"
- self.indexer.storage.origin_add(
- [
- Origin(
- url=origin_url,
- )
- ]
- )
- visit = self.indexer.storage.origin_visit_add(
- [
- OriginVisit(
- origin=origin_url,
- date=datetime(2019, 2, 27, tzinfo=timezone.utc),
- type="pypi",
- )
- ]
- )[0]
- self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
- visit_status = OriginVisitStatus(
- origin=origin_url,
- visit=visit.visit,
- date=now(),
- status="full",
- snapshot=SAMPLE_SNAPSHOT.id,
- )
- self.indexer.storage.origin_visit_status_add([visit_status])
- self.indexer.run(["https://pypi.org/project/abcdef/"])
- self.assertEqual(self.indexer.results, [])
-
- def test_ftp(self):
- origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
- self.indexer.run([origin_url])
- rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
-
- def test_ftp_missing_snapshot(self):
- origin_url = "rsync://ftp.gnu.org/gnu/foobar"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_deposit(self):
- origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- self.indexer.run([origin_url])
- rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
-
- def test_deposit_missing_snapshot(self):
- origin_url = "https://forge.softwareheritage.org/source/foobar"
- self.indexer.storage.origin_add(
- [
- Origin(
- url=origin_url,
- )
- ]
- )
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_pypi(self):
- origin_url = "https://pypi.org/project/limnoria/"
- self.indexer.run([origin_url])
-
- rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t"
- self.assertEqual(
- self.indexer.results,
- [{"revision_id": rev_id, "origin_url": origin_url}],
- )
-
- def test_svn(self):
- origin_url = "http://0-512-md.googlecode.com/svn/"
- self.indexer.run([origin_url])
- rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
+@pytest.fixture
+def storage(swh_storage):
+ fill_storage(swh_storage)
+ return swh_storage
+
+
+def test_git(storage):
+ origin_url = "https://github.com/SoftwareHeritage/swh-storage"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:384b12006403cce45d6253e38f7bd77dacef726d"
+ )
+
+
+def test_git_partial_snapshot(storage):
+ """Checks partial snapshots are ignored."""
+ origin_url = "https://github.com/SoftwareHeritage/swh-core"
+ storage.origin_add([Origin(url=origin_url)])
+ visit = storage.origin_visit_add(
+ [
+ OriginVisit(
+ origin=origin_url,
+ date=datetime(2019, 2, 27, tzinfo=timezone.utc),
+ type="git",
+ )
+ ]
+ )[0]
+ storage.snapshot_add([SAMPLE_SNAPSHOT])
+ visit_status = OriginVisitStatus(
+ origin=origin_url,
+ visit=visit.visit,
+ date=now(),
+ status="partial",
+ snapshot=SAMPLE_SNAPSHOT.id,
+ )
+ storage.origin_visit_status_add([visit_status])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_vcs_missing_snapshot(storage):
+ origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
+ storage.origin_add([Origin(url=origin_url)])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_pypi_missing_branch(storage):
+ origin_url = "https://pypi.org/project/abcdef/"
+ storage.origin_add(
+ [
+ Origin(
+ url=origin_url,
+ )
+ ]
+ )
+ visit = storage.origin_visit_add(
+ [
+ OriginVisit(
+ origin=origin_url,
+ date=datetime(2019, 2, 27, tzinfo=timezone.utc),
+ type="pypi",
+ )
+ ]
+ )[0]
+ storage.snapshot_add([SAMPLE_SNAPSHOT])
+ visit_status = OriginVisitStatus(
+ origin=origin_url,
+ visit=visit.visit,
+ date=now(),
+ status="full",
+ snapshot=SAMPLE_SNAPSHOT.id,
+ )
+ storage.origin_visit_status_add([visit_status])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_ftp(storage):
+ origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79"
+ )
+
+
+def test_ftp_missing_snapshot(storage):
+ origin_url = "rsync://ftp.gnu.org/gnu/foobar"
+ storage.origin_add([Origin(url=origin_url)])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_deposit(storage):
+ origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
+ storage.origin_add([Origin(url=origin_url)])
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb"
+ )
+
+
+def test_deposit_missing_snapshot(storage):
+ origin_url = "https://forge.softwareheritage.org/source/foobar"
+ storage.origin_add(
+ [
+ Origin(
+ url=origin_url,
+ )
+ ]
+ )
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_pypi(storage):
+ origin_url = "https://old-pypi.example.org/project/limnoria/"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
+ )
+
+ origin_url = "https://pypi.org/project/limnoria/"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
+ )
+
+
+def test_svn(storage):
+ origin_url = "http://0-512-md.googlecode.com/svn/"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:e43f72e12c88abece79a87b8c9ad232e1b773d18"
+ )
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -11,14 +11,14 @@
from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.model import Origin
from swh.storage.interface import StorageInterface
from .test_metadata import TRANSLATOR_TOOL
-from .utils import REVISION, YARN_PARSER_METADATA
+from .utils import DIRECTORY2, YARN_PARSER_METADATA
@pytest.fixture
@@ -29,7 +29,47 @@
return cfg
-def test_origin_metadata_indexer(
+def test_origin_metadata_indexer_release(
+ swh_indexer_config,
+ idx_storage: IndexerStorageInterface,
+ storage: StorageInterface,
+ obj_storage,
+) -> None:
+ indexer = OriginMetadataIndexer(config=swh_indexer_config)
+ origin = "https://npm.example.org/yarn-parser"
+ indexer.run([origin])
+
+ tool = swh_indexer_config["tools"]
+
+ dir_id = DIRECTORY2.id
+ dir_metadata = DirectoryIntrinsicMetadataRow(
+ id=dir_id,
+ tool=tool,
+ metadata=YARN_PARSER_METADATA,
+ mappings=["npm"],
+ )
+ origin_metadata = OriginIntrinsicMetadataRow(
+ id=origin,
+ tool=tool,
+ from_directory=dir_id,
+ metadata=YARN_PARSER_METADATA,
+ mappings=["npm"],
+ )
+
+ dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ for dir_result in dir_results:
+ assert dir_result.tool
+ del dir_result.tool["id"]
+ assert dir_results == [dir_metadata]
+
+ orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
+ for orig_result in orig_results:
+ assert orig_result.tool
+ del orig_result.tool["id"]
+ assert orig_results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_revision(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
@@ -41,9 +81,9 @@
tool = swh_indexer_config["tools"]
- rev_id = REVISION.id
- rev_metadata = RevisionIntrinsicMetadataRow(
- id=rev_id,
+ dir_id = DIRECTORY2.id
+ dir_metadata = DirectoryIntrinsicMetadataRow(
+ id=dir_id,
tool=tool,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -51,16 +91,16 @@
origin_metadata = OriginIntrinsicMetadataRow(
id=origin,
tool=tool,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
- rev_results = list(idx_storage.revision_intrinsic_metadata_get([rev_id]))
- for rev_result in rev_results:
- assert rev_result.tool
- del rev_result.tool["id"]
- assert rev_results == [rev_metadata]
+ dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ for dir_result in dir_results:
+ assert dir_result.tool
+ del dir_result.tool["id"]
+ assert dir_results == [dir_metadata]
orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
for orig_result in orig_results:
@@ -82,10 +122,10 @@
indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert len(rev_results) == 1
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert len(dir_results) == 1
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert len(orig_results) == 1
@@ -121,15 +161,15 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == [
- RevisionIntrinsicMetadataRow(
- id=rev_id,
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == [
+ DirectoryIntrinsicMetadataRow(
+ id=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
- tool=rev_results[0].tool,
+ tool=dir_results[0].tool,
)
]
@@ -140,7 +180,7 @@
assert orig_results == [
OriginIntrinsicMetadataRow(
id=origin2,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=orig_results[0].tool,
@@ -148,7 +188,7 @@
]
-def test_origin_metadata_indexer_duplicate_revision(
+def test_origin_metadata_indexer_duplicate_directory(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
@@ -162,10 +202,10 @@
origin2 = "https://github.com/librariesio/yarn-parser.git"
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert len(rev_results) == 1
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert len(dir_results) == 1
orig_results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
@@ -185,10 +225,10 @@
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == []
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@@ -204,16 +244,16 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=(["npm"], {"@context": "foo"}),
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == []
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@@ -229,16 +269,16 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=None,
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == []
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@@ -252,5 +292,5 @@
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
- result = indexer.index_list(["https://unknown.org/foo"])
+ result = indexer.index_list([Origin("https://unknown.org/foo")])
assert not result
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -19,10 +19,12 @@
Content,
Directory,
DirectoryEntry,
+ ObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
+ Release,
Revision,
RevisionType,
Snapshot,
@@ -39,27 +41,26 @@
}
-ORIGINS = [
- Origin(url="https://github.com/SoftwareHeritage/swh-storage"),
- Origin(url="rsync://ftp.gnu.org/gnu/3dldf"),
- Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"),
- Origin(url="https://pypi.org/project/limnoria/"),
- Origin(url="http://0-512-md.googlecode.com/svn/"),
- Origin(url="https://github.com/librariesio/yarn-parser"),
- Origin(url="https://github.com/librariesio/yarn-parser.git"),
-]
-
-
ORIGIN_VISITS = [
- {"type": "git", "origin": ORIGINS[0].url},
- {"type": "ftp", "origin": ORIGINS[1].url},
- {"type": "deposit", "origin": ORIGINS[2].url},
- {"type": "pypi", "origin": ORIGINS[3].url},
- {"type": "svn", "origin": ORIGINS[4].url},
- {"type": "git", "origin": ORIGINS[5].url},
- {"type": "git", "origin": ORIGINS[6].url},
+ {"type": "git", "origin": "https://github.com/SoftwareHeritage/swh-storage"},
+ {"type": "ftp", "origin": "rsync://ftp.gnu.org/gnu/3dldf"},
+ {
+ "type": "deposit",
+ "origin": "https://forge.softwareheritage.org/source/jesuisgpl/",
+ },
+ {
+ "type": "pypi",
+ "origin": "https://old-pypi.example.org/project/limnoria/",
+ }, # with rev head
+ {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"}, # with rel head
+ {"type": "svn", "origin": "http://0-512-md.googlecode.com/svn/"},
+ {"type": "git", "origin": "https://github.com/librariesio/yarn-parser"},
+ {"type": "git", "origin": "https://github.com/librariesio/yarn-parser.git"},
+ {"type": "git", "origin": "https://npm.example.org/yarn-parser"},
]
+ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS]
+
DIRECTORY = Directory(
id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
@@ -97,6 +98,8 @@
),
)
+_utc_plus_2 = datetime.timezone(datetime.timedelta(minutes=120))
+
REVISION = Revision(
id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"),
message=b"Improve search functionality",
@@ -111,28 +114,12 @@
email=b"andrewnez@gmail.com",
),
committer_date=TimestampWithTimezone.from_datetime(
- datetime.datetime(
- 2013,
- 10,
- 4,
- 12,
- 50,
- 49,
- tzinfo=datetime.timezone(datetime.timedelta(minutes=120)),
- )
+ datetime.datetime(2013, 10, 4, 12, 50, 49, tzinfo=_utc_plus_2)
),
type=RevisionType.GIT,
synthetic=False,
date=TimestampWithTimezone.from_datetime(
- datetime.datetime(
- 2017,
- 2,
- 20,
- 16,
- 14,
- 16,
- tzinfo=datetime.timezone(datetime.timedelta(minutes=120)),
- )
+ datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2)
),
directory=DIRECTORY2.id,
parents=(),
@@ -140,7 +127,26 @@
REVISIONS = [REVISION]
+RELEASE = Release(
+ name=b"v0.0.0",
+ message=None,
+ author=Person(
+ name=b"Andrew Nesbitt",
+ fullname=b"Andrew Nesbitt <andrewnez@gmail.com>",
+ email=b"andrewnez@gmail.com",
+ ),
+ synthetic=False,
+ date=TimestampWithTimezone.from_datetime(
+ datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2)
+ ),
+ target_type=ObjectType.DIRECTORY,
+ target=DIRECTORY2.id,
+)
+
+RELEASES = [RELEASE]
+
SNAPSHOTS = [
+ # https://github.com/SoftwareHeritage/swh-storage
Snapshot(
id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"),
branches={
@@ -161,6 +167,7 @@
),
},
),
+ # rsync://ftp.gnu.org/gnu/3dldf
Snapshot(
id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"),
branches={
@@ -186,6 +193,7 @@
),
},
),
+ # https://forge.softwareheritage.org/source/jesuisgpl/
Snapshot(
id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"),
branches={
@@ -195,6 +203,7 @@
)
},
),
+ # https://old-pypi.example.org/project/limnoria/
Snapshot(
id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"),
branches={
@@ -211,6 +220,23 @@
),
},
),
+ # https://pypi.org/project/limnoria/
+ Snapshot(
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=b"releases/2018.09.09", target_type=TargetType.ALIAS
+ ),
+ b"releases/2018.09.01": SnapshotBranch(
+ target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf",
+ target_type=TargetType.RELEASE,
+ ),
+ b"releases/2018.09.09": SnapshotBranch(
+ target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa
+ target_type=TargetType.RELEASE,
+ ),
+ },
+ ),
+ # http://0-512-md.googlecode.com/svn/
Snapshot(
id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"),
branches={
@@ -220,6 +246,7 @@
)
},
),
+ # https://github.com/librariesio/yarn-parser
Snapshot(
id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
branches={
@@ -229,6 +256,7 @@
)
},
),
+ # https://github.com/librariesio/yarn-parser.git
Snapshot(
id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
branches={
@@ -238,8 +266,19 @@
)
},
),
+ # https://npm.example.org/yarn-parser
+ Snapshot(
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=RELEASE.id,
+ target_type=TargetType.RELEASE,
+ )
+ },
+ ),
]
+assert len(SNAPSHOTS) == len(ORIGIN_VISITS)
+
SHA1_TO_LICENSES = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"],
@@ -582,6 +621,7 @@
storage.origin_add(ORIGINS)
storage.directory_add([DIRECTORY, DIRECTORY2])
storage.revision_add(REVISIONS)
+ storage.release_add(RELEASES)
storage.snapshot_add(SNAPSHOTS)
for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
