D8002.id28836.diff
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
-swh.core[db,http] >= 0.14.0
+swh.core[db,http] >= 2.9
swh.model >= 0.0.15
swh.objstorage >= 0.2.2
swh.scheduler >= 0.5.2
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Iterator
+from typing import Callable, Dict, Iterator, List, Optional
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
@@ -213,6 +213,12 @@
@indexer_cli_group.command("journal-client")
+@click.argument(
+ "indexer",
+ type=click.Choice(["origin-intrinsic-metadata", "*"]),
+ required=False
+ # TODO: remove required=False after we stop using it
+)
@click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API")
@click.option(
"--origin-metadata-task-type",
@@ -236,18 +242,27 @@
@click.pass_context
def journal_client(
ctx,
- scheduler_url,
- origin_metadata_task_type,
- brokers,
- prefix,
- group_id,
- stop_after_objects,
+ indexer: Optional[str],
+ scheduler_url: str,
+ origin_metadata_task_type: str,
+ brokers: List[str],
+ prefix: str,
+ group_id: str,
+ stop_after_objects: Optional[int],
):
- """Listens for new objects from the SWH Journal, and schedules tasks
- to run relevant indexers (currently, only origin-intrinsic-metadata)
- on these new objects."""
+ """
+ Listens for new objects from the SWH Journal, and either:
+
+ * runs the indexer with the name passed as argument, if any
+ * schedules tasks to run relevant indexers (currently, only
+ origin-intrinsic-metadata) on these new objects otherwise.
+
+ Passing '*' as indexer name runs all indexers.
+ """
import functools
+ import warnings
+ from swh.indexer.indexer import ObjectsDict
from swh.indexer.journal_client import process_journal_objects
from swh.journal.client import get_journal_client
from swh.scheduler import get_scheduler
@@ -268,22 +283,50 @@
)
stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects")
+ object_types = set()
+ worker_fns: List[Callable[[ObjectsDict], Dict]] = []
+
+ if indexer is None:
+ warnings.warn(
+ "'swh indexer journal-client' with no argument creates scheduler tasks "
+ "to index, rather than index directly.",
+ DeprecationWarning,
+ )
+ object_types.add("origin_visit_status")
+ worker_fns.append(
+ functools.partial(
+ process_journal_objects,
+ scheduler=scheduler,
+ task_names={
+ "origin_metadata": origin_metadata_task_type,
+ },
+ )
+ )
+
+ if indexer in ("origin-intrinsic-metadata", "*"):
+ from swh.indexer.metadata import OriginMetadataIndexer
+
+ object_types.add("origin_visit_status")
+ idx = OriginMetadataIndexer()
+ idx.catch_exceptions = False # don't commit offsets if indexation failed
+ worker_fns.append(idx.process_journal_objects)
+
+ if not worker_fns:
+ raise click.ClickException(f"Unknown indexer: {indexer}")
+
client = get_journal_client(
cls="kafka",
brokers=brokers,
prefix=prefix,
group_id=group_id,
- object_types=["origin_visit_status"],
+ object_types=list(object_types),
stop_after_objects=stop_after_objects,
)
- worker_fn = functools.partial(
- process_journal_objects,
- scheduler=scheduler,
- task_names={
- "origin_metadata": origin_metadata_task_type,
- },
- )
+ def worker_fn(objects: ObjectsDict):
+ for fn in worker_fns:
+ fn(objects)
+
try:
client.process(worker_fn)
except KeyboardInterrupt:
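
The journal-client command above now collects one worker function per selected indexer and wraps them in a single callback, so a batch of journal objects is only acknowledged once every selected indexer has processed it. A minimal sketch of that composition (the helper name and the simplified ObjectsDict alias are illustrative, not part of this diff):

    from typing import Callable, Dict, List

    # simplified stand-in for swh.indexer.indexer.ObjectsDict
    ObjectsDict = Dict[str, List[Dict]]

    def compose_worker_fns(
        worker_fns: List[Callable[[ObjectsDict], Dict]]
    ) -> Callable[[ObjectsDict], None]:
        """Build a single callback suitable for JournalClient.process()."""
        def worker_fn(objects: ObjectsDict) -> None:
            # every indexer sees the same batch; offsets are only committed
            # after all of them return without raising
            for fn in worker_fns:
                fn(objects)
        return worker_fn

With the new positional argument, 'swh indexer journal-client origin-intrinsic-metadata' indexes directly, while the bare invocation keeps the deprecated behaviour of scheduling indexing tasks.
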
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2021 The Software Heritage developers
+# Copyright (C) 2016-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -18,19 +18,21 @@
List,
Optional,
Set,
+ Tuple,
TypeVar,
Union,
)
import warnings
import sentry_sdk
+from typing_extensions import TypedDict
from swh.core import utils
from swh.core.config import load_from_envvar, merge_configs
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.model import hashutil
-from swh.model.model import Revision, Sha1Git
+from swh.model.model import Directory, Origin, Sha1Git
from swh.objstorage.exc import ObjNotFoundError
from swh.objstorage.factory import get_objstorage
from swh.scheduler import CONFIG as SWH_CONFIG
@@ -38,6 +40,12 @@
from swh.storage.interface import StorageInterface
+class ObjectsDict(TypedDict, total=False):
+ directory: List[Dict]
+ origin: List[Dict]
+ origin_visit_status: List[Dict]
+
+
@contextmanager
def write_to_temp(filename: str, data: bytes, working_directory: str) -> Iterator[str]:
"""Write the sha1's content in a temporary file.
@@ -102,7 +110,7 @@
content, sha1_git for revision, directory, release, and id for origin
To implement a new concrete indexer, inherit from the object level
- classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
+ classes: :class:`ContentIndexer`, :class:`DirectoryIndexer`,
:class:`OriginIndexer`.
Then you need to implement the following functions:
@@ -526,9 +534,29 @@
DeprecationWarning,
)
del kwargs["policy_update"]
+
+ origins = [{"url": url} for url in origin_urls]
+
+ return self.process_journal_objects({"origin": origins})
+
+ def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+ """Worker function for ``JournalClient``. Expects ``objects`` to have a single
+ key, either ``origin`` or ``"origin_visit_status"``."""
+ origins = [
+ Origin(url=status["origin"])
+ for status in objects.get("origin_visit_status", [])
+ if status["status"] == "full"
+ ] + [Origin(url=origin["url"]) for origin in objects.get("origin", [])]
+
summary: Dict[str, Any] = {"status": "uneventful"}
try:
- results = self.index_list(origin_urls, **kwargs)
+ results = self.index_list(
+ origins,
+ check_origin_known=False,
+ # no need to check they exist, as we just received either an origin or
+ # visit status; which cannot be created by swh-storage unless the origin
+ # already exists
+ )
except Exception:
if not self.catch_exceptions:
raise
@@ -544,23 +572,23 @@
summary.update(summary_persist)
return summary
- def index_list(self, origin_urls: List[str], **kwargs) -> List[TResult]:
+ def index_list(self, origins: List[Origin], **kwargs) -> List[TResult]:
results = []
- for origin_url in origin_urls:
+ for origin in origins:
try:
- results.extend(self.index(origin_url, **kwargs))
+ results.extend(self.index(origin.url, **kwargs))
except Exception:
- self.log.exception("Problem when processing origin %s", origin_url)
+ self.log.exception("Problem when processing origin %s", origin.url)
sentry_sdk.capture_exception()
raise
return results
-class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]):
+class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult]):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements Revision indexing using the run method
+ implements Directory indexing using the run method
- Note: the :class:`RevisionIndexer` is not an instantiable object.
+ Note: the :class:`DirectoryIndexer` is not an instantiable object.
To use it in another context one should inherit from this class
and override the methods mentioned in the :class:`BaseIndexer`
class.
@@ -570,7 +598,7 @@
def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
"""Given a list of sha1_gits:
- - retrieve revisions from storage
+ - retrieve directories from storage
- execute the indexing computations
- store the results
@@ -584,28 +612,40 @@
DeprecationWarning,
)
del kwargs["policy_update"]
- summary: Dict[str, Any] = {"status": "uneventful"}
- results = []
- revision_ids = [
+ directory_ids = [
hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
]
- for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)):
- if not rev:
- # TODO: call self.index() with rev=None?
- self.log.warning(
- "Revision %s not found in storage", hashutil.hash_to_hex(rev_id)
- )
- continue
+
+ return self._process_directories([(dir_id, None) for dir_id in directory_ids])
+
+ def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+ """Worker function for ``JournalClient``. Expects ``objects`` to have a single
+ key, ``"directory"``."""
+ assert set(objects) == {"directory"}
+ return self._process_directories(
+ [(dir_["id"], Directory.from_dict(dir_)) for dir_ in objects["directory"]]
+ )
+
+ def _process_directories(
+ self,
+ directories: Union[List[Tuple[Sha1Git, Directory]], List[Tuple[Sha1Git, None]]],
+ ) -> Dict:
+
+ summary: Dict[str, Any] = {"status": "uneventful"}
+ results = []
+
+ # TODO: fetch raw_manifest when useful?
+
+ for (dir_id, dir_) in directories:
try:
- results.extend(self.index(rev_id, rev))
+ results.extend(self.index(dir_id, dir_))
except Exception:
if not self.catch_exceptions:
raise
- self.log.exception("Problem when processing revision")
+ self.log.exception("Problem when processing directory")
sentry_sdk.capture_exception()
summary["status"] = "failed"
- return summary
summary_persist = self.persist_index_computations(results)
if summary_persist:
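
The new ObjectsDict TypedDict describes the journal batches that the process_journal_objects() methods accept. Roughly, the payloads look like this (URLs and ids are made-up examples, not taken from the diff):

    # accepted by OriginIndexer.process_journal_objects(): either key works
    origin_batch = {"origin": [{"url": "https://example.org/user/project"}]}
    visit_status_batch = {
        "origin_visit_status": [
            # only statuses marked "full" are turned into origins to index
            {"origin": "https://example.org/user/project", "status": "full"},
        ]
    }

    # accepted by DirectoryIndexer.process_journal_objects(), which asserts
    # that "directory" is the only key; each dict is in Directory.to_dict() form
    directory_batch = {"directory": [{"id": b"\x00" * 20, "entries": []}]}
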
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -21,20 +21,24 @@
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer
+from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
-from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.origin_head import get_head_swhid
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
from swh.indexer.storage.model import (
ContentMetadataRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model import hashutil
-from swh.model.model import Revision, Sha1Git
+from swh.model.model import Directory
+from swh.model.model import ObjectType as ModelObjectType
+from swh.model.model import Origin, Sha1Git
+from swh.model.swhids import CoreSWHID, ObjectType
REVISION_GET_BATCH_SIZE = 10
+RELEASE_GET_BATCH_SIZE = 10
ORIGIN_GET_BATCH_SIZE = 10
@@ -82,7 +86,7 @@
self,
id: Sha1,
data: Optional[bytes] = None,
- log_suffix="unknown revision",
+ log_suffix="unknown directory",
**kwargs,
) -> List[ContentMetadataRow]:
"""Index sha1s' content and store result.
@@ -144,18 +148,18 @@
}
-class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]):
- """Revision-level indexer
+class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
+ """Directory-level indexer
This indexer is in charge of:
- - filtering revisions already indexed in revision_intrinsic_metadata table
+ - filtering directories already indexed in directory_intrinsic_metadata table
with defined computation tool
- - retrieve all entry_files in root directory
+ - retrieve all entry_files in directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- - store the results for revision
+ - store the results for directory
"""
@@ -165,7 +169,7 @@
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones."""
- yield from self.idx_storage.revision_intrinsic_metadata_missing(
+ yield from self.idx_storage.directory_intrinsic_metadata_missing(
(
{
"id": sha1_git,
@@ -176,51 +180,52 @@
)
def index(
- self, id: Sha1Git, data: Optional[Revision], **kwargs
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Index rev by processing it and organizing result.
+ self, id: Sha1Git, data: Optional[Directory] = None, **kwargs
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Index directory by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- - if multiple file detected -> translation needed at revision level
+ - if multiple file detected -> translation needed at directory level
Args:
- id: sha1_git of the revision
- data: revision model object from storage
+ id: sha1_git of the directory
+ data: directory model object from storage
Returns:
- dict: dictionary representing a revision_intrinsic_metadata, with
+ dict: dictionary representing a directory_intrinsic_metadata, with
keys:
- - id (str): rev's identifier (sha1_git)
+ - id: directory's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata
"""
- rev = data
- assert isinstance(rev, Revision)
+ if data is None:
+ dir_ = list(self.storage.directory_ls(id, recursive=False))
+ else:
+ assert isinstance(data, Directory)
+ dir_ = data.to_dict()
try:
- root_dir = rev.directory
- dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
- if [entry["type"] for entry in dir_ls] == ["dir"]:
+ if [entry["type"] for entry in dir_] == ["dir"]:
# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs
- subdir = dir_ls[0]["target"]
- dir_ls = list(self.storage.directory_ls(subdir, recursive=False))
- files = [entry for entry in dir_ls if entry["type"] == "file"]
+ subdir = dir_[0]["target"]
+ dir_ = list(self.storage.directory_ls(subdir, recursive=False))
+ files = [entry for entry in dir_ if entry["type"] == "file"]
detected_files = detect_metadata(files)
- (mappings, metadata) = self.translate_revision_intrinsic_metadata(
+ (mappings, metadata) = self.translate_directory_intrinsic_metadata(
detected_files,
- log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id),
+ log_suffix="directory=%s" % hashutil.hash_to_hex(id),
)
except Exception as e:
- self.log.exception("Problem when indexing rev: %r", e)
+ self.log.exception("Problem when indexing dir: %r", e)
sentry_sdk.capture_exception()
return [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=id,
indexer_configuration_id=self.tool["id"],
mappings=mappings,
metadata=metadata,
@@ -228,7 +233,7 @@
]
def persist_index_computations(
- self, results: List[RevisionIntrinsicMetadataRow]
+ self, results: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -241,10 +246,10 @@
"""
# TODO: add functions in storage to keep data in
- # revision_intrinsic_metadata
- return self.idx_storage.revision_intrinsic_metadata_add(results)
+ # directory_intrinsic_metadata
+ return self.idx_storage.directory_intrinsic_metadata_add(results)
- def translate_revision_intrinsic_metadata(
+ def translate_directory_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
) -> Tuple[List[Any], Any]:
"""
@@ -315,81 +320,129 @@
class OriginMetadataIndexer(
- OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]
+ OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]
):
USE_TOOLS = False
def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
- self.origin_head_indexer = OriginHeadIndexer(config=config)
- self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
+ self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config)
def index_list(
- self, origin_urls: List[str], **kwargs
- ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]:
+ self, origins: List[Origin], check_origin_known: bool = True, **kwargs
+ ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]:
head_rev_ids = []
- origins_with_head = []
- origins = list(
- call_with_batches(
- self.storage.origin_get,
- origin_urls,
- ORIGIN_GET_BATCH_SIZE,
+ head_rel_ids = []
+ origin_heads: Dict[Origin, CoreSWHID] = {}
+
+ # Filter out origins not in the storage
+ if check_origin_known:
+ known_origins = list(
+ call_with_batches(
+ self.storage.origin_get,
+ [origin.url for origin in origins],
+ ORIGIN_GET_BATCH_SIZE,
+ )
)
- )
- for origin in origins:
+ else:
+ known_origins = list(origins)
+
+ for origin in known_origins:
if origin is None:
continue
- head_results = self.origin_head_indexer.index(origin.url)
- if head_results:
- (head_result,) = head_results
- origins_with_head.append(origin)
- head_rev_ids.append(head_result["revision_id"])
-
- head_revs = list(
- call_with_batches(
- self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
+ head_swhid = get_head_swhid(self.storage, origin.url)
+ if head_swhid:
+ origin_heads[origin] = head_swhid
+ if head_swhid.object_type == ObjectType.REVISION:
+ head_rev_ids.append(head_swhid.object_id)
+ elif head_swhid.object_type == ObjectType.RELEASE:
+ head_rel_ids.append(head_swhid.object_id)
+ else:
+ assert False, head_swhid
+
+ head_revs = dict(
+ zip(
+ head_rev_ids,
+ call_with_batches(
+ self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
+ ),
+ )
+ )
+ head_rels = dict(
+ zip(
+ head_rel_ids,
+ call_with_batches(
+ self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE
+ ),
)
)
- assert len(head_revs) == len(head_rev_ids)
results = []
- for (origin, rev) in zip(origins_with_head, head_revs):
- if not rev:
- self.log.warning("Missing head revision of origin %r", origin.url)
- continue
-
- for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev):
- # There is at most one rev_metadata
+ for (origin, head_swhid) in origin_heads.items():
+ if head_swhid.object_type == ObjectType.REVISION:
+ rev = head_revs[head_swhid.object_id]
+ if not rev:
+ self.log.warning(
+ "Missing head object %s of origin %r", head_swhid, origin.url
+ )
+ continue
+ directory_id = rev.directory
+ elif head_swhid.object_type == ObjectType.RELEASE:
+ rel = head_rels[head_swhid.object_id]
+ if not rel:
+ self.log.warning(
+ "Missing head object %s of origin %r", head_swhid, origin.url
+ )
+ continue
+ if rel.target_type != ModelObjectType.DIRECTORY:
+ # TODO
+ self.log.warning(
+ "Head release %s of %r has unexpected target type %s",
+ head_swhid,
+ origin.url,
+ rel.target_type,
+ )
+ continue
+ assert rel.target, rel
+ directory_id = rel.target
+ else:
+ assert False, head_swhid
+
+ for dir_metadata in self.directory_metadata_indexer.index(directory_id):
+ # There is at most one dir_metadata
orig_metadata = OriginIntrinsicMetadataRow(
- from_revision=rev_metadata.id,
+ from_directory=dir_metadata.id,
id=origin.url,
- metadata=rev_metadata.metadata,
- mappings=rev_metadata.mappings,
- indexer_configuration_id=rev_metadata.indexer_configuration_id,
+ metadata=dir_metadata.metadata,
+ mappings=dir_metadata.mappings,
+ indexer_configuration_id=dir_metadata.indexer_configuration_id,
)
- results.append((orig_metadata, rev_metadata))
+ results.append((orig_metadata, dir_metadata))
+
return results
def persist_index_computations(
self,
- results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
+ results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
) -> Dict[str, int]:
- # Deduplicate revisions
- rev_metadata: List[RevisionIntrinsicMetadataRow] = []
+ # Deduplicate directories
+ dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
summary: Dict = {}
- for (orig_item, rev_item) in results:
- assert rev_item.metadata == orig_item.metadata
- if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}):
+ for (orig_item, dir_item) in results:
+ assert dir_item.metadata == orig_item.metadata
+ if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
- if rev_item not in rev_metadata:
- rev_metadata.append(rev_item)
+ if dir_item not in dir_metadata:
+ dir_metadata.append(dir_item)
if orig_item not in orig_metadata:
orig_metadata.append(orig_item)
- if rev_metadata:
- summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
- summary.update(summary_rev)
+ if dir_metadata:
+ summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
+ dir_metadata
+ )
+ summary.update(summary_dir)
if orig_metadata:
summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
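
Because the head of an origin can now be either a revision or a release, index_list() reduces both cases to a directory id before calling the directory-level indexer. A condensed sketch of that reduction (illustrative only; batching and warning logs are omitted):

    from typing import Optional
    from swh.model.model import ObjectType as ModelObjectType
    from swh.model.swhids import CoreSWHID, ObjectType

    def directory_of_head(
        head_swhid: CoreSWHID, head_revs: dict, head_rels: dict
    ) -> Optional[bytes]:
        """head_revs / head_rels map object ids to Revision / Release objects."""
        if head_swhid.object_type == ObjectType.REVISION:
            rev = head_revs.get(head_swhid.object_id)
            return rev.directory if rev else None
        if head_swhid.object_type == ObjectType.RELEASE:
            rel = head_rels.get(head_swhid.object_id)
            # releases targeting anything but a directory are skipped for now
            if rel and rel.target_type == ModelObjectType.DIRECTORY:
                return rel.target
        return None
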
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -6,10 +6,12 @@
from .base import DictMapping, SingleFileMapping
-yaml.SafeLoader.yaml_implicit_resolvers = {
- k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
- for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
-}
+
+class SafeLoader(yaml.SafeLoader):
+ yaml_implicit_resolvers = {
+ k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
+ for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
+ }
class CffMapping(DictMapping, SingleFileMapping):
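
The previous code stripped the timestamp resolver from yaml.SafeLoader in place, which silently changed YAML parsing for every other PyYAML user in the same process; the subclass keeps the change local to this mapping. A quick illustrative check of the difference (assumes PyYAML is installed):

    import yaml

    class SafeLoader(yaml.SafeLoader):
        yaml_implicit_resolvers = {
            k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
            for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
        }

    doc = "date-released: 2021-11-18"
    print(yaml.load(doc, Loader=yaml.SafeLoader))  # {'date-released': datetime.date(2021, 11, 18)}
    print(yaml.load(doc, Loader=SafeLoader))       # {'date-released': '2021-11-18'}
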
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -133,6 +133,73 @@
author[SCHEMA_URI + "url"] = {"@id": url}
return {"@list": [author]}
+ def normalize_description(self, description):
+ r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
+ mistake that causes issues in the database because of null bytes in JSON.
+
+ >>> NpmMapping().normalize_description("foo bar")
+ 'foo bar'
+ >>> NpmMapping().normalize_description(
+ ... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
+ ... )
+ 'foo bar'
+ >>> NpmMapping().normalize_description(
+ ... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
+ ... )
+ 'foo bar'
+ >>> NpmMapping().normalize_description(
+ ... # invalid UTF-16 and meaningless UTF-8:
+ ... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
+ ... ) is None
+ True
+ >>> NpmMapping().normalize_description(
+ ... # ditto (ut looks like little-endian at first)
+ ... "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
+ ... ) is None
+ True
+ >>> NpmMapping().normalize_description(None) is None
+ True
+ """
+ if description is None:
+ return None
+ # XXX: if this function ever need to support more cases, consider
+ # switching to https://pypi.org/project/ftfy/ instead of adding more hacks
+ if description.startswith("\ufffd\ufffd") and "\x00" in description:
+ # 2 unicode replacement characters followed by '# ' encoded as UTF-16
+ # is a common mistake, which indicates a README.md was saved as UTF-16,
+ # and some NPM tool opened it as UTF-8 and used the first line as
+ # description.
+
+ description_bytes = description.encode()
+
+            # Strip the two unicode replacement characters
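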
+ assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd")
+ description_bytes = description_bytes[6:]
+
+ # If the following attempts fail to recover the description, discard it
+ # entirely because the current indexer storage backend (postgresql) cannot
+ # store zero bytes in JSON columns.
+ description = None
+
+ if not description_bytes.startswith(b"\x00"):
+ # try UTF-16 little-endian (the most common) first
+ try:
+ description = description_bytes.decode("utf-16le")
+ except UnicodeDecodeError:
+ pass
+ if description is None:
+ # if it fails, try UTF-16 big-endian
+ try:
+ description = description_bytes.decode("utf-16be")
+ except UnicodeDecodeError:
+ pass
+
+ if description:
+ if description.startswith("# "):
+ description = description[2:]
+ return description.rstrip()
+ return description
+
def normalize_license(self, s):
"""https://docs.npmjs.com/files/package.json#license
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -1,159 +1,120 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import logging
import re
-from typing import Any, Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
-import click
-
-from swh.indexer.indexer import OriginIndexer
from swh.model.model import SnapshotBranch, TargetType
+from swh.model.swhids import CoreSWHID, ObjectType
from swh.storage.algos.origin import origin_get_latest_visit_status
from swh.storage.algos.snapshot import snapshot_get_all_branches
-class OriginHeadIndexer(OriginIndexer[Dict]):
- """Origin-level indexer.
-
- This indexer is in charge of looking up the revision that acts as the
- "head" of an origin.
-
- In git, this is usually the commit pointed to by the 'master' branch."""
-
- USE_TOOLS = False
-
- def persist_index_computations(self, results: Any) -> Dict[str, int]:
- """Do nothing. The indexer's results are not persistent, they
- should only be piped to another indexer."""
- return {}
-
- # Dispatch
-
- def index(self, id: str, data: None = None, **kwargs) -> List[Dict]:
- origin_url = id
- visit_status = origin_get_latest_visit_status(
- self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True
- )
- if not visit_status:
- return []
- assert visit_status.snapshot is not None
- snapshot = snapshot_get_all_branches(self.storage, visit_status.snapshot)
- if snapshot is None:
- return []
- method = getattr(
- self, "_try_get_%s_head" % visit_status.type, self._try_get_head_generic
- )
-
- rev_id = method(snapshot.branches) # type: ignore
- if rev_id is not None:
- return [
- {
- "origin_url": origin_url,
- "revision_id": rev_id,
- }
- ]
-
- # could not find a head revision
- return []
-
- # Tarballs
-
- _archive_filename_re = re.compile(
- rb"^"
- rb"(?P<pkgname>.*)[-_]"
- rb"(?P<version>[0-9]+(\.[0-9])*)"
- rb"(?P<preversion>[-+][a-zA-Z0-9.~]+?)?"
- rb"(?P<extension>(\.[a-zA-Z0-9]+)+)"
- rb"$"
+def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]:
+ """Returns the SWHID of the head revision or release of an origin"""
+ visit_status = origin_get_latest_visit_status(
+ storage, origin_url, allowed_statuses=["full"], require_snapshot=True
)
+ if not visit_status:
+ return None
+ assert visit_status.snapshot is not None
+ snapshot = snapshot_get_all_branches(storage, visit_status.snapshot)
+ if snapshot is None:
+ return None
+
+ if visit_status.type == "ftp":
+ return _try_get_ftp_head(dict(snapshot.branches))
+ else:
+ return _try_get_head_generic(dict(snapshot.branches))
+
+
+_archive_filename_re = re.compile(
+ rb"^"
+ rb"(?P<pkgname>.*)[-_]"
+ rb"(?P<version>[0-9]+(\.[0-9])*)"
+ rb"(?P<preversion>[-+][a-zA-Z0-9.~]+?)?"
+ rb"(?P<extension>(\.[a-zA-Z0-9]+)+)"
+ rb"$"
+)
- @classmethod
- def _parse_version(cls: Any, filename: bytes) -> Tuple[Union[float, int], ...]:
- """Extracts the release version from an archive filename,
- to get an ordering whose maximum is likely to be the last
- version of the software
-
- >>> OriginHeadIndexer._parse_version(b'foo')
- (-inf,)
- >>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
- (-inf,)
- >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
- (0, 0, 1, 0)
- >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
- (0, 0, 1, -1, 'beta2')
- >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
- (0, 0, 1, 1, 'foobar')
- """
- res = cls._archive_filename_re.match(filename)
- if res is None:
- return (float("-infinity"),)
- version = [int(n) for n in res.group("version").decode().split(".")]
- if res.group("preversion") is None:
- version.append(0)
+
+def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]:
+ """Extracts the release version from an archive filename,
+ to get an ordering whose maximum is likely to be the last
+ version of the software
+
+ >>> _parse_version(b'foo')
+ (-inf,)
+ >>> _parse_version(b'foo.tar.gz')
+ (-inf,)
+ >>> _parse_version(b'gnu-hello-0.0.1.tar.gz')
+ (0, 0, 1, 0)
+ >>> _parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
+ (0, 0, 1, -1, 'beta2')
+ >>> _parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
+ (0, 0, 1, 1, 'foobar')
+ """
+ res = _archive_filename_re.match(filename)
+ if res is None:
+ return (float("-infinity"),)
+ version: List[Union[float, int, str]] = [
+ int(n) for n in res.group("version").decode().split(".")
+ ]
+ if res.group("preversion") is None:
+ version.append(0)
+ else:
+ preversion = res.group("preversion").decode()
+ if preversion.startswith("-"):
+ version.append(-1)
+ version.append(preversion[1:])
+ elif preversion.startswith("+"):
+ version.append(1)
+ version.append(preversion[1:])
else:
- preversion = res.group("preversion").decode()
- if preversion.startswith("-"):
- version.append(-1)
- version.append(preversion[1:])
- elif preversion.startswith("+"):
- version.append(1)
- version.append(preversion[1:])
- else:
- assert False, res.group("preversion")
- return tuple(version)
-
- def _try_get_ftp_head(self, branches: Dict[bytes, SnapshotBranch]) -> Any:
- archive_names = list(branches)
- max_archive_name = max(archive_names, key=self._parse_version)
- r = self._try_resolve_target(branches, max_archive_name)
- return r
-
- # Generic
-
- def _try_get_head_generic(self, branches: Dict[bytes, SnapshotBranch]) -> Any:
- # Works on 'deposit', 'pypi', and VCSs.
- return self._try_resolve_target(branches, b"HEAD") or self._try_resolve_target(
- branches, b"master"
- )
-
- def _try_resolve_target(
- self, branches: Dict[bytes, SnapshotBranch], branch_name: bytes
- ) -> Any:
- try:
- branch = branches[branch_name]
- if branch is None:
- return None
- while branch.target_type == TargetType.ALIAS:
- branch = branches[branch.target]
- if branch is None:
- return None
-
- if branch.target_type == TargetType.REVISION:
- return branch.target
- elif branch.target_type == TargetType.CONTENT:
- return None # TODO
- elif branch.target_type == TargetType.DIRECTORY:
- return None # TODO
- elif branch.target_type == TargetType.RELEASE:
- return None # TODO
- else:
- assert False, branch
- except KeyError:
- return None
+ assert False, res.group("preversion")
+ return tuple(version)
-@click.command()
-@click.option(
- "--origins", "-i", help='Origins to lookup, in the "type+url" format', multiple=True
-)
-def main(origins: List[str]) -> None:
- rev_metadata_indexer = OriginHeadIndexer()
- rev_metadata_indexer.run(origins)
+def _try_get_ftp_head(
+ branches: Dict[bytes, Optional[SnapshotBranch]]
+) -> Optional[CoreSWHID]:
+ archive_names = list(branches)
+ max_archive_name = max(archive_names, key=_parse_version)
+ return _try_resolve_target(branches, max_archive_name)
-if __name__ == "__main__":
- logging.basicConfig(level=logging.INFO)
- main()
+def _try_get_head_generic(
+ branches: Dict[bytes, Optional[SnapshotBranch]]
+) -> Optional[CoreSWHID]:
+ # Works on 'deposit', 'pypi', and VCSs.
+ return _try_resolve_target(branches, b"HEAD") or _try_resolve_target(
+ branches, b"master"
+ )
+
+
+def _try_resolve_target(
+ branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes
+) -> Optional[CoreSWHID]:
+ try:
+ branch = branches[branch_name]
+ if branch is None:
+ return None
+ while branch.target_type == TargetType.ALIAS:
+ branch = branches[branch.target]
+ if branch is None:
+ return None
+
+ if branch.target_type == TargetType.REVISION:
+ return CoreSWHID(object_type=ObjectType.REVISION, object_id=branch.target)
+ elif branch.target_type == TargetType.CONTENT:
+ return None # TODO
+ elif branch.target_type == TargetType.DIRECTORY:
+ return None # TODO
+ elif branch.target_type == TargetType.RELEASE:
+ return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target)
+ else:
+ assert False, branch
+ except KeyError:
+ return None
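
The head lookup is now a plain module-level helper rather than a pseudo-indexer, so it can be called with just a storage object and an origin URL. Hypothetical usage sketch (the storage URL and origin are assumptions, not taken from the diff):

    from swh.indexer.origin_head import get_head_swhid
    from swh.storage import get_storage

    storage = get_storage(cls="remote", url="http://localhost:5002/")
    swhid = get_head_swhid(storage, "https://github.com/SoftwareHeritage/swh-indexer")
    if swhid is not None:
        # a CoreSWHID pointing at either a revision or a release, depending on
        # what the latest full visit's snapshot resolves to
        print(swhid)  # e.g. swh:1:rev:<40 hex digits>
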
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -99,34 +99,34 @@
comment on column content_metadata.metadata is 'result of translation with defined format';
comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
--- The table revision_intrinsic_metadata provides a minimal set of intrinsic
+-- The table directory_intrinsic_metadata provides a minimal set of intrinsic
-- metadata detected with the detection tool (indexer_configuration_id) and
-- aggregated from the content_metadata translation.
-create table revision_intrinsic_metadata(
+create table directory_intrinsic_metadata(
id sha1_git not null,
metadata jsonb not null,
indexer_configuration_id bigint not null,
mappings text array not null
);
-comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision';
-comment on column revision_intrinsic_metadata.id is 'sha1_git of revision';
-comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
-comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
-comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory';
+comment on column directory_intrinsic_metadata.id is 'sha1_git of directory';
+comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
+comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
+comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
create table origin_intrinsic_metadata(
id text not null, -- origin url
metadata jsonb,
indexer_configuration_id bigint not null,
- from_revision sha1_git not null,
+ from_directory sha1_git not null,
metadata_tsvector tsvector,
mappings text array not null
);
comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
comment on column origin_intrinsic_metadata.id is 'url of the origin';
-comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision';
+comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory';
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
-comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';
+comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -273,25 +273,25 @@
-- end content_metadata functions
--- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
+-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata,
-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
--- swh_revision_intrinsic_metadata_missing must take place before calling this
+-- swh_directory_intrinsic_metadata_missing must take place before calling this
-- function.
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
--- tmp_revision_intrinsic_metadata, 2. call this function
-create or replace function swh_revision_intrinsic_metadata_add()
+-- tmp_directory_intrinsic_metadata, 2. call this function
+create or replace function swh_directory_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
+ from tmp_directory_intrinsic_metadata tcm
on conflict(id, indexer_configuration_id)
do update set
metadata = excluded.metadata,
@@ -302,19 +302,19 @@
end
$$;
-comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
+comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata';
--- create a temporary table for retrieving revision_intrinsic_metadata
-create or replace function swh_mktemp_revision_intrinsic_metadata()
+-- create a temporary table for retrieving directory_intrinsic_metadata
+create or replace function swh_mktemp_directory_intrinsic_metadata()
returns void
language sql
as $$
- create temporary table if not exists tmp_revision_intrinsic_metadata (
- like revision_intrinsic_metadata including defaults
+ create temporary table if not exists tmp_directory_intrinsic_metadata (
+ like directory_intrinsic_metadata including defaults
) on commit delete rows;
$$;
-comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata';
+comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata';
-- create a temporary table for retrieving origin_intrinsic_metadata
create or replace function swh_mktemp_origin_intrinsic_metadata()
@@ -380,8 +380,8 @@
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_directory,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -389,7 +389,7 @@
metadata = excluded.metadata,
metadata_tsvector = excluded.metadata_tsvector,
mappings = excluded.mappings,
- from_revision = excluded.from_revision;
+ from_directory = excluded.from_directory;
get diagnostics res = ROW_COUNT;
return res;
diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql
--- a/swh/indexer/sql/60-indexes.sql
+++ b/swh/indexer/sql/60-indexes.sql
@@ -25,12 +25,12 @@
alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
--- revision_intrinsic_metadata
-create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id);
-alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey;
+-- directory_intrinsic_metadata
+create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id);
+alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey;
-alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
-alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey;
+alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey;
-- content_mimetype
create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id);
diff --git a/swh/indexer/sql/upgrades/134.sql b/swh/indexer/sql/upgrades/134.sql
new file mode 100644
--- /dev/null
+++ b/swh/indexer/sql/upgrades/134.sql
@@ -0,0 +1,154 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 133
+-- to_version: 134
+-- description: replace revision_intrinsic_metadata with directory_intrinsic_metadata
+-- and origin_intrinsic_metadata.from_revision with origin_intrinsic_metadata.from_directory
+-- This migration works by dropping both tables and reindexing from scratch.
+
+insert into dbversion(version, release, description)
+ values(134, now(), 'Work In Progress');
+
+drop table origin_intrinsic_metadata;
+drop table revision_intrinsic_metadata;
+drop function swh_revision_intrinsic_metadata_add;
+drop function swh_mktemp_revision_intrinsic_metadata;
+
+
+create table directory_intrinsic_metadata(
+ id sha1_git not null,
+ metadata jsonb not null,
+ indexer_configuration_id bigint not null,
+ mappings text array not null
+);
+
+comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory';
+comment on column directory_intrinsic_metadata.id is 'sha1_git of directory';
+comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format';
+comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection';
+comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+
+create table origin_intrinsic_metadata(
+ id text not null, -- origin url
+ metadata jsonb,
+ indexer_configuration_id bigint not null,
+ from_directory sha1_git not null,
+ metadata_tsvector tsvector,
+ mappings text array not null
+);
+
+comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
+comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory';
+comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
+comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+
+-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_directory_intrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
+-- tmp_directory_intrinsic_metadata, 2. call this function
+create or replace function swh_directory_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_directory_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata';
+
+-- create a temporary table for retrieving directory_intrinsic_metadata
+create or replace function swh_mktemp_directory_intrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_directory_intrinsic_metadata (
+ like directory_intrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata';
+
+-- create a temporary table for retrieving origin_intrinsic_metadata
+create or replace function swh_mktemp_origin_intrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_origin_intrinsic_metadata (
+ like origin_intrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata';
+
+-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_intrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
+-- tmp_origin_intrinsic_metadata, 2. call this function
+create or replace function swh_origin_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_intrinsic_metadata_compute_tsvector();
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_directory,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_directory = excluded.from_directory;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
+
+
+
+-- directory_intrinsic_metadata
+create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id);
+alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey;
+
+alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey;
+
+-- origin_intrinsic_metadata
+create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id);
+alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey;
+
+alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -30,8 +30,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -120,7 +120,9 @@
class IndexerStorage:
- """SWH Indexer Storage"""
+ """SWH Indexer Storage Datastore"""
+
+ current_version = 134
def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None):
"""
@@ -152,10 +154,6 @@
if db is not self._db:
db.put_conn()
- @db_transaction()
- def get_current_version(self, *, db=None, cur=None):
- return db.current_version
-
@timed
@db_transaction()
def check_config(self, *, check_write, db=None, cur=None):
@@ -522,52 +520,52 @@
@timed
@db_transaction()
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [
obj[0]
- for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur)
+ for obj in db.directory_intrinsic_metadata_missing_from_list(metadata, cur)
]
@timed
@db_transaction()
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1], db=None, cur=None
- ) -> List[RevisionIntrinsicMetadataRow]:
+ ) -> List[DirectoryIntrinsicMetadataRow]:
return [
- RevisionIntrinsicMetadataRow.from_dict(
+ DirectoryIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
- dict(zip(db.revision_intrinsic_metadata_cols, c))
+ dict(zip(db.directory_intrinsic_metadata_cols, c))
)
)
- for c in db.revision_intrinsic_metadata_get_from_list(ids, cur)
+ for c in db.directory_intrinsic_metadata_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
- def revision_intrinsic_metadata_add(
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
db=None,
cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
- self.journal_writer.write_additions("revision_intrinsic_metadata", metadata)
+ self.journal_writer.write_additions("directory_intrinsic_metadata", metadata)
- db.mktemp_revision_intrinsic_metadata(cur)
+ db.mktemp_directory_intrinsic_metadata(cur)
db.copy_to(
[m.to_dict() for m in metadata],
- "tmp_revision_intrinsic_metadata",
+ "tmp_directory_intrinsic_metadata",
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
- count = db.revision_intrinsic_metadata_add_from_temp(cur)
+ count = db.directory_intrinsic_metadata_add_from_temp(cur)
return {
- "revision_intrinsic_metadata:add": count,
+ "directory_intrinsic_metadata:add": count,
}
@timed
@@ -602,7 +600,13 @@
db.copy_to(
[m.to_dict() for m in metadata],
"tmp_origin_intrinsic_metadata",
- ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
+ [
+ "id",
+ "metadata",
+ "indexer_configuration_id",
+ "from_directory",
+ "mappings",
+ ],
cur,
)
count = db.origin_intrinsic_metadata_add_from_temp(cur)
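
With get_current_version() removed from the RPC surface, the datastore schema version is now a plain class attribute on IndexerStorage, so callers (including the test fixtures further down) read it directly:

    from swh.indexer.storage import IndexerStorage

    # the schema version introduced by this diff (see upgrades/134.sql)
    assert IndexerStorage.current_version == 134
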
diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py
--- a/swh/indexer/storage/api/server.py
+++ b/swh/indexer/storage/api/server.py
@@ -86,10 +86,10 @@
if type == "local":
vcfg = cfg["indexer_storage"]
cls = vcfg.get("cls")
- if cls != "local":
+ if cls not in ("local", "postgresql"):
raise ValueError(
"The indexer_storage backend can only be started with a "
- "'local' configuration"
+ "'postgresql' configuration"
)
if not vcfg.get("db"):
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -16,7 +16,6 @@
"""Proxy to the SWH Indexer DB, with wrappers around stored procedures"""
content_mimetype_hash_keys = ["id", "indexer_configuration_id"]
- current_version = 133
def _missing_from_list(
self, table: str, data: Iterable[Dict], hash_keys: List[str], cur=None
@@ -350,18 +349,18 @@
"content_metadata", ids, self.content_metadata_cols, cur=cur
)
- revision_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
+ directory_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"]
- def revision_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
+ def directory_intrinsic_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata."""
yield from self._missing_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
metadata,
- self.revision_intrinsic_metadata_hash_keys,
+ self.directory_intrinsic_metadata_hash_keys,
cur=cur,
)
- revision_intrinsic_metadata_cols = [
+ directory_intrinsic_metadata_cols = [
"id",
"metadata",
"mappings",
@@ -371,27 +370,27 @@
"tool_configuration",
]
- @stored_procedure("swh_mktemp_revision_intrinsic_metadata")
- def mktemp_revision_intrinsic_metadata(self, cur=None):
+ @stored_procedure("swh_mktemp_directory_intrinsic_metadata")
+ def mktemp_directory_intrinsic_metadata(self, cur=None):
pass
- def revision_intrinsic_metadata_add_from_temp(self, cur=None):
+ def directory_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_revision_intrinsic_metadata_add()")
+ cur.execute("select * from swh_directory_intrinsic_metadata_add()")
return cur.fetchone()[0]
- def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
+ def directory_intrinsic_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
- "revision_intrinsic_metadata",
+ "directory_intrinsic_metadata",
ids,
- self.revision_intrinsic_metadata_cols,
+ self.directory_intrinsic_metadata_cols,
cur=cur,
)
origin_intrinsic_metadata_cols = [
"id",
"metadata",
- "from_revision",
+ "from_directory",
"mappings",
"tool_id",
"tool_name",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -38,8 +38,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -250,8 +250,8 @@
self._content_ctags = SubStorage(ContentCtagsRow, *args)
self._licenses = SubStorage(ContentLicenseRow, *args)
self._content_metadata = SubStorage(ContentMetadataRow, *args)
- self._revision_intrinsic_metadata = SubStorage(
- RevisionIntrinsicMetadataRow, *args
+ self._directory_intrinsic_metadata = SubStorage(
+ DirectoryIntrinsicMetadataRow, *args
)
self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args)
@@ -369,21 +369,21 @@
added = self._content_metadata.add(metadata)
return {"content_metadata:add": added}
- def revision_intrinsic_metadata_missing(
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
- return self._revision_intrinsic_metadata.missing(metadata)
+ return self._directory_intrinsic_metadata.missing(metadata)
- def revision_intrinsic_metadata_get(
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- return self._revision_intrinsic_metadata.get(ids)
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ return self._directory_intrinsic_metadata.get(ids)
- def revision_intrinsic_metadata_add(
- self, metadata: List[RevisionIntrinsicMetadataRow]
+ def directory_intrinsic_metadata_add(
+ self, metadata: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._revision_intrinsic_metadata.add(metadata)
- return {"revision_intrinsic_metadata:add": added}
+ added = self._directory_intrinsic_metadata.add(metadata)
+ return {"directory_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_get(
self, urls: Iterable[str]
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -15,8 +15,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
TResult = TypeVar("TResult")
@@ -341,8 +341,8 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/missing")
- def revision_intrinsic_metadata_missing(
+ @remote_api_endpoint("directory_intrinsic_metadata/missing")
+ def directory_intrinsic_metadata_missing(
self, metadata: Iterable[Dict]
) -> List[Tuple[Sha1, int]]:
"""List metadata missing from storage.
@@ -350,7 +350,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id** (bytes): sha1_git revision identifier
+ - **id** (bytes): sha1_git directory identifier
- **indexer_configuration_id** (int): tool used to compute
the results
@@ -360,11 +360,11 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata")
- def revision_intrinsic_metadata_get(
+ @remote_api_endpoint("directory_intrinsic_metadata")
+ def directory_intrinsic_metadata_get(
self, ids: Iterable[Sha1]
- ) -> List[RevisionIntrinsicMetadataRow]:
- """Retrieve revision metadata per id.
+ ) -> List[DirectoryIntrinsicMetadataRow]:
+ """Retrieve directory metadata per id.
Args:
ids (iterable): sha1 checksums
@@ -375,10 +375,10 @@
"""
...
- @remote_api_endpoint("revision_intrinsic_metadata/add")
- def revision_intrinsic_metadata_add(
+ @remote_api_endpoint("directory_intrinsic_metadata/add")
+ def directory_intrinsic_metadata_add(
self,
- metadata: List[RevisionIntrinsicMetadataRow],
+ metadata: List[DirectoryIntrinsicMetadataRow],
) -> Dict[str, int]:
"""Add metadata not present in storage.
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -120,8 +120,8 @@
@attr.s
-class RevisionIntrinsicMetadataRow(BaseRow):
- object_type: Final = "revision_intrinsic_metadata"
+class DirectoryIntrinsicMetadataRow(BaseRow):
+ object_type: Final = "directory_intrinsic_metadata"
id = attr.ib(type=Sha1Git)
metadata = attr.ib(type=Dict[str, Any])
@@ -134,5 +134,5 @@
id = attr.ib(type=str)
metadata = attr.ib(type=Dict[str, Any])
- from_revision = attr.ib(type=Sha1Git)
+ from_directory = attr.ib(type=Sha1Git)
mappings = attr.ib(type=List[str])
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -13,9 +13,8 @@
from pytest_postgresql import factories
import yaml
-from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact
-from swh.indexer.storage import get_indexer_storage
-from swh.indexer.storage.db import Db as IndexerDb
+from swh.core.db.pytest_plugin import initialize_database_for_module
+from swh.indexer.storage import IndexerStorage, get_indexer_storage
from swh.objstorage.factory import get_objstorage
from swh.storage import get_storage
@@ -23,23 +22,22 @@
TASK_NAMES: List[Tuple[str, str]] = [
# (scheduler-task-type, task-class-test-name)
- ("index-revision-metadata", "revision_intrinsic_metadata"),
+ ("index-directory-metadata", "directory_intrinsic_metadata"),
("index-origin-metadata", "origin_intrinsic_metadata"),
]
idx_postgresql_proc = factories.postgresql_proc(
- dbname="indexer_storage",
load=[
partial(
initialize_database_for_module,
modname="indexer",
- version=IndexerDb.current_version,
+ version=IndexerStorage.current_version,
)
],
)
-idx_storage_postgresql = postgresql_fact("idx_postgresql_proc")
+idx_storage_postgresql = factories.postgresql("idx_postgresql_proc")
@pytest.fixture
diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py
--- a/swh/indexer/tests/storage/conftest.py
+++ b/swh/indexer/tests/storage/conftest.py
@@ -41,9 +41,9 @@
data.tools = tools
data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
- data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
- data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
- data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
+ data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
+ data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
+ data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
data.origin_url_1 = "file:///dev/0/zero" # 44434341
data.origin_url_2 = "file:///dev/1/one" # 44434342
data.origin_url_3 = "file:///dev/2/two" # 54974445
diff --git a/swh/indexer/tests/storage/test_server.py b/swh/indexer/tests/storage/test_server.py
--- a/swh/indexer/tests/storage/test_server.py
+++ b/swh/indexer/tests/storage/test_server.py
@@ -57,13 +57,13 @@
def test_load_and_check_config_remote_config_local_type_raise(
class_storage, tmpdir
) -> None:
- """Any other configuration than 'local' (the default) is rejected"""
+ """Any other configuration than 'postgresql' (the default) is rejected"""
assert class_storage != "local"
incompatible_config = {"indexer_storage": {"cls": class_storage}}
config_path = prepare_config_file(tmpdir, incompatible_config)
expected_error = (
- "The indexer_storage backend can only be started with a 'local' "
+ "The indexer_storage backend can only be started with a 'postgresql' "
"configuration"
)
with pytest.raises(ValueError, match=expected_error):
@@ -82,8 +82,8 @@
def test_load_and_check_config_local_incomplete_configuration(tmpdir) -> None:
- """Incomplete 'local' configuration should raise"""
- config = {"indexer_storage": {"cls": "local"}}
+ """Incomplete 'postgresql' configuration should raise"""
+ config = {"indexer_storage": {"cls": "postgresql"}}
expected_error = "Invalid configuration; missing 'db' config entry"
config_path = prepare_config_file(tmpdir, config)
@@ -95,10 +95,10 @@
"""'Complete 'local' configuration is fine"""
config = {
"indexer_storage": {
- "cls": "local",
+ "cls": "postgresql",
"db": "db",
}
}
config_path = prepare_config_file(tmpdir, config)
- cfg = load_and_check_config(config_path, type="local")
+ cfg = load_and_check_config(config_path, type="postgresql")
assert cfg == config
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -19,8 +19,8 @@
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.hashutil import hash_to_bytes
@@ -289,37 +289,37 @@
etype = self.endpoint_type
tool = data.tools[self.tool_name]
- data_rev1 = self.row_class.from_dict(
+ data_dir1 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
)
- data_rev2 = self.row_class.from_dict(
+ data_dir2 = self.row_class.from_dict(
{
- "id": data.revision_id_2,
+ "id": data.directory_id_2,
**self.example_data[1],
"indexer_configuration_id": tool["id"],
}
)
# when
- summary = endpoint(storage, etype, "add")([data_rev1])
+ summary = endpoint(storage, etype, "add")([data_dir1])
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
- endpoint(storage, etype, "add")([data_rev2, data_rev2])
+ endpoint(storage, etype, "add")([data_dir2, data_dir2])
# then
actual_data = list(
- endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1])
+ endpoint(storage, etype, "get")([data.directory_id_2, data.directory_id_1])
)
expected_data = [
self.row_class.from_dict(
- {"id": data.revision_id_2, **self.example_data[0], "tool": tool}
+ {"id": data.directory_id_2, **self.example_data[0], "tool": tool}
)
]
assert actual_data == expected_data
@@ -806,11 +806,11 @@
row_class = ContentMetadataRow
-class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester):
- """Test Indexer Storage revision_intrinsic_metadata related methods"""
+class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester):
+ """Test Indexer Storage directory_intrinsic_metadata related methods"""
tool_name = "swh-metadata-detector"
- endpoint_type = "revision_intrinsic_metadata"
+ endpoint_type = "directory_intrinsic_metadata"
example_data = [
{
"metadata": {
@@ -830,7 +830,7 @@
"mappings": ["mapping2"],
},
]
- row_class = RevisionIntrinsicMetadataRow
+ row_class = DirectoryIntrinsicMetadataRow
class TestIndexerStorageContentFossologyLicense(StorageETypeTester):
@@ -1102,8 +1102,8 @@
"version": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1113,11 +1113,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
storage.origin_intrinsic_metadata_add([metadata_origin])
# then
@@ -1130,7 +1130,7 @@
id=data.origin_url_1,
metadata=metadata,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=["mapping1"],
)
]
@@ -1156,8 +1156,8 @@
"version": None,
"name": None,
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata_v1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1167,11 +1167,11 @@
metadata=metadata_v1.copy(),
indexer_configuration_id=tool_id,
mappings=[],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add([metadata_origin_v1])
# when
@@ -1185,7 +1185,7 @@
id=data.origin_url_1,
metadata=metadata_v1,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
mappings=[],
)
]
@@ -1199,16 +1199,16 @@
"author": "MG",
}
)
- metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
+ metadata_dir_v2 = attr.evolve(metadata_dir_v1, metadata=metadata_v2)
metadata_origin_v2 = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v2.copy(),
indexer_configuration_id=tool_id,
mappings=["npm"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
- storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v2])
storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
@@ -1220,7 +1220,7 @@
id=data.origin_url_1,
metadata=metadata_v2,
tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
mappings=["npm"],
)
]
@@ -1252,8 +1252,8 @@
"mappings": [],
}
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir_v1 = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata={
"version": None,
"name": None,
@@ -1265,7 +1265,7 @@
data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data1,
)
@@ -1274,7 +1274,7 @@
data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
indexer_configuration_id=tool_id,
**example_data2,
)
@@ -1287,7 +1287,7 @@
data_v2b = list(reversed(data_v2[0:-1]))
# given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
+ storage.directory_intrinsic_metadata_add([metadata_dir_v1])
storage.origin_intrinsic_metadata_add(data_v1)
# when
@@ -1296,7 +1296,7 @@
expected_data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data1,
)
@@ -1326,7 +1326,7 @@
expected_data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data2,
)
@@ -1351,8 +1351,8 @@
"developmentStatus": None,
"name": None,
}
- metadata_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
@@ -1362,11 +1362,11 @@
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata_rev])
+ storage.directory_intrinsic_metadata_add([metadata_dir])
with pytest.raises(DuplicateId):
storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin])
@@ -1381,8 +1381,8 @@
metadata1 = {
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1392,13 +1392,13 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1408,13 +1408,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1444,8 +1444,8 @@
"Jane Doe",
]
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1455,7 +1455,7 @@
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"author": [
@@ -1463,8 +1463,8 @@
"Jane Doe",
]
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
@@ -1474,13 +1474,13 @@
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
# when
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
@@ -1508,8 +1508,8 @@
"@context": "foo",
"author": "John Doe",
}
- metadata1_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
+ metadata1_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_1,
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
@@ -1519,14 +1519,14 @@
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
- from_revision=data.revision_id_1,
+ from_directory=data.directory_id_1,
)
metadata2 = {
"@context": "foo",
"author": "Jane Doe",
}
- metadata2_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_2,
+ metadata2_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_2,
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1536,13 +1536,13 @@
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
metadata3 = {
"@context": "foo",
}
- metadata3_rev = RevisionIntrinsicMetadataRow(
- id=data.revision_id_3,
+ metadata3_dir = DirectoryIntrinsicMetadataRow(
+ id=data.directory_id_3,
metadata=metadata3,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
@@ -1552,14 +1552,14 @@
metadata=metadata3,
mappings=["pkg-info"],
indexer_configuration_id=tool2_id,
- from_revision=data.revision_id_3,
+ from_directory=data.directory_id_3,
)
- storage.revision_intrinsic_metadata_add([metadata1_rev])
+ storage.directory_intrinsic_metadata_add([metadata1_dir])
storage.origin_intrinsic_metadata_add([metadata1_origin])
- storage.revision_intrinsic_metadata_add([metadata2_rev])
+ storage.directory_intrinsic_metadata_add([metadata2_dir])
storage.origin_intrinsic_metadata_add([metadata2_origin])
- storage.revision_intrinsic_metadata_add([metadata3_rev])
+ storage.directory_intrinsic_metadata_add([metadata3_dir])
storage.origin_intrinsic_metadata_add([metadata3_origin])
def test_origin_intrinsic_metadata_search_by_producer(
@@ -1685,7 +1685,7 @@
},
mappings=["npm", "gemspec"],
tool=tool2,
- from_revision=data.revision_id_2,
+ from_directory=data.directory_id_2,
)
],
next_page_token=None,
diff --git a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py
--- a/swh/indexer/tests/tasks.py
+++ b/swh/indexer/tests/tasks.py
@@ -1,13 +1,12 @@
from celery import current_app as app
-from swh.indexer.metadata import OriginMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import DirectoryMetadataIndexer, OriginMetadataIndexer
from .test_metadata import ContentMetadataTestIndexer
-from .test_origin_head import OriginHeadTestIndexer
from .utils import BASE_TEST_CONFIG
-class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
+class DirectoryMetadataTestIndexer(DirectoryMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
@@ -30,13 +29,12 @@
return {**BASE_TEST_CONFIG, "tools": []}
def _prepare_sub_indexers(self):
- self.origin_head_indexer = OriginHeadTestIndexer()
- self.revision_metadata_indexer = RevisionMetadataTestIndexer()
+ self.directory_metadata_indexer = DirectoryMetadataTestIndexer()
@app.task
-def revision_intrinsic_metadata(*args, **kwargs):
- indexer = RevisionMetadataTestIndexer()
+def directory_intrinsic_metadata(*args, **kwargs):
+ indexer = DirectoryMetadataTestIndexer()
indexer.run(*args, **kwargs)
print("REV RESULT=", indexer.results)
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -16,13 +16,15 @@
from swh.indexer.cli import indexer_cli_group
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.journal.writer import get_journal_writer
from swh.model.hashutil import hash_to_bytes
from swh.model.model import OriginVisitStatus
+from .utils import DIRECTORY2, REVISION
+
def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
tools: List[Dict[str, Any]] = [
@@ -38,15 +40,15 @@
origin_metadata = [
OriginIntrinsicMetadataRow(
id="file://dev/%04d" % origin_id,
- from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)),
+ from_directory=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
mappings=["mapping%d" % (origin_id % 10)],
)
for origin_id in range(nb_rows)
]
- revision_metadata = [
- RevisionIntrinsicMetadataRow(
+ directory_metadata = [
+ DirectoryIntrinsicMetadataRow(
id=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
@@ -55,7 +57,7 @@
for origin_id in range(nb_rows)
]
- idx_storage.revision_intrinsic_metadata_add(revision_metadata)
+ idx_storage.directory_intrinsic_metadata_add(directory_metadata)
idx_storage.origin_intrinsic_metadata_add(origin_metadata)
return [tool["id"] for tool in tools]
@@ -400,7 +402,7 @@
return datetime.datetime.now(tz=datetime.timezone.utc)
-def test_cli_journal_client(
+def test_cli_journal_client_schedule(
cli_runner,
swh_config,
indexer_scheduler,
@@ -523,3 +525,131 @@
],
catch_exceptions=False,
)
+
+
+@pytest.mark.parametrize("indexer_name", ["origin-intrinsic-metadata", "*"])
+def test_cli_journal_client_index(
+ cli_runner,
+ swh_config,
+ kafka_prefix: str,
+ kafka_server,
+ consumer: Consumer,
+ idx_storage,
+ storage,
+ mocker,
+ swh_indexer_config,
+ indexer_name: str,
+):
+ """Test the 'swh indexer journal-client' cli tool."""
+ journal_writer = get_journal_writer(
+ "kafka",
+ brokers=[kafka_server],
+ prefix=kafka_prefix,
+ client_id="test producer",
+ value_sanitizer=lambda object_type, value: value,
+ flush_timeout=3, # fail early if something goes wrong
+ )
+
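+ # visit statuses written to the journal; only the "full" ones below are expected to be indexed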
+ visit_statuses = [
+ OriginVisitStatus(
+ origin="file:///dev/zero",
+ visit=1,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus(
+ origin="file:///dev/foobar",
+ visit=2,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus(
+ origin="file:///tmp/spamegg",
+ visit=3,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus(
+ origin="file:///dev/0002",
+ visit=6,
+ date=now(),
+ status="full",
+ snapshot=None,
+ ),
+ OriginVisitStatus( # will be filtered out due to its 'partial' status
+ origin="file:///dev/0000",
+ visit=4,
+ date=now(),
+ status="partial",
+ snapshot=None,
+ ),
+ OriginVisitStatus( # will be filtered out due to its 'ongoing' status
+ origin="file:///dev/0001",
+ visit=5,
+ date=now(),
+ status="ongoing",
+ snapshot=None,
+ ),
+ ]
+
+ journal_writer.write_additions("origin_visit_status", visit_statuses)
+ visit_statuses_full = [vs for vs in visit_statuses if vs.status == "full"]
+ storage.revision_add([REVISION])
+
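+ # stub head detection and the directory-level indexing so no real repository content has to be indexed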
+ mocker.patch(
+ "swh.indexer.metadata.get_head_swhid",
+ return_value=REVISION.swhid(),
+ )
+
+ mocker.patch(
+ "swh.indexer.metadata.DirectoryMetadataIndexer.index",
+ return_value=[
+ DirectoryIntrinsicMetadataRow(
+ id=DIRECTORY2.id,
+ indexer_configuration_id=1,
+ mappings=["cff"],
+ metadata={"foo": "bar"},
+ )
+ ],
+ )
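+ # run the journal client CLI against the Kafka topics written above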
+ result = cli_runner.invoke(
+ indexer_cli_group,
+ [
+ "-C",
+ swh_config,
+ "journal-client",
+ indexer_name,
+ "--broker",
+ kafka_server,
+ "--prefix",
+ kafka_prefix,
+ "--group-id",
+ "test-consumer",
+ "--stop-after-objects",
+ len(visit_statuses),
+ ],
+ catch_exceptions=False,
+ )
+
+ # Check the output
+ expected_output = "Done.\n"
+ assert result.exit_code == 0, result.output
+ assert result.output == expected_output
+
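+ # each origin with a "full" visit should now have intrinsic metadata pointing at DIRECTORY2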
+ results = idx_storage.origin_intrinsic_metadata_get(
+ [status.origin for status in visit_statuses]
+ )
+ expected_results = [
+ OriginIntrinsicMetadataRow(
+ id=status.origin,
+ from_directory=DIRECTORY2.id,
+ tool={"id": 1, **swh_indexer_config["tools"]},
+ mappings=["cff"],
+ metadata={"foo": "bar"},
+ )
+ for status in sorted(visit_statuses_full, key=lambda r: r.origin)
+ ]
+ assert sorted(results, key=lambda r: r.id) == expected_results
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -11,13 +11,13 @@
from swh.indexer.indexer import (
ContentIndexer,
ContentPartitionIndexer,
+ DirectoryIndexer,
OriginIndexer,
- RevisionIndexer,
)
from swh.indexer.storage import PagedResult, Sha1
from swh.model.model import Content
-from .utils import BASE_TEST_CONFIG
+from .utils import BASE_TEST_CONFIG, DIRECTORY2
class _TestException(Exception):
@@ -49,7 +49,7 @@
pass
-class CrashingRevisionIndexer(CrashingIndexerMixin, RevisionIndexer):
+class CrashingDirectoryIndexer(CrashingIndexerMixin, DirectoryIndexer):
pass
@@ -86,29 +86,43 @@
indexer.run([b"foo"])
-def test_revision_indexer_catch_exceptions():
- indexer = CrashingRevisionIndexer(config=BASE_TEST_CONFIG)
+def test_directory_indexer_catch_exceptions():
+ indexer = CrashingDirectoryIndexer(config=BASE_TEST_CONFIG)
indexer.storage = Mock()
- indexer.storage.revision_get.return_value = ["rev"]
+ indexer.storage.directory_get.return_value = [DIRECTORY2]
assert indexer.run([b"foo"]) == {"status": "failed"}
+ assert indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) == {
+ "status": "failed"
+ }
+
indexer.catch_exceptions = False
with pytest.raises(_TestException):
indexer.run([b"foo"])
+ with pytest.raises(_TestException):
+ indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]})
+
def test_origin_indexer_catch_exceptions():
indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG)
assert indexer.run(["http://example.org"]) == {"status": "failed"}
+ assert indexer.process_journal_objects(
+ {"origin": [{"url": "http://example.org"}]}
+ ) == {"status": "failed"}
+
indexer.catch_exceptions = False
with pytest.raises(_TestException):
indexer.run(["http://example.org"])
+ with pytest.raises(_TestException):
+ indexer.process_journal_objects({"origin": [{"url": "http://example.org"}]})
+
def test_content_partition_indexer_catch_exceptions():
indexer = CrashingContentPartitionIndexer(
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,24 +1,25 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
-import unittest
+import logging
from hypothesis import HealthCheck, given, settings, strategies
+import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
-from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
-from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow
-from swh.indexer.tests.utils import DIRECTORY2, REVISION
+from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
+from swh.indexer.tests.utils import DIRECTORY2
from swh.model.hashutil import hash_to_bytes
-from swh.model.model import Directory, DirectoryEntry, Revision
+from swh.model.model import Directory, DirectoryEntry
from .utils import (
BASE_TEST_CONFIG,
@@ -42,25 +43,21 @@
"""
def parse_config_file(self, *args, **kwargs):
- assert False, "should not be called; the rev indexer configures it."
+ assert False, "should not be called; the dir indexer configures it."
-REVISION_METADATA_CONFIG = {
+DIRECTORY_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
-class Metadata(unittest.TestCase):
+class TestMetadata:
"""
Tests metadata_mock_tool tool for Metadata detection
"""
- def setUp(self):
- """
- shows the entire diff in the results
- """
- self.maxDiff = None
+ def setup_method(self):
self.npm_mapping = MAPPINGS["NpmMapping"]()
self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
self.maven_mapping = MAPPINGS["MavenMapping"]()
@@ -81,7 +78,7 @@
# when
result = self.npm_mapping.translate(content)
# then
- self.assertEqual(declared_metadata, result)
+ assert declared_metadata == result
def test_compute_metadata_cff(self):
"""
@@ -160,7 +157,7 @@
# when
result = self.cff_mapping.translate(content)
# then
- self.assertEqual(expected, result)
+ assert expected == result
def test_compute_metadata_npm(self):
"""
@@ -201,7 +198,7 @@
# when
result = self.npm_mapping.translate(content)
# then
- self.assertEqual(declared_metadata, result)
+ assert declared_metadata == result
def test_index_content_metadata_npm(self):
"""
@@ -275,7 +272,7 @@
del result.tool["id"]
# The assertion below returns False sometimes because of nested lists
- self.assertEqual(expected_results, results)
+ assert expected_results == results
def test_npm_bugs_normalization(self):
# valid dictionary
@@ -287,15 +284,12 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "issueTracker": "https://github.com/owner/project/issues",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "issueTracker": "https://github.com/owner/project/issues",
+ "type": "SoftwareSourceCode",
+ }
# "invalid" dictionary
package_json = b"""{
@@ -305,14 +299,11 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "type": "SoftwareSourceCode",
+ }
# string
package_json = b"""{
@@ -320,15 +311,12 @@
"bugs": "https://github.com/owner/project/issues"
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "issueTracker": "https://github.com/owner/project/issues",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "issueTracker": "https://github.com/owner/project/issues",
+ "type": "SoftwareSourceCode",
+ }
def test_npm_repository_normalization(self):
# normal
@@ -340,15 +328,12 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "codeRepository": "git+https://github.com/npm/cli.git",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "codeRepository": "git+https://github.com/npm/cli.git",
+ "type": "SoftwareSourceCode",
+ }
# missing url
package_json = b"""{
@@ -358,14 +343,11 @@
}
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "type": "SoftwareSourceCode",
+ }
# github shortcut
package_json = b"""{
@@ -379,7 +361,7 @@
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
- self.assertEqual(result, expected_result)
+ assert result == expected_result
# github shortshortcut
package_json = b"""{
@@ -387,7 +369,7 @@
"repository": "npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(result, expected_result)
+ assert result == expected_result
# gitlab shortcut
package_json = b"""{
@@ -395,52 +377,48 @@
"repository": "gitlab:user/repo"
}"""
result = self.npm_mapping.translate(package_json)
- self.assertEqual(
- result,
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "name": "foo",
+ "codeRepository": "git+https://gitlab.com/user/repo.git",
+ "type": "SoftwareSourceCode",
+ }
+
+ @pytest.mark.parametrize(
+ "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
+ )
+ def test_detect_metadata_package_json(self, filename):
+ # given
+ df = [
{
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "name": "foo",
- "codeRepository": "git+https://gitlab.com/user/repo.git",
- "type": "SoftwareSourceCode",
+ "sha1_git": b"abc",
+ "name": b"index.js",
+ "target": b"abc",
+ "length": 897,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"bcd",
},
- )
-
- def test_detect_metadata_package_json(self):
- filenames = [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
-
- for filename in filenames:
- with self.subTest(filename=filename):
- # given
- df = [
- {
- "sha1_git": b"abc",
- "name": b"index.js",
- "target": b"abc",
- "length": 897,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"bcd",
- },
- {
- "sha1_git": b"aab",
- "name": filename,
- "target": b"aab",
- "length": 712,
- "status": "visible",
- "type": "file",
- "perms": 33188,
- "dir_id": b"dir_a",
- "sha1": b"cde",
- },
- ]
- # when
- results = detect_metadata(df)
+ {
+ "sha1_git": b"aab",
+ "name": filename,
+ "target": b"aab",
+ "length": 712,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"cde",
+ },
+ ]
+ # when
+ results = detect_metadata(df)
- expected_results = {"NpmMapping": [b"cde"]}
- # then
- self.assertEqual(expected_results, results)
+ expected_results = {"NpmMapping": [b"cde"]}
+ # then
+ assert expected_results == results
def test_detect_metadata_codemeta_json_uppercase(self):
# given
@@ -473,7 +451,7 @@
expected_results = {"CodemetaMapping": [b"bcd"]}
# then
- self.assertEqual(expected_results, results)
+ assert expected_results == results
def test_compute_metadata_valid_codemeta(self):
raw_content = b"""{
@@ -580,7 +558,7 @@
"programmingLanguage": "JSON-LD",
}
result = self.codemeta_mapping.translate(raw_content)
- self.assertEqual(result, expected_result)
+ assert result == expected_result
def test_compute_metadata_codemeta_alternate_context(self):
raw_content = b"""{
@@ -594,7 +572,7 @@
"identifier": "CodeMeta",
}
result = self.codemeta_mapping.translate(raw_content)
- self.assertEqual(result, expected_result)
+ assert result == expected_result
def test_compute_metadata_maven(self):
raw_content = b"""
@@ -625,33 +603,27 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
- "codeRepository": (
- "http://repo1.maven.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ "codeRepository": (
+ "http://repo1.maven.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
def test_compute_metadata_maven_empty(self):
raw_content = b"""
<project>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
def test_compute_metadata_maven_almost_empty(self):
raw_content = b"""
@@ -659,81 +631,85 @@
<foo/>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
- def test_compute_metadata_maven_invalid_xml(self):
+ def test_compute_metadata_maven_invalid_xml(self, caplog):
expected_warning = (
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error parsing XML from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error parsing XML from foo",
)
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
<project>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
raw_content = b"""
"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
- def test_compute_metadata_maven_unknown_encoding(self):
+ def test_compute_metadata_maven_unknown_encoding(self, caplog):
expected_warning = (
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error detecting XML encoding from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error detecting XML encoding from foo",
)
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""<?xml version="1.0" encoding="foo"?>
<project>
</project>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
raw_content = b"""<?xml version="1.0" encoding="UTF-7"?>
<project>
</project>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertEqual(cm.output, [expected_warning])
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples == [expected_warning]
+ assert result is None
- def test_compute_metadata_maven_invalid_encoding(self):
+ def test_compute_metadata_maven_invalid_encoding(self, caplog):
expected_warning = [
# libexpat1 <= 2.2.10-2+deb11u1
[
(
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error unidecoding XML from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error unidecoding XML from foo",
)
],
# libexpat1 >= 2.2.10-2+deb11u2
[
(
- "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
- "Error parsing XML from foo"
+ "swh.indexer.metadata_dictionary.maven.MavenMapping",
+ logging.WARNING,
+ "Error parsing XML from foo",
)
],
]
+ caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""<?xml version="1.0" encoding="UTF-8"?>
<foo\xe5ct>
</foo>"""
- with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
- result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
- self.assertIn(cm.output, expected_warning)
- self.assertEqual(result, None)
+ caplog.clear()
+ result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
+ assert caplog.record_tuples in expected_warning
+ assert result is None
def test_compute_metadata_maven_minimal(self):
raw_content = b"""
@@ -745,19 +721,16 @@
<version>1.2.3</version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
def test_compute_metadata_maven_empty_nodes(self):
raw_content = b"""
@@ -771,19 +744,16 @@
</repositories>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -794,18 +764,15 @@
<version></version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -816,18 +783,15 @@
<version>1.2.3</version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -840,19 +804,16 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
raw_content = b"""
<project>
@@ -860,14 +821,11 @@
<version>1.2.3</version>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "version": "1.2.3",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "version": "1.2.3",
+ }
def test_compute_metadata_maven_invalid_licenses(self):
raw_content = b"""
@@ -882,19 +840,16 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "codeRepository": (
- "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
- ),
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "codeRepository": (
+ "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
+ ),
+ }
def test_compute_metadata_maven_multiple(self):
"""Tests when there are multiple code repos and licenses."""
@@ -936,24 +891,21 @@
</licenses>
</project>"""
result = self.maven_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "Maven Default Project",
- "identifier": "com.mycompany.app",
- "version": "1.2.3",
- "license": [
- "https://www.apache.org/licenses/LICENSE-2.0.txt",
- "https://opensource.org/licenses/MIT",
- ],
- "codeRepository": [
- "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
- "http://example.org/maven2/com/mycompany/app/my-app",
- ],
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "license": [
+ "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ "https://opensource.org/licenses/MIT",
+ ],
+ "codeRepository": [
+ "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
+ "http://example.org/maven2/com/mycompany/app/my-app",
+ ],
+ }
def test_compute_metadata_pkginfo(self):
raw_content = b"""\
@@ -987,40 +939,33 @@
Provides-Extra: testing
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertCountEqual(
- result["description"],
- [
- "Software Heritage core utilities", # note the comma here
- "swh-core\n"
- "========\n"
- "\n"
- "core library for swh's modules:\n"
- "- config parser\n"
- "- hash computations\n"
- "- serialization\n"
- "- logging mechanism\n"
- "",
- ],
- result,
- )
+ assert result["description"] == [
+ "Software Heritage core utilities", # note the comma here
+ "swh-core\n"
+ "========\n"
+ "\n"
+ "core library for swh's modules:\n"
+ "- config parser\n"
+ "- hash computations\n"
+ "- serialization\n"
+ "- logging mechanism\n"
+ "",
+ ], result
del result["description"]
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "url": "https://forge.softwareheritage.org/diffusion/DCORE/",
- "name": "swh.core",
- "author": [
- {
- "type": "Person",
- "name": "Software Heritage developers",
- "email": "swh-devel@inria.fr",
- }
- ],
- "version": "0.0.49",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "url": "https://forge.softwareheritage.org/diffusion/DCORE/",
+ "name": "swh.core",
+ "author": [
+ {
+ "type": "Person",
+ "name": "Software Heritage developers",
+ "email": "swh-devel@inria.fr",
+ }
+ ],
+ "version": "0.0.49",
+ }
def test_compute_metadata_pkginfo_utf8(self):
raw_content = b"""\
@@ -1031,15 +976,12 @@
Hydrology N\xc2\xb083
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "snowpyt",
- "description": "foo\nHydrology N°83",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "snowpyt",
+ "description": "foo\nHydrology N°83",
+ }
def test_compute_metadata_pkginfo_keywords(self):
raw_content = b"""\
@@ -1048,15 +990,12 @@
Keywords: foo bar baz
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "foo",
- "keywords": ["foo", "bar", "baz"],
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "foo",
+ "keywords": ["foo", "bar", "baz"],
+ }
def test_compute_metadata_pkginfo_license(self):
raw_content = b"""\
@@ -1065,15 +1004,12 @@
License: MIT
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "foo",
- "license": "MIT",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "foo",
+ "license": "MIT",
+ }
def test_gemspec_base(self):
raw_content = b"""
@@ -1090,23 +1026,20 @@
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertCountEqual(
- result.pop("description"),
- ["This is an example!", "Much longer explanation of the example!"],
- )
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [{"type": "Person", "name": "Ruby Coder"}],
- "name": "example",
- "license": "https://spdx.org/licenses/MIT",
- "codeRepository": "https://rubygems.org/gems/example",
- "email": "rubycoder@example.com",
- "version": "0.1.0",
- },
- )
+ assert set(result.pop("description")) == {
+ "This is an example!",
+ "Much longer explanation of the example!",
+ }
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"type": "Person", "name": "Ruby Coder"}],
+ "name": "example",
+ "license": "https://spdx.org/licenses/MIT",
+ "codeRepository": "https://rubygems.org/gems/example",
+ "email": "rubycoder@example.com",
+ "version": "0.1.0",
+ }
def test_gemspec_two_author_fields(self):
raw_content = b"""
@@ -1115,20 +1048,20 @@
s.author = "Ruby Coder2"
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertCountEqual(
- result.pop("author"),
+ assert result.pop("author") in (
[
{"type": "Person", "name": "Ruby Coder1"},
{"type": "Person", "name": "Ruby Coder2"},
],
+ [
+ {"type": "Person", "name": "Ruby Coder2"},
+ {"type": "Person", "name": "Ruby Coder1"},
+ ],
)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
def test_gemspec_invalid_author(self):
raw_content = b"""
@@ -1136,38 +1069,29 @@
s.author = ["Ruby Coder"]
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
raw_content = b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ }
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "author": [{"type": "Person", "name": "Ruby Coder1"}],
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"type": "Person", "name": "Ruby Coder1"}],
+ }
def test_gemspec_alternative_header(self):
raw_content = b"""
@@ -1179,15 +1103,12 @@
}
"""
result = self.gemspec_mapping.translate(raw_content)
- self.assertEqual(
- result,
- {
- "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
- "type": "SoftwareSourceCode",
- "name": "rb-system-with-aliases",
- "description": "execute system commands with aliases",
- },
- )
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "rb-system-with-aliases",
+ "description": "execute system commands with aliases",
+ }
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(NpmMapping.mapping)))
@@ -1233,8 +1154,8 @@
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
- def test_revision_metadata_indexer(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
@@ -1242,8 +1163,7 @@
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ dir_ = DIRECTORY2
metadata_indexer.idx_storage.content_metadata_add(
[
@@ -1255,15 +1175,17 @@
]
)
- metadata_indexer.run([rev.id])
+ metadata_indexer.run([dir_.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get(
+ [DIRECTORY2.id]
+ )
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=dir_.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -1274,35 +1196,29 @@
del result.tool["id"]
# then
- self.assertEqual(results, expected_results)
+ assert results == expected_results
- def test_revision_metadata_indexer_single_root_dir(self):
- metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ def test_directory_metadata_indexer_single_root_dir(self):
+ metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory, that is the only directory at the root
- # of the revision
- rev = REVISION
- assert rev.directory == DIRECTORY2.id
+ # of the directory
+ dir_ = DIRECTORY2
- directory = Directory(
+ new_dir = Directory(
entries=(
DirectoryEntry(
name=b"foobar-1.0.0",
type="dir",
- target=rev.directory,
+ target=dir_.id,
perms=16384,
),
),
)
- assert directory.id is not None
- metadata_indexer.storage.directory_add([directory])
-
- new_rev_dict = {**rev.to_dict(), "directory": directory.id}
- new_rev_dict.pop("id")
- new_rev = Revision.from_dict(new_rev_dict)
- metadata_indexer.storage.revision_add([new_rev])
+ assert new_dir.id is not None
+ metadata_indexer.storage.directory_add([new_dir])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
@@ -1319,15 +1235,15 @@
]
)
- metadata_indexer.run([new_rev.id])
+ metadata_indexer.run([new_dir.id])
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
+ metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
)
expected_results = [
- RevisionIntrinsicMetadataRow(
- id=new_rev.id,
+ DirectoryIntrinsicMetadataRow(
+ id=new_dir.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -1338,4 +1254,4 @@
del result.tool["id"]
# then
- self.assertEqual(results, expected_results)
+ assert results == expected_results
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -1,15 +1,13 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import copy
from datetime import datetime, timezone
-import unittest
import pytest
-from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.origin_head import get_head_swhid
from swh.indexer.tests.utils import fill_storage
from swh.model.model import (
Origin,
@@ -19,37 +17,9 @@
SnapshotBranch,
TargetType,
)
+from swh.model.swhids import CoreSWHID
from swh.storage.utils import now
-
-@pytest.fixture
-def swh_indexer_config(swh_indexer_config):
- config = copy.deepcopy(swh_indexer_config)
- config.update(
- {
- "tools": {
- "name": "origin-metadata",
- "version": "0.0.1",
- "configuration": {},
- },
- "tasks": {
- "revision_intrinsic_metadata": None,
- "origin_intrinsic_metadata": None,
- },
- }
- )
- return config
-
-
-class OriginHeadTestIndexer(OriginHeadIndexer):
- """Specific indexer whose configuration is enough to satisfy the
- indexing tests.
- """
-
- def persist_index_computations(self, results):
- self.results = results
-
-
SAMPLE_SNAPSHOT = Snapshot(
branches={
b"foo": None,
@@ -61,156 +31,127 @@
)
-class OriginHead(unittest.TestCase):
- @pytest.fixture(autouse=True)
- def init(self, swh_config):
- super().setUp()
- self.indexer = OriginHeadTestIndexer()
- self.indexer.catch_exceptions = False
- fill_storage(self.indexer.storage)
-
- def test_git(self):
- origin_url = "https://github.com/SoftwareHeritage/swh-storage"
- self.indexer.run([origin_url])
- rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
-
- def test_git_partial_snapshot(self):
- """Checks partial snapshots are ignored."""
- origin_url = "https://github.com/SoftwareHeritage/swh-core"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- visit = self.indexer.storage.origin_visit_add(
- [
- OriginVisit(
- origin=origin_url,
- date=datetime(2019, 2, 27, tzinfo=timezone.utc),
- type="git",
- )
- ]
- )[0]
- self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
- visit_status = OriginVisitStatus(
- origin=origin_url,
- visit=visit.visit,
- date=now(),
- status="partial",
- snapshot=SAMPLE_SNAPSHOT.id,
- )
- self.indexer.storage.origin_visit_status_add([visit_status])
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_vcs_missing_snapshot(self):
- origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_pypi_missing_branch(self):
- origin_url = "https://pypi.org/project/abcdef/"
- self.indexer.storage.origin_add(
- [
- Origin(
- url=origin_url,
- )
- ]
- )
- visit = self.indexer.storage.origin_visit_add(
- [
- OriginVisit(
- origin=origin_url,
- date=datetime(2019, 2, 27, tzinfo=timezone.utc),
- type="pypi",
- )
- ]
- )[0]
- self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
- visit_status = OriginVisitStatus(
- origin=origin_url,
- visit=visit.visit,
- date=now(),
- status="full",
- snapshot=SAMPLE_SNAPSHOT.id,
- )
- self.indexer.storage.origin_visit_status_add([visit_status])
- self.indexer.run(["https://pypi.org/project/abcdef/"])
- self.assertEqual(self.indexer.results, [])
-
- def test_ftp(self):
- origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
- self.indexer.run([origin_url])
- rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
-
- def test_ftp_missing_snapshot(self):
- origin_url = "rsync://ftp.gnu.org/gnu/foobar"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_deposit(self):
- origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
- self.indexer.storage.origin_add([Origin(url=origin_url)])
- self.indexer.run([origin_url])
- rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
-
- def test_deposit_missing_snapshot(self):
- origin_url = "https://forge.softwareheritage.org/source/foobar"
- self.indexer.storage.origin_add(
- [
- Origin(
- url=origin_url,
- )
- ]
- )
- self.indexer.run([origin_url])
- self.assertEqual(self.indexer.results, [])
-
- def test_pypi(self):
- origin_url = "https://pypi.org/project/limnoria/"
- self.indexer.run([origin_url])
-
- rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t"
- self.assertEqual(
- self.indexer.results,
- [{"revision_id": rev_id, "origin_url": origin_url}],
- )
-
- def test_svn(self):
- origin_url = "http://0-512-md.googlecode.com/svn/"
- self.indexer.run([origin_url])
- rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18"
- self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": rev_id,
- "origin_url": origin_url,
- }
- ],
- )
+@pytest.fixture
+def storage(swh_storage):
+ fill_storage(swh_storage)
+ return swh_storage
+
+
+def test_git(storage):
+ origin_url = "https://github.com/SoftwareHeritage/swh-storage"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:384b12006403cce45d6253e38f7bd77dacef726d"
+ )
+
+
+def test_git_partial_snapshot(storage):
+ """Checks partial snapshots are ignored."""
+ origin_url = "https://github.com/SoftwareHeritage/swh-core"
+ storage.origin_add([Origin(url=origin_url)])
+ visit = storage.origin_visit_add(
+ [
+ OriginVisit(
+ origin=origin_url,
+ date=datetime(2019, 2, 27, tzinfo=timezone.utc),
+ type="git",
+ )
+ ]
+ )[0]
+ storage.snapshot_add([SAMPLE_SNAPSHOT])
+ visit_status = OriginVisitStatus(
+ origin=origin_url,
+ visit=visit.visit,
+ date=now(),
+ status="partial",
+ snapshot=SAMPLE_SNAPSHOT.id,
+ )
+ storage.origin_visit_status_add([visit_status])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_vcs_missing_snapshot(storage):
+ origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
+ storage.origin_add([Origin(url=origin_url)])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_pypi_missing_branch(storage):
+ origin_url = "https://pypi.org/project/abcdef/"
+ storage.origin_add(
+ [
+ Origin(
+ url=origin_url,
+ )
+ ]
+ )
+ visit = storage.origin_visit_add(
+ [
+ OriginVisit(
+ origin=origin_url,
+ date=datetime(2019, 2, 27, tzinfo=timezone.utc),
+ type="pypi",
+ )
+ ]
+ )[0]
+ storage.snapshot_add([SAMPLE_SNAPSHOT])
+ visit_status = OriginVisitStatus(
+ origin=origin_url,
+ visit=visit.visit,
+ date=now(),
+ status="full",
+ snapshot=SAMPLE_SNAPSHOT.id,
+ )
+ storage.origin_visit_status_add([visit_status])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_ftp(storage):
+ origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79"
+ )
+
+
+def test_ftp_missing_snapshot(storage):
+ origin_url = "rsync://ftp.gnu.org/gnu/foobar"
+ storage.origin_add([Origin(url=origin_url)])
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_deposit(storage):
+ origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
+ storage.origin_add([Origin(url=origin_url)])
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb"
+ )
+
+
+def test_deposit_missing_snapshot(storage):
+ origin_url = "https://forge.softwareheritage.org/source/foobar"
+ storage.origin_add(
+ [
+ Origin(
+ url=origin_url,
+ )
+ ]
+ )
+ assert get_head_swhid(storage, origin_url) is None
+
+
+def test_pypi(storage):
+ origin_url = "https://old-pypi.example.org/project/limnoria/"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
+ )
+
+ origin_url = "https://pypi.org/project/limnoria/"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
+ )
+
+
+def test_svn(storage):
+ origin_url = "http://0-512-md.googlecode.com/svn/"
+ assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+ "swh:1:rev:e43f72e12c88abece79a87b8c9ad232e1b773d18"
+ )
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -11,14 +11,14 @@
from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
+ DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
- RevisionIntrinsicMetadataRow,
)
from swh.model.model import Origin
from swh.storage.interface import StorageInterface
from .test_metadata import TRANSLATOR_TOOL
-from .utils import REVISION, YARN_PARSER_METADATA
+from .utils import DIRECTORY2, YARN_PARSER_METADATA
@pytest.fixture
@@ -29,7 +29,47 @@
return cfg
-def test_origin_metadata_indexer(
+def test_origin_metadata_indexer_release(
+ swh_indexer_config,
+ idx_storage: IndexerStorageInterface,
+ storage: StorageInterface,
+ obj_storage,
+) -> None:
+ indexer = OriginMetadataIndexer(config=swh_indexer_config)
+ origin = "https://npm.example.org/yarn-parser"
+ indexer.run([origin])
+
+ tool = swh_indexer_config["tools"]
+
+ dir_id = DIRECTORY2.id
+ dir_metadata = DirectoryIntrinsicMetadataRow(
+ id=dir_id,
+ tool=tool,
+ metadata=YARN_PARSER_METADATA,
+ mappings=["npm"],
+ )
+ origin_metadata = OriginIntrinsicMetadataRow(
+ id=origin,
+ tool=tool,
+ from_directory=dir_id,
+ metadata=YARN_PARSER_METADATA,
+ mappings=["npm"],
+ )
+
+ dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ for dir_result in dir_results:
+ assert dir_result.tool
+ del dir_result.tool["id"]
+ assert dir_results == [dir_metadata]
+
+ orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
+ for orig_result in orig_results:
+ assert orig_result.tool
+ del orig_result.tool["id"]
+ assert orig_results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_revision(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
@@ -41,9 +81,9 @@
tool = swh_indexer_config["tools"]
- rev_id = REVISION.id
- rev_metadata = RevisionIntrinsicMetadataRow(
- id=rev_id,
+ dir_id = DIRECTORY2.id
+ dir_metadata = DirectoryIntrinsicMetadataRow(
+ id=dir_id,
tool=tool,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
@@ -51,16 +91,16 @@
origin_metadata = OriginIntrinsicMetadataRow(
id=origin,
tool=tool,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
- rev_results = list(idx_storage.revision_intrinsic_metadata_get([rev_id]))
- for rev_result in rev_results:
- assert rev_result.tool
- del rev_result.tool["id"]
- assert rev_results == [rev_metadata]
+ dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ for dir_result in dir_results:
+ assert dir_result.tool
+ del dir_result.tool["id"]
+ assert dir_results == [dir_metadata]
orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
for orig_result in orig_results:
@@ -82,10 +122,10 @@
indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert len(rev_results) == 1
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert len(dir_results) == 1
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert len(orig_results) == 1
@@ -121,15 +161,15 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == [
- RevisionIntrinsicMetadataRow(
- id=rev_id,
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == [
+ DirectoryIntrinsicMetadataRow(
+ id=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
- tool=rev_results[0].tool,
+ tool=dir_results[0].tool,
)
]
@@ -140,7 +180,7 @@
assert orig_results == [
OriginIntrinsicMetadataRow(
id=origin2,
- from_revision=rev_id,
+ from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=orig_results[0].tool,
@@ -148,7 +188,7 @@
]
-def test_origin_metadata_indexer_duplicate_revision(
+def test_origin_metadata_indexer_duplicate_directory(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
@@ -162,10 +202,10 @@
origin2 = "https://github.com/librariesio/yarn-parser.git"
indexer.run([origin1, origin2])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert len(rev_results) == 1
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert len(dir_results) == 1
orig_results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
@@ -185,10 +225,10 @@
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == []
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@@ -204,16 +244,16 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=(["npm"], {"@context": "foo"}),
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == []
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@@ -229,16 +269,16 @@
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
- "swh.indexer.metadata.RevisionMetadataIndexer"
- ".translate_revision_intrinsic_metadata",
+ "swh.indexer.metadata.DirectoryMetadataIndexer"
+ ".translate_directory_intrinsic_metadata",
return_value=None,
):
indexer.run([origin])
- rev_id = REVISION.id
+ dir_id = DIRECTORY2.id
- rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
- assert rev_results == []
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@@ -252,5 +292,5 @@
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
- result = indexer.index_list(["https://unknown.org/foo"])
+ result = indexer.index_list([Origin("https://unknown.org/foo")])
assert not result
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -19,10 +19,12 @@
Content,
Directory,
DirectoryEntry,
+ ObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
+ Release,
Revision,
RevisionType,
Snapshot,
@@ -39,27 +41,26 @@
}
-ORIGINS = [
- Origin(url="https://github.com/SoftwareHeritage/swh-storage"),
- Origin(url="rsync://ftp.gnu.org/gnu/3dldf"),
- Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"),
- Origin(url="https://pypi.org/project/limnoria/"),
- Origin(url="http://0-512-md.googlecode.com/svn/"),
- Origin(url="https://github.com/librariesio/yarn-parser"),
- Origin(url="https://github.com/librariesio/yarn-parser.git"),
-]
-
-
ORIGIN_VISITS = [
- {"type": "git", "origin": ORIGINS[0].url},
- {"type": "ftp", "origin": ORIGINS[1].url},
- {"type": "deposit", "origin": ORIGINS[2].url},
- {"type": "pypi", "origin": ORIGINS[3].url},
- {"type": "svn", "origin": ORIGINS[4].url},
- {"type": "git", "origin": ORIGINS[5].url},
- {"type": "git", "origin": ORIGINS[6].url},
+ {"type": "git", "origin": "https://github.com/SoftwareHeritage/swh-storage"},
+ {"type": "ftp", "origin": "rsync://ftp.gnu.org/gnu/3dldf"},
+ {
+ "type": "deposit",
+ "origin": "https://forge.softwareheritage.org/source/jesuisgpl/",
+ },
+ {
+ "type": "pypi",
+ "origin": "https://old-pypi.example.org/project/limnoria/",
+ }, # with rev head
+ {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"}, # with rel head
+ {"type": "svn", "origin": "http://0-512-md.googlecode.com/svn/"},
+ {"type": "git", "origin": "https://github.com/librariesio/yarn-parser"},
+ {"type": "git", "origin": "https://github.com/librariesio/yarn-parser.git"},
+ {"type": "git", "origin": "https://npm.example.org/yarn-parser"},
]
+ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS]
+
DIRECTORY = Directory(
id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
@@ -97,6 +98,8 @@
),
)
+_utc_plus_2 = datetime.timezone(datetime.timedelta(minutes=120))
+
REVISION = Revision(
id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"),
message=b"Improve search functionality",
@@ -111,28 +114,12 @@
email=b"andrewnez@gmail.com",
),
committer_date=TimestampWithTimezone.from_datetime(
- datetime.datetime(
- 2013,
- 10,
- 4,
- 12,
- 50,
- 49,
- tzinfo=datetime.timezone(datetime.timedelta(minutes=120)),
- )
+ datetime.datetime(2013, 10, 4, 12, 50, 49, tzinfo=_utc_plus_2)
),
type=RevisionType.GIT,
synthetic=False,
date=TimestampWithTimezone.from_datetime(
- datetime.datetime(
- 2017,
- 2,
- 20,
- 16,
- 14,
- 16,
- tzinfo=datetime.timezone(datetime.timedelta(minutes=120)),
- )
+ datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2)
),
directory=DIRECTORY2.id,
parents=(),
@@ -140,7 +127,26 @@
REVISIONS = [REVISION]
+RELEASE = Release(
+ name=b"v0.0.0",
+ message=None,
+ author=Person(
+ name=b"Andrew Nesbitt",
+ fullname=b"Andrew Nesbitt <andrewnez@gmail.com>",
+ email=b"andrewnez@gmail.com",
+ ),
+ synthetic=False,
+ date=TimestampWithTimezone.from_datetime(
+ datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2)
+ ),
+ target_type=ObjectType.DIRECTORY,
+ target=DIRECTORY2.id,
+)
+
+RELEASES = [RELEASE]
+
SNAPSHOTS = [
+ # https://github.com/SoftwareHeritage/swh-storage
Snapshot(
id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"),
branches={
@@ -161,6 +167,7 @@
),
},
),
+ # rsync://ftp.gnu.org/gnu/3dldf
Snapshot(
id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"),
branches={
@@ -186,6 +193,7 @@
),
},
),
+ # https://forge.softwareheritage.org/source/jesuisgpl/
Snapshot(
id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"),
branches={
@@ -195,6 +203,7 @@
)
},
),
+ # https://old-pypi.example.org/project/limnoria/
Snapshot(
id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"),
branches={
@@ -211,6 +220,23 @@
),
},
),
+ # https://pypi.org/project/limnoria/
+ Snapshot(
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=b"releases/2018.09.09", target_type=TargetType.ALIAS
+ ),
+ b"releases/2018.09.01": SnapshotBranch(
+ target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf",
+ target_type=TargetType.RELEASE,
+ ),
+ b"releases/2018.09.09": SnapshotBranch(
+ target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa
+ target_type=TargetType.RELEASE,
+ ),
+ },
+ ),
+ # http://0-512-md.googlecode.com/svn/
Snapshot(
id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"),
branches={
@@ -220,6 +246,7 @@
)
},
),
+ # https://github.com/librariesio/yarn-parser
Snapshot(
id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
branches={
@@ -229,6 +256,7 @@
)
},
),
+ # https://github.com/librariesio/yarn-parser.git
Snapshot(
id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
branches={
@@ -238,8 +266,19 @@
)
},
),
+ # https://npm.example.org/yarn-parser
+ Snapshot(
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=RELEASE.id,
+ target_type=TargetType.RELEASE,
+ )
+ },
+ ),
]
+assert len(SNAPSHOTS) == len(ORIGIN_VISITS)
+
SHA1_TO_LICENSES = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"],
@@ -582,6 +621,7 @@
storage.origin_add(ORIGINS)
storage.directory_add([DIRECTORY, DIRECTORY2])
storage.revision_add(REVISIONS)
+ storage.release_add(RELEASES)
storage.snapshot_add(SNAPSHOTS)
for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):