diff --git a/PKG-INFO b/PKG-INFO
index ee4435e..2c3b3bd 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,71 @@
Metadata-Version: 2.1
Name: swh.indexer
-Version: 2.5.0
+Version: 2.6.0
Summary: Software Heritage Content Indexer
Home-page: https://forge.softwareheritage.org/diffusion/78/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
License-File: LICENSE
License-File: AUTHORS
swh-indexer
============
Tools to compute multiple indexes on SWH's raw contents:
- content:
- mimetype
- ctags
- language
- fossology-license
- metadata
- revision:
- metadata
An indexer is in charge of:
- looking up objects
- extracting information from those objects
- storing that information in the swh-indexer db
There are multiple indexers working on different object types:
- content indexer: works with content sha1 hashes
- revision indexer: works with revision sha1 hashes
- origin indexer: works with origin identifiers
Indexation procedure:
- receive batch of ids
- retrieve the associated data depending on object type
- compute some index for that object
- store the result in swh's storage
Current content indexers:
- mimetype (queue swh_indexer_content_mimetype): detect the encoding
and mimetype
- language (queue swh_indexer_content_language): detect the
programming language
- ctags (queue swh_indexer_content_ctags): compute tags information
- fossology-license (queue swh_indexer_fossology_license): compute the
license
- metadata: translates a file into a translated_metadata dict
Current revision indexers:
- metadata: detects files containing metadata and retrieves their
translated_metadata from the content_metadata table in storage, or runs the
content indexer to translate the files.
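
The indexation procedure described above (receive a batch of ids, retrieve the associated data, compute an index for each object, store the result) can be summarized by a minimal sketch. The callables `fetch_data`, `compute_index` and `store_results` are hypothetical stand-ins for the real swh.storage / swh.indexer.storage calls:

    # Illustrative sketch only; the three callables are hypothetical stand-ins
    # for the storage and indexer-storage APIs.
    from typing import Callable, Iterable, List

    def index_batch(
        ids: Iterable[bytes],
        fetch_data: Callable[[bytes], bytes],
        compute_index: Callable[[bytes, bytes], dict],
        store_results: Callable[[List[dict]], None],
    ) -> List[dict]:
        # retrieve the data for each id, compute its index, then persist everything
        results = [compute_index(i, fetch_data(i)) for i in ids]
        store_results(results)
        return results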
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index ee4435e..2c3b3bd 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,71 +1,71 @@
Metadata-Version: 2.1
Name: swh.indexer
-Version: 2.5.0
+Version: 2.6.0
Summary: Software Heritage Content Indexer
Home-page: https://forge.softwareheritage.org/diffusion/78/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
License-File: LICENSE
License-File: AUTHORS
swh-indexer
============
Tools to compute multiple indexes on SWH's raw contents:
- content:
- mimetype
- ctags
- language
- fossology-license
- metadata
- revision:
- metadata
An indexer is in charge of:
- looking up objects
- extracting information from those objects
- storing that information in the swh-indexer db
There are multiple indexers working on different object types:
- content indexer: works with content sha1 hashes
- revision indexer: works with revision sha1 hashes
- origin indexer: works with origin identifiers
Indexation procedure:
- receive batch of ids
- retrieve the associated data depending on object type
- compute some index for that object
- store the result in swh's storage
Current content indexers:
- mimetype (queue swh_indexer_content_mimetype): detect the encoding
and mimetype
- language (queue swh_indexer_content_language): detect the
programming language
- ctags (queue swh_indexer_content_ctags): compute tags information
- fossology-license (queue swh_indexer_fossology_license): compute the
license
- metadata: translates a file into a translated_metadata dict
Current revision indexers:
- metadata: detects files containing metadata and retrieves their
translated_metadata from the content_metadata table in storage, or runs the
content indexer to translate the files.
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
index 2c7318f..939b4b1 100644
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -1,407 +1,408 @@
# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Callable, Dict, Iterator, List, Optional
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
import click
from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup
from swh.core.cli import swh as swh_cli_group
@swh_cli_group.group(
name="indexer", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup
)
@click.option(
"--config-file",
"-C",
default=None,
type=click.Path(
exists=True,
dir_okay=False,
),
help="Configuration file.",
)
@click.pass_context
def indexer_cli_group(ctx, config_file):
"""Software Heritage Indexer tools.
The Indexer is used to mine the content of the archive and extract derived
information from archive source code artifacts.
"""
from swh.core import config
ctx.ensure_object(dict)
conf = config.read(config_file)
ctx.obj["config"] = conf
def _get_api(getter, config, config_key, url):
if url:
config[config_key] = {"cls": "remote", "url": url}
elif config_key not in config:
raise click.ClickException("Missing configuration for {}".format(config_key))
return getter(**config[config_key])
@indexer_cli_group.group("mapping")
def mapping():
"""Manage Software Heritage Indexer mappings."""
pass
@mapping.command("list")
def mapping_list():
"""Prints the list of known mappings."""
from swh.indexer import metadata_dictionary
mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()]
mapping_names.sort()
for mapping_name in mapping_names:
click.echo(mapping_name)
@mapping.command("list-terms")
@click.option(
"--exclude-mapping", multiple=True, help="Exclude the given mapping from the output"
)
@click.option(
"--concise",
is_flag=True,
default=False,
help="Don't print the list of mappings supporting each term.",
)
def mapping_list_terms(concise, exclude_mapping):
"""Prints the list of known CodeMeta terms, and which mappings
support them."""
from swh.indexer import metadata_dictionary
properties = metadata_dictionary.list_terms()
for (property_name, supported_mappings) in sorted(properties.items()):
supported_mappings = {m.name for m in supported_mappings}
supported_mappings -= set(exclude_mapping)
if supported_mappings:
if concise:
click.echo(property_name)
else:
click.echo("{}:".format(property_name))
click.echo("\t" + ", ".join(sorted(supported_mappings)))
@mapping.command("translate")
@click.argument("mapping-name")
@click.argument("file", type=click.File("rb"))
def mapping_translate(mapping_name, file):
"""Translates file from mapping-name to codemeta format."""
import json
from swh.indexer import metadata_dictionary
mapping_cls = [
cls for cls in metadata_dictionary.MAPPINGS.values() if cls.name == mapping_name
]
if not mapping_cls:
raise click.ClickException("Unknown mapping {}".format(mapping_name))
assert len(mapping_cls) == 1
mapping_cls = mapping_cls[0]
mapping = mapping_cls()
codemeta_doc = mapping.translate(file.read())
click.echo(json.dumps(codemeta_doc, indent=4))
@indexer_cli_group.group("schedule")
@click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API")
@click.option(
"--indexer-storage-url", "-i", default=None, help="URL of the indexer storage API"
)
@click.option(
"--storage-url", "-g", default=None, help="URL of the (graph) storage API"
)
@click.option(
"--dry-run/--no-dry-run",
is_flag=True,
default=False,
help="List only what would be scheduled.",
)
@click.pass_context
def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run):
"""Manipulate Software Heritage Indexer tasks.
Via SWH Scheduler's API."""
from swh.indexer.storage import get_indexer_storage
from swh.scheduler import get_scheduler
from swh.storage import get_storage
ctx.obj["indexer_storage"] = _get_api(
get_indexer_storage, ctx.obj["config"], "indexer_storage", indexer_storage_url
)
ctx.obj["storage"] = _get_api(
get_storage, ctx.obj["config"], "storage", storage_url
)
ctx.obj["scheduler"] = _get_api(
get_scheduler, ctx.obj["config"], "scheduler", scheduler_url
)
if dry_run:
ctx.obj["scheduler"] = None
def list_origins_by_producer(idx_storage, mappings, tool_ids) -> Iterator[str]:
next_page_token = ""
limit = 10000
while next_page_token is not None:
result = idx_storage.origin_intrinsic_metadata_search_by_producer(
page_token=next_page_token,
limit=limit,
ids_only=True,
mappings=mappings or None,
tool_ids=tool_ids or None,
)
next_page_token = result.next_page_token
yield from result.results
@schedule.command("reindex_origin_metadata")
@click.option(
"--batch-size",
"-b",
"origin_batch_size",
default=10,
show_default=True,
type=int,
help="Number of origins per task",
)
@click.option(
"--tool-id",
"-t",
"tool_ids",
type=int,
multiple=True,
help="Restrict search of old metadata to this/these tool ids.",
)
@click.option(
"--mapping",
"-m",
"mappings",
multiple=True,
help="Mapping(s) that should be re-scheduled (eg. 'npm', 'gemspec', 'maven')",
)
@click.option(
"--task-type",
default="index-origin-metadata",
show_default=True,
help="Name of the task type to schedule.",
)
@click.pass_context
def schedule_origin_metadata_reindex(
ctx, origin_batch_size, tool_ids, mappings, task_type
):
"""Schedules indexing tasks for origins that were already indexed."""
from swh.scheduler.cli_utils import schedule_origin_batches
idx_storage = ctx.obj["indexer_storage"]
scheduler = ctx.obj["scheduler"]
origins = list_origins_by_producer(idx_storage, mappings, tool_ids)
kwargs = {"retries_left": 1}
schedule_origin_batches(scheduler, task_type, origins, origin_batch_size, kwargs)
@indexer_cli_group.command("journal-client")
@click.argument(
"indexer",
type=click.Choice(
[
"origin_intrinsic_metadata",
"extrinsic_metadata",
"content_mimetype",
"content_fossology_license",
"*",
]
),
required=False
# TODO: remove required=False after we stop using it
)
@click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API")
@click.option(
"--origin-metadata-task-type",
default="index-origin-metadata",
help="Name of the task running the origin metadata indexer.",
)
@click.option(
"--broker", "brokers", type=str, multiple=True, help="Kafka broker to connect to."
)
@click.option(
"--prefix", type=str, default=None, help="Prefix of Kafka topic names to read from."
)
@click.option("--group-id", type=str, help="Consumer/group id for reading from Kafka.")
@click.option(
"--stop-after-objects",
"-m",
default=None,
type=int,
help="Maximum number of objects to replay. Default is to run forever.",
)
@click.option(
"--batch-size",
"-b",
default=None,
type=int,
help="Batch size. Default is 200.",
)
@click.pass_context
def journal_client(
ctx,
indexer: Optional[str],
scheduler_url: str,
origin_metadata_task_type: str,
brokers: List[str],
prefix: str,
group_id: str,
stop_after_objects: Optional[int],
batch_size: Optional[int],
):
"""
Listens for new objects from the SWH Journal, and either:
* runs the indexer with the name passed as argument, if any
* schedules tasks to run relevant indexers (currently, only
origin_intrinsic_metadata) on these new objects otherwise.
Passing '*' as indexer name runs all indexers.
"""
import functools
import warnings
from swh.indexer.indexer import BaseIndexer, ObjectsDict
from swh.indexer.journal_client import process_journal_objects
from swh.journal.client import get_journal_client
from swh.scheduler import get_scheduler
cfg = ctx.obj["config"]
journal_cfg = cfg.get("journal", {})
scheduler = _get_api(get_scheduler, cfg, "scheduler", scheduler_url)
- brokers = brokers or journal_cfg.get("brokers")
- if not brokers:
+ if brokers:
+ journal_cfg["brokers"] = brokers
+ if not journal_cfg.get("brokers"):
raise ValueError("The brokers configuration is mandatory.")
- prefix = prefix or journal_cfg.get("prefix")
- group_id = group_id or journal_cfg.get("group_id")
+ if prefix:
+ journal_cfg["prefix"] = prefix
+ if group_id:
+ journal_cfg["group_id"] = group_id
origin_metadata_task_type = origin_metadata_task_type or journal_cfg.get(
"origin_metadata_task_type"
)
- stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects")
- batch_size = batch_size or journal_cfg.get("batch_size", 200)
+ if stop_after_objects:
+ journal_cfg["stop_after_objects"] = stop_after_objects
+ if batch_size:
+ journal_cfg["batch_size"] = batch_size
object_types = set()
worker_fns: List[Callable[[ObjectsDict], Dict]] = []
if indexer is None:
warnings.warn(
"'swh indexer journal-client' with no argument creates scheduler tasks "
"to index, rather than index directly.",
DeprecationWarning,
)
object_types.add("origin_visit_status")
worker_fns.append(
functools.partial(
process_journal_objects,
scheduler=scheduler,
task_names={
"origin_metadata": origin_metadata_task_type,
},
)
)
idx: Optional[BaseIndexer] = None
if indexer in ("origin_intrinsic_metadata", "*"):
from swh.indexer.metadata import OriginMetadataIndexer
object_types.add("origin_visit_status")
idx = OriginMetadataIndexer()
idx.catch_exceptions = False # don't commit offsets if indexation failed
worker_fns.append(idx.process_journal_objects)
if indexer in ("extrinsic_metadata", "*"):
from swh.indexer.metadata import ExtrinsicMetadataIndexer
object_types.add("raw_extrinsic_metadata")
idx = ExtrinsicMetadataIndexer()
idx.catch_exceptions = False # don't commit offsets if indexation failed
worker_fns.append(idx.process_journal_objects)
if indexer in ("content_mimetype", "*"):
from swh.indexer.mimetype import MimetypeIndexer
object_types.add("content")
idx = MimetypeIndexer()
idx.catch_exceptions = False # don't commit offsets if indexation failed
worker_fns.append(idx.process_journal_objects)
if indexer in ("content_fossology_license", "*"):
from swh.indexer.fossology_license import FossologyLicenseIndexer
object_types.add("content")
idx = FossologyLicenseIndexer()
idx.catch_exceptions = False # don't commit offsets if indexation failed
worker_fns.append(idx.process_journal_objects)
if not worker_fns:
raise click.ClickException(f"Unknown indexer: {indexer}")
client = get_journal_client(
cls="kafka",
- brokers=brokers,
- prefix=prefix,
- group_id=group_id,
object_types=list(object_types),
- stop_after_objects=stop_after_objects,
- batch_size=batch_size,
+ **journal_cfg,
)
def worker_fn(objects: ObjectsDict):
for fn in worker_fns:
fn(objects)
try:
client.process(worker_fn)
except KeyboardInterrupt:
ctx.exit(0)
else:
print("Done.")
finally:
client.close()
@indexer_cli_group.command("rpc-serve")
@click.argument("config-path", required=True)
@click.option("--host", default="0.0.0.0", help="Host to run the server")
@click.option("--port", default=5007, type=click.INT, help="Binding port of the server")
@click.option(
"--debug/--nodebug",
default=True,
help="Indicates if the server should run in debug mode",
)
def rpc_server(config_path, host, port, debug):
"""Starts a Software Heritage Indexer RPC HTTP server."""
from swh.indexer.storage.api.server import app, load_and_check_config
api_cfg = load_and_check_config(config_path, type="any")
app.config.update(api_cfg)
app.run(host, port=int(port), debug=bool(debug))
def main():
return indexer_cli_group(auto_envvar_prefix="SWH_INDEXER")
if __name__ == "__main__":
main()
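
The journal-client refactor above stops copying CLI options back into individual keyword arguments: each option, when given, overrides the matching key of the `journal` configuration section, and the merged dict is then expanded into `get_journal_client(cls="kafka", ..., **journal_cfg)`. A small standalone sketch of that precedence, with made-up values:

    # Hypothetical config and CLI values, mirroring the override logic above.
    journal_cfg = {"brokers": ["kafka1:9092"], "group_id": "swh.indexer", "batch_size": 200}
    cli_brokers = ["kafka2:9092"]  # e.g. --broker kafka2:9092
    cli_batch_size = None          # option not passed on the command line

    if cli_brokers:
        journal_cfg["brokers"] = cli_brokers
    if cli_batch_size:
        journal_cfg["batch_size"] = cli_batch_size
    if not journal_cfg.get("brokers"):
        raise ValueError("The brokers configuration is mandatory.")

    print(journal_cfg)
    # {'brokers': ['kafka2:9092'], 'group_id': 'swh.indexer', 'batch_size': 200}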
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 566ab98..d9b3eb3 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,545 +1,561 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from copy import deepcopy
+import itertools
+import logging
+import time
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Tuple,
TypeVar,
cast,
)
from urllib.parse import urlparse
import sentry_sdk
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
from swh.indexer.indexer import (
BaseIndexer,
ContentIndexer,
DirectoryIndexer,
ObjectsDict,
OriginIndexer,
)
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
from swh.indexer.origin_head import get_head_swhid
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
from swh.indexer.storage.model import (
ContentMetadataRow,
DirectoryIntrinsicMetadataRow,
OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from swh.model import hashutil
from swh.model.model import Directory, MetadataAuthorityType
from swh.model.model import ObjectType as ModelObjectType
from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git
from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType
REVISION_GET_BATCH_SIZE = 10
RELEASE_GET_BATCH_SIZE = 10
ORIGIN_GET_BATCH_SIZE = 10
T1 = TypeVar("T1")
T2 = TypeVar("T2")
+logger = logging.getLogger(__name__)
+
def call_with_batches(
f: Callable[[List[T1]], Iterable[T2]],
args: List[T1],
batch_size: int,
) -> Iterator[T2]:
"""Calls a function with batches of args, and concatenates the results."""
groups = grouper(args, batch_size)
for group in groups:
yield from f(list(group))
class ExtrinsicMetadataIndexer(
BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow]
):
def process_journal_objects(self, objects: ObjectsDict) -> Dict:
summary: Dict[str, Any] = {"status": "uneventful"}
try:
- results = []
+ results = {}
for item in objects.get("raw_extrinsic_metadata", []):
remd = RawExtrinsicMetadata.from_dict(item)
- sentry_sdk.set_tag("swh-indexer-remd-swhid", remd.swhid())
- results.extend(self.index(remd.id, data=remd))
+ sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid()))
+ results[remd.target] = self.index(remd.id, data=remd)
except Exception:
if not self.catch_exceptions:
raise
summary["status"] = "failed"
return summary
- summary_persist = self.persist_index_computations(results)
- self.results = results
+ self.results = list(itertools.chain.from_iterable(results.values()))
+ summary_persist = self.persist_index_computations(self.results)
if summary_persist:
for value in summary_persist.values():
if value > 0:
summary["status"] = "eventful"
summary.update(summary_persist)
return summary
def index(
self,
id: Sha1Git,
data: Optional[RawExtrinsicMetadata],
**kwargs,
) -> List[OriginExtrinsicMetadataRow]:
if data is None:
raise NotImplementedError(
"ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data"
)
if data.target.object_type != ExtendedObjectType.ORIGIN:
# other types are not supported yet
return []
if data.authority.type != MetadataAuthorityType.FORGE:
# metadata provided by a third-party; don't trust it
# (technically this could be handled below, but we check it here
# to return early; sparing a translation and origin lookup)
# TODO: add ways to define trusted authorities
return []
metadata_items = []
mappings: List[str] = []
for mapping_cls in EXTRINSIC_MAPPINGS.values():
if data.format in mapping_cls.extrinsic_metadata_formats():
mapping = mapping_cls()
metadata_item = mapping.translate(data.metadata)
if metadata_item is not None:
metadata_items.append(metadata_item)
mappings.append(mapping.name)
if not metadata_items:
# Don't have any mapping to parse it, ignore
return []
# TODO: batch requests to origin_get_by_sha1()
- origins = self.storage.origin_get_by_sha1([data.target.object_id])
- try:
- (origin,) = origins
- if origin is None:
- raise ValueError()
- except ValueError:
+ for _ in range(6):
+ origins = self.storage.origin_get_by_sha1([data.target.object_id])
+ try:
+ (origin,) = origins
+ if origin is not None:
+ break
+ except ValueError:
+ pass
+ # The origin does not exist. This may be due to some replication lag
+ # between the loader's DB/journal and the DB we are consuming from.
+ # Wait a bit and try again
+ logger.debug("Origin %s not found, sleeping for 10s.", data.target)
+ time.sleep(10)
+ else:
+ # Does not exist, or replication lag > 60s.
raise ValueError(f"Unknown origin {data.target}") from None
if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc:
# metadata provided by a third-party; don't trust it
# TODO: add ways to define trusted authorities
return []
metadata = merge_documents(metadata_items)
return [
OriginExtrinsicMetadataRow(
id=origin["url"],
indexer_configuration_id=self.tool["id"],
from_remd_id=data.id,
mappings=mappings,
metadata=metadata,
)
]
def persist_index_computations(
self, results: List[OriginExtrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage."""
return self.idx_storage.origin_extrinsic_metadata_add(results)
class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing metadata by given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- store result in content_metadata table
"""
def filter(self, ids):
"""Filter out known sha1s and return only missing ones."""
yield from self.idx_storage.content_metadata_missing(
(
{
"id": sha1,
"indexer_configuration_id": self.tool["id"],
}
for sha1 in ids
)
)
def index(
self,
id: Sha1,
data: Optional[bytes] = None,
log_suffix="unknown directory",
**kwargs,
) -> List[ContentMetadataRow]:
"""Index sha1s' content and store result.
Args:
id: content's identifier
data: raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful the metadata keys will
be returned as None
"""
assert isinstance(id, bytes)
assert data is not None
metadata = None
try:
mapping_name = self.tool["tool_configuration"]["context"]
log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id)
metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
"for content %s" % hashutil.hash_to_hex(id)
)
sentry_sdk.capture_exception()
if metadata is None:
return []
return [
ContentMetadataRow(
id=id,
indexer_configuration_id=self.tool["id"],
metadata=metadata,
)
]
def persist_index_computations(
self, results: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage."""
return self.idx_storage.content_metadata_add(results)
DEFAULT_CONFIG: Dict[str, Any] = {
"tools": {
"name": "swh-metadata-detector",
"version": "0.0.2",
"configuration": {},
},
}
class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
"""Directory-level indexer
This indexer is in charge of:
- filtering directories already indexed in directory_intrinsic_metadata table
with defined computation tool
- retrieve all entry_files in directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- store the results for directory
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.config = merge_configs(DEFAULT_CONFIG, self.config)
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones."""
yield from self.idx_storage.directory_intrinsic_metadata_missing(
(
{
"id": sha1_git,
"indexer_configuration_id": self.tool["id"],
}
for sha1_git in sha1_gits
)
)
def index(
self, id: Sha1Git, data: Optional[Directory] = None, **kwargs
) -> List[DirectoryIntrinsicMetadataRow]:
"""Index directory by processing it and organizing result.
Uses metadata_detector to iterate on filenames, passes them to the content
indexers, then merges the results (if there is more than one).
Args:
id: sha1_git of the directory
data: should always be None
Returns:
dict: dictionary representing a directory_intrinsic_metadata, with
keys:
- id: directory's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata
"""
dir_: List[DirectoryLsEntry]
assert data is None, "Unexpected directory object"
dir_ = cast(
List[DirectoryLsEntry],
list(self.storage.directory_ls(id, recursive=False)),
)
try:
if [entry["type"] for entry in dir_] == ["dir"]:
# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs
subdir = dir_[0]["target"]
dir_ = cast(
List[DirectoryLsEntry],
list(self.storage.directory_ls(subdir, recursive=False)),
)
files = [entry for entry in dir_ if entry["type"] == "file"]
(mappings, metadata) = self.translate_directory_intrinsic_metadata(
files,
log_suffix="directory=%s" % hashutil.hash_to_hex(id),
)
except Exception as e:
self.log.exception("Problem when indexing dir: %r", e)
sentry_sdk.capture_exception()
return []
return [
DirectoryIntrinsicMetadataRow(
id=id,
indexer_configuration_id=self.tool["id"],
mappings=mappings,
metadata=metadata,
)
]
def persist_index_computations(
self, results: List[DirectoryIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage."""
# TODO: add functions in storage to keep data in
# directory_intrinsic_metadata
return self.idx_storage.directory_intrinsic_metadata_add(results)
def translate_directory_intrinsic_metadata(
self, files: List[DirectoryLsEntry], log_suffix: str
) -> Tuple[List[Any], Any]:
"""
Determine plan of action to translate metadata in the given root directory
Args:
files: list of file entries, as returned by
:meth:`swh.storage.interface.StorageInterface.directory_ls`
Returns:
(List[str], dict): list of mappings used and dict with
translated metadata according to the CodeMeta vocabulary
"""
metadata = []
tool = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]}
config["tools"] = [tool]
all_detected_files = detect_metadata(files)
used_mappings = [
INTRINSIC_MAPPINGS[context].name for context in all_detected_files
]
for (mapping_name, detected_files) in all_detected_files.items():
cfg = deepcopy(config)
cfg["tools"][0]["configuration"]["context"] = mapping_name
c_metadata_indexer = ContentMetadataIndexer(config=cfg)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(detected_files)
for c in metadata_generator:
# extracting metadata
sha1 = c.id
sha1s_in_storage.append(sha1)
local_metadata = c.metadata
# local metadata is aggregated
if local_metadata:
metadata.append(local_metadata)
sha1s_filtered = [
item for item in detected_files if item not in sha1s_in_storage
]
if sha1s_filtered:
# content indexing
try:
c_metadata_indexer.run(
sha1s_filtered,
log_suffix=log_suffix,
)
# on the fly possibility:
for result in c_metadata_indexer.results:
local_metadata = result.metadata
metadata.append(local_metadata)
except Exception:
self.log.exception("Exception while indexing metadata on contents")
sentry_sdk.capture_exception()
metadata = merge_documents(metadata)
return (used_mappings, metadata)
class OriginMetadataIndexer(
OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]
):
USE_TOOLS = False
def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config)
def index_list(
self,
origins: List[Origin],
*,
check_origin_known: bool = True,
**kwargs,
) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]:
head_rev_ids = []
head_rel_ids = []
origin_heads: Dict[Origin, CoreSWHID] = {}
# Filter out origins not in the storage
if check_origin_known:
known_origins = list(
call_with_batches(
self.storage.origin_get,
[origin.url for origin in origins],
ORIGIN_GET_BATCH_SIZE,
)
)
else:
known_origins = list(origins)
for origin in known_origins:
if origin is None:
continue
head_swhid = get_head_swhid(self.storage, origin.url)
if head_swhid:
origin_heads[origin] = head_swhid
if head_swhid.object_type == ObjectType.REVISION:
head_rev_ids.append(head_swhid.object_id)
elif head_swhid.object_type == ObjectType.RELEASE:
head_rel_ids.append(head_swhid.object_id)
else:
assert False, head_swhid
head_revs = dict(
zip(
head_rev_ids,
call_with_batches(
self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
),
)
)
head_rels = dict(
zip(
head_rel_ids,
call_with_batches(
self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE
),
)
)
results = []
for (origin, head_swhid) in origin_heads.items():
sentry_sdk.set_tag("swh-indexer-origin-url", origin.url)
sentry_sdk.set_tag("swh-indexer-origin-head-swhid", str(head_swhid))
if head_swhid.object_type == ObjectType.REVISION:
rev = head_revs[head_swhid.object_id]
if not rev:
self.log.warning(
"Missing head object %s of origin %r", head_swhid, origin.url
)
continue
directory_id = rev.directory
elif head_swhid.object_type == ObjectType.RELEASE:
rel = head_rels[head_swhid.object_id]
if not rel:
self.log.warning(
"Missing head object %s of origin %r", head_swhid, origin.url
)
continue
if rel.target_type != ModelObjectType.DIRECTORY:
# TODO
self.log.warning(
"Head release %s of %r has unexpected target type %s",
head_swhid,
origin.url,
rel.target_type,
)
continue
assert rel.target, rel
directory_id = rel.target
else:
assert False, head_swhid
for dir_metadata in self.directory_metadata_indexer.index(directory_id):
# There is at most one dir_metadata
orig_metadata = OriginIntrinsicMetadataRow(
from_directory=dir_metadata.id,
id=origin.url,
metadata=dir_metadata.metadata,
mappings=dir_metadata.mappings,
indexer_configuration_id=dir_metadata.indexer_configuration_id,
)
results.append((orig_metadata, dir_metadata))
return results
def persist_index_computations(
self,
results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
) -> Dict[str, int]:
# Deduplicate directories
- dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
- orig_metadata: List[OriginIntrinsicMetadataRow] = []
+ dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {}
+ orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {}
summary: Dict = {}
for (orig_item, dir_item) in results:
assert dir_item.metadata == orig_item.metadata
if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
- if dir_item not in dir_metadata:
- dir_metadata.append(dir_item)
- if orig_item not in orig_metadata:
- orig_metadata.append(orig_item)
+ if dir_item.id not in dir_metadata:
+ dir_metadata[dir_item.id] = dir_item
+ if orig_item.id not in orig_metadata:
+ orig_metadata[orig_item.id] = orig_item
if dir_metadata:
summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
- dir_metadata
+ list(dir_metadata.values())
)
summary.update(summary_dir)
if orig_metadata:
- summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
+ summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
+ list(orig_metadata.values())
+ )
summary.update(summary_ori)
return summary
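
The change to persist_index_computations above deduplicates by row id rather than by full-row equality, so two rows describing the same directory (or origin) collapse to the first one seen even if their metadata differ slightly. A standalone sketch of that behaviour, with plain dicts standing in for the row classes:

    # Plain dicts stand in for DirectoryIntrinsicMetadataRow; ids and metadata are made up.
    rows = [
        {"id": b"d1", "metadata": {"name": "foo"}},
        {"id": b"d1", "metadata": {"name": "foo", "version": "1.0"}},  # same directory id
        {"id": b"d2", "metadata": {"name": "bar"}},
    ]
    deduplicated = {}
    for row in rows:
        if row["id"] not in deduplicated:  # keep the first row seen for each id
            deduplicated[row["id"]] = row
    print(len(deduplicated))  # 2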
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
index 418c2ec..f6253d7 100644
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -1,348 +1,371 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
+import urllib.parse
import uuid
import xml.parsers.expat
from pyld import jsonld
import rdflib
from typing_extensions import TypedDict
import xmltodict
import yaml
from swh.indexer.codemeta import _document_loader, compact
from swh.indexer.namespaces import RDF, SCHEMA
from swh.indexer.storage.interface import Sha1
class DirectoryLsEntry(TypedDict):
target: Sha1
sha1: Sha1
name: bytes
type: str
TTranslateCallable = TypeVar(
"TTranslateCallable",
bound=Callable[[Any, rdflib.Graph, rdflib.term.BNode, Any], None],
)
def produce_terms(*uris: str) -> Callable[[TTranslateCallable], TTranslateCallable]:
"""Returns a decorator that marks the decorated function as adding
the given terms to the ``translated_metadata`` dict"""
def decorator(f: TTranslateCallable) -> TTranslateCallable:
if not hasattr(f, "produced_terms"):
f.produced_terms = [] # type: ignore
f.produced_terms.extend(uris) # type: ignore
return f
return decorator
class BaseMapping:
"""Base class for :class:`BaseExtrinsicMapping` and :class:`BaseIntrinsicMapping`,
not to be inherited directly."""
def __init__(self, log_suffix=""):
self.log_suffix = log_suffix
self.log = logging.getLogger(
"%s.%s" % (self.__class__.__module__, self.__class__.__name__)
)
@property
def name(self):
"""A name of this mapping, used as an identifier in the
indexer storage."""
raise NotImplementedError(f"{self.__class__.__name__}.name")
def translate(self, raw_content: bytes) -> Optional[Dict]:
"""
Translates content by parsing content from a bytestring containing
mapping-specific data and translating with the appropriate mapping
to JSON-LD using the Codemeta and ForgeFed vocabularies.
Args:
raw_content: raw content to translate
Returns:
translated metadata in JSON friendly form needed for the content
if parseable, :const:`None` otherwise.
"""
raise NotImplementedError(f"{self.__class__.__name__}.translate")
def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
raise NotImplementedError(f"{self.__class__.__name__}.normalize_translation")
class BaseExtrinsicMapping(BaseMapping):
"""Base class for extrinsic_metadata mappings to inherit from
To implement a new mapping:
- inherit this class
- override translate function
"""
@classmethod
def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
"""
Returns the list of extrinsic metadata formats which can be translated
by this mapping
"""
raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats")
def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
return compact(metadata, forgefed=True)
class BaseIntrinsicMapping(BaseMapping):
"""Base class for intrinsic-metadata mappings to inherit from
To implement a new mapping:
- inherit this class
- override translate function
"""
@classmethod
def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
"""
Returns the sha1 hashes of files which can be translated by this mapping
"""
raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
return compact(metadata, forgefed=False)
class SingleFileIntrinsicMapping(BaseIntrinsicMapping):
"""Base class for all intrinsic metadata mappings that use a single file as input."""
@property
def filename(self):
"""The .json file to extract metadata from."""
raise NotImplementedError(f"{self.__class__.__name__}.filename")
@classmethod
def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
for entry in file_entries:
if entry["name"].lower() == cls.filename:
return [entry["sha1"]]
return []
class DictMapping(BaseMapping):
"""Base class for mappings that take as input a file that is mostly
a key-value store (eg. a shallow JSON dict)."""
string_fields: List[str] = []
"""List of fields that are simple strings, and don't need any
normalization."""
uri_fields: List[str] = []
"""List of fields that are simple URIs, and don't need any
normalization."""
@property
def mapping(self):
"""A translation dict to map dict keys into a canonical name."""
raise NotImplementedError(f"{self.__class__.__name__}.mapping")
@staticmethod
def _normalize_method_name(name: str) -> str:
return name.replace("-", "_")
@classmethod
def supported_terms(cls):
# one-to-one mapping from the original key to a CodeMeta term
simple_terms = {
str(term)
for (key, term) in cls.mapping.items()
if key in cls.string_fields + cls.uri_fields
or hasattr(cls, "normalize_" + cls._normalize_method_name(key))
}
# more complex mapping from the original key to JSON-LD
complex_terms = {
str(term)
for meth_name in dir(cls)
if meth_name.startswith("translate_")
for term in getattr(getattr(cls, meth_name), "produced_terms", [])
}
return simple_terms | complex_terms
def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]:
"""
Translates content by parsing content from a dict object
and translating with the appropriate mapping
Args:
content_dict (dict): content dict to translate
Returns:
dict: translated metadata in json-friendly form needed for
the indexer
"""
graph = rdflib.Graph()
# The main object being described (the SoftwareSourceCode) may or may not
# have an id.
# Either way, we temporarily use this URI to identify it. Unfortunately,
# we cannot use a blank node as we need to use it for JSON-LD framing later,
# and blank nodes cannot be used for framing in JSON-LD >= 1.1
root_id = (
"https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"
+ str(uuid.uuid4())
)
root = rdflib.URIRef(root_id)
graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode))
for k, v in content_dict.items():
# First, check if there is a specific translation
# method for this key
translation_method = getattr(
self, "translate_" + self._normalize_method_name(k), None
)
if translation_method:
translation_method(graph, root, v)
elif k in self.mapping:
# if there is no method, but the key is known from the
# crosswalk table
codemeta_key = self.mapping[k]
# if there is a normalization method, use it on the value,
# and add its results to the triples
normalization_method = getattr(
self, "normalize_" + self._normalize_method_name(k), None
)
if normalization_method:
v = normalization_method(v)
if v is None:
pass
elif isinstance(v, list):
for item in reversed(v):
graph.add((root, codemeta_key, item))
else:
graph.add((root, codemeta_key, v))
elif k in self.string_fields and isinstance(v, str):
graph.add((root, codemeta_key, rdflib.Literal(v)))
elif k in self.string_fields and isinstance(v, list):
for item in v:
graph.add((root, codemeta_key, rdflib.Literal(item)))
elif k in self.uri_fields and isinstance(v, str):
- graph.add((root, codemeta_key, rdflib.URIRef(v)))
+ # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
+ # URLs that are blatantly invalid early, so PyLD does not crash.
+ parsed_url = urllib.parse.urlparse(v)
+ if parsed_url.netloc:
+ graph.add((root, codemeta_key, rdflib.URIRef(v)))
elif k in self.uri_fields and isinstance(v, list):
for item in v:
if isinstance(item, str):
- graph.add((root, codemeta_key, rdflib.URIRef(item)))
+ # ditto
+ parsed_url = urllib.parse.urlparse(item)
+ if parsed_url.netloc:
+ graph.add((root, codemeta_key, rdflib.URIRef(item)))
else:
continue
self.extra_translation(graph, root, content_dict)
+ self.sanitize(graph)
+
# Convert from rdflib's internal graph representation to JSON
s = graph.serialize(format="application/ld+json")
# Load from JSON to a list of Python objects
jsonld_graph = json.loads(s)
# Use JSON-LD framing to turn the graph into a rooted tree
# frame = {"@type": str(SCHEMA.SoftwareSourceCode)}
translated_metadata = jsonld.frame(
jsonld_graph,
{"@id": root_id},
options={
"documentLoader": _document_loader,
"processingMode": "json-ld-1.1",
},
)
# Remove the temporary id we added at the beginning
if isinstance(translated_metadata["@id"], list):
translated_metadata["@id"].remove(root_id)
else:
del translated_metadata["@id"]
return self.normalize_translation(translated_metadata)
+ def sanitize(self, graph: rdflib.Graph) -> None:
+ # Remove triples that make PyLD crash
+ for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))):
+ graph.remove((subject, predicate, rdflib.URIRef("")))
+
+ # Should not happen, but we'd better check, as this may lead to incorrect data
+ invalid = False
+ for triple in graph.triples((rdflib.URIRef(""), None, None)):
+ invalid = True
+ logging.error("Empty triple subject URI: %r", triple)
+ if invalid:
+ raise ValueError("Empty triple subject(s)")
+
def extra_translation(
self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any]
- ):
+ ) -> None:
"""Called at the end of the translation process, and may add arbitrary triples
to ``graph`` based on the input dictionary (passed as ``d``).
"""
pass
class JsonMapping(DictMapping):
"""Base class for all mappings that use JSON data as input."""
def translate(self, raw_content: bytes) -> Optional[Dict]:
try:
raw_content_string: str = raw_content.decode()
except UnicodeDecodeError:
self.log.warning("Error unidecoding from %s", self.log_suffix)
return None
try:
content_dict = json.loads(raw_content_string)
except json.JSONDecodeError:
self.log.warning("Error unjsoning from %s", self.log_suffix)
return None
if isinstance(content_dict, dict):
return self._translate_dict(content_dict)
return None
class XmlMapping(DictMapping):
"""Base class for all mappings that use XML data as input."""
def translate(self, raw_content: bytes) -> Optional[Dict]:
try:
d = xmltodict.parse(raw_content)
except xml.parsers.expat.ExpatError:
self.log.warning("Error parsing XML from %s", self.log_suffix)
return None
except UnicodeDecodeError:
self.log.warning("Error unidecoding XML from %s", self.log_suffix)
return None
except (LookupError, ValueError):
# unknown encoding or multi-byte encoding
self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
return None
if not isinstance(d, dict):
self.log.warning("Skipping ill-formed XML content: %s", raw_content)
return None
return self._translate_dict(d)
class SafeLoader(yaml.SafeLoader):
yaml_implicit_resolvers = {
k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
}
class YamlMapping(DictMapping, SingleFileIntrinsicMapping):
"""Base class for all mappings that use Yaml data as input."""
def translate(self, raw_content: bytes) -> Optional[Dict[str, str]]:
raw_content_string: str = raw_content.decode()
try:
content_dict = yaml.load(raw_content_string, Loader=SafeLoader)
except yaml.scanner.ScannerError:
return None
if isinstance(content_dict, dict):
return self._translate_dict(content_dict)
return None
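
The URL workaround added to `_translate_dict` relies on `urllib.parse.urlparse` returning an empty `netloc` for strings that are not absolute URLs, so such values are dropped before they can reach PyLD (see the linked pyld issue). A quick standalone check of that assumption:

    import urllib.parse

    # urlparse only fills netloc for absolute URLs of the form scheme://host/...,
    # which is what the workaround above relies on to filter bad values.
    for candidate in ["https://example.org/foo", "not a url", "www.example.org"]:
        parsed = urllib.parse.urlparse(candidate)
        print(repr(candidate), "->", bool(parsed.netloc))
    # 'https://example.org/foo' -> True
    # 'not a url' -> False
    # 'www.example.org' -> False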
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
index fe3b87e..d8d8702 100644
--- a/swh/indexer/metadata_dictionary/github.py
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -1,113 +1,117 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Any, Tuple
from rdflib import RDF, BNode, Graph, Literal, URIRef
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA
from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
from .utils import prettyprint_graph # noqa
SPDX = URIRef("https://spdx.org/licenses/")
class GitHubMapping(BaseExtrinsicMapping, JsonMapping):
name = "github"
- mapping = CROSSWALK_TABLE["GitHub"]
+ mapping = {
+ **CROSSWALK_TABLE["GitHub"],
+ "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk
+ }
string_fields = [
"archive_url",
"created_at",
"updated_at",
"description",
"full_name",
"html_url",
"issues_url",
+ "topics",
]
@classmethod
def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
return ("application/vnd.github.v3+json",)
def extra_translation(self, graph, root, content_dict):
graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode))
graph.add((root, RDF.type, FORGEFED.Repository))
@produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems)
def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None:
"""
>>> graph = Graph()
>>> root = URIRef("http://example.org/test-software")
>>> GitHubMapping().translate_forks_count(graph, root, 42)
>>> prettyprint_graph(graph, root)
{
"@id": ...,
"https://forgefed.org/ns#forks": {
"@type": "https://www.w3.org/ns/activitystreams#OrderedCollection",
"https://www.w3.org/ns/activitystreams#totalItems": 42
}
}
"""
if isinstance(v, int):
collection = BNode()
graph.add((root, FORGEFED.forks, collection))
graph.add((collection, RDF.type, ACTIVITYSTREAMS.OrderedCollection))
graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
@produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems)
def translate_stargazers_count(self, graph: Graph, root: BNode, v: Any) -> None:
"""
>>> graph = Graph()
>>> root = URIRef("http://example.org/test-software")
>>> GitHubMapping().translate_stargazers_count(graph, root, 42)
>>> prettyprint_graph(graph, root)
{
"@id": ...,
"https://www.w3.org/ns/activitystreams#likes": {
"@type": "https://www.w3.org/ns/activitystreams#Collection",
"https://www.w3.org/ns/activitystreams#totalItems": 42
}
}
"""
if isinstance(v, int):
collection = BNode()
graph.add((root, ACTIVITYSTREAMS.likes, collection))
graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
@produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems)
def translate_watchers_count(self, graph: Graph, root: BNode, v: Any) -> None:
"""
>>> graph = Graph()
>>> root = URIRef("http://example.org/test-software")
>>> GitHubMapping().translate_watchers_count(graph, root, 42)
>>> prettyprint_graph(graph, root)
{
"@id": ...,
"https://www.w3.org/ns/activitystreams#followers": {
"@type": "https://www.w3.org/ns/activitystreams#Collection",
"https://www.w3.org/ns/activitystreams#totalItems": 42
}
}
"""
if isinstance(v, int):
collection = BNode()
graph.add((root, ACTIVITYSTREAMS.followers, collection))
graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
def normalize_license(self, d):
"""
>>> GitHubMapping().normalize_license({'spdx_id': 'MIT'})
rdflib.term.URIRef('https://spdx.org/licenses/MIT')
"""
if isinstance(d, dict) and isinstance(d.get("spdx_id"), str):
return SPDX + d["spdx_id"]
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
index a374a5e..8b3e48d 100644
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -1,159 +1,162 @@
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from typing import Any, Dict
from rdflib import Graph, Literal, URIRef
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import SingleFileIntrinsicMapping, XmlMapping
from .utils import prettyprint_graph # noqa
class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
"""
dedicated class for Maven (pom.xml) mapping and translation
"""
name = "maven"
filename = b"pom.xml"
mapping = CROSSWALK_TABLE["Java (Maven)"]
string_fields = ["name", "version", "description", "email"]
_default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
return super()._translate_dict(d.get("project") or {})
def extra_translation(self, graph: Graph, root, d):
self.parse_repositories(graph, root, d)
def parse_repositories(self, graph: Graph, root, d):
"""https://maven.apache.org/pom.html#Repositories
>>> import rdflib
>>> import xmltodict
>>> from pprint import pprint
>>> d = xmltodict.parse('''
... <repositories>
...   <repository>
...     <id>codehausSnapshots</id>
...     <name>Codehaus Snapshots</name>
...     <url>http://snapshots.maven.codehaus.org/maven2</url>
...     <layout>default</layout>
...   </repository>
... </repositories>
... ''')
>>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d)
"""
repositories = d.get("repositories")
if not repositories:
self.parse_repository(graph, root, d, self._default_repository)
elif isinstance(repositories, dict):
repositories = repositories.get("repository") or []
if not isinstance(repositories, list):
repositories = [repositories]
for repo in repositories:
self.parse_repository(graph, root, d, repo)
def parse_repository(self, graph: Graph, root, d, repo):
if not isinstance(repo, dict):
return
if repo.get("layout", "default") != "default":
return # TODO ?
url = repo.get("url")
group_id = d.get("groupId")
artifact_id = d.get("artifactId")
if (
isinstance(url, str)
and isinstance(group_id, str)
and isinstance(artifact_id, str)
):
repo = os.path.join(url, *group_id.split("."), artifact_id)
+ if "${" in repo:
+ # Often used for templating in pom.xml files collected from VCSs
+ return
graph.add((root, SCHEMA.codeRepository, URIRef(repo)))
def normalize_groupId(self, id_):
"""https://maven.apache.org/pom.html#Maven_Coordinates
>>> MavenMapping().normalize_groupId('org.example')
rdflib.term.Literal('org.example')
"""
if isinstance(id_, str):
return Literal(id_)
def translate_licenses(self, graph, root, licenses):
"""https://maven.apache.org/pom.html#Licenses
>>> import xmltodict
>>> import json
>>> d = xmltodict.parse('''
... <licenses>
...   <license>
...     <name>Apache License, Version 2.0</name>
...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
...   </license>
... </licenses>
... ''')
>>> print(json.dumps(d, indent=4))
{
"licenses": {
"license": {
"name": "Apache License, Version 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
}
}
}
>>> graph = Graph()
>>> root = URIRef("http://example.org/test-software")
>>> MavenMapping().translate_licenses(graph, root, d["licenses"])
>>> prettyprint_graph(graph, root)
{
"@id": ...,
"http://schema.org/license": {
"@id": "https://www.apache.org/licenses/LICENSE-2.0.txt"
}
}
or, if there are more than one license:
>>> import xmltodict
>>> from pprint import pprint
>>> d = xmltodict.parse('''
... <licenses>
...   <license>
...     <name>Apache License, Version 2.0</name>
...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
...   </license>
...   <license>
...     <name>MIT License</name>
...     <url>https://opensource.org/licenses/MIT</url>
...   </license>
... </licenses>
... ''')
>>> graph = Graph()
>>> root = URIRef("http://example.org/test-software")
>>> MavenMapping().translate_licenses(graph, root, d["licenses"])
>>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None))))
{(rdflib.term.URIRef('http://example.org/test-software'),
rdflib.term.URIRef('http://schema.org/license'),
rdflib.term.URIRef('https://opensource.org/licenses/MIT')),
(rdflib.term.URIRef('http://example.org/test-software'),
rdflib.term.URIRef('http://schema.org/license'),
rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
"""
if not isinstance(licenses, dict):
return
licenses = licenses.get("license")
if isinstance(licenses, dict):
licenses = [licenses]
elif not isinstance(licenses, list):
return
for license in licenses:
if isinstance(license, dict) and isinstance(license.get("url"), str):
graph.add((root, SCHEMA.license, URIRef(license["url"])))
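
The new `"${" in repo` check above targets pom.xml files collected from version control, where Maven property placeholders such as `${project.artifactId}` are often left unexpanded; without the check they would end up as bogus codeRepository URLs. A small illustration with made-up coordinates:

    import os

    url = "https://repo.maven.apache.org/maven2/"
    group_id = "org.example"
    for artifact_id in ("my-lib", "${project.artifactId}"):
        repo = os.path.join(url, *group_id.split("."), artifact_id)
        if "${" in repo:
            # unexpanded Maven property: skipped, as parse_repository now does
            print("skipped:", repo)
        else:
            print("kept:", repo)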
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
index 1540ef6..f2eaa64 100644
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -1,282 +1,292 @@
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import urllib.parse
from rdflib import RDF, BNode, Graph, Literal, URIRef
from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
from .utils import add_list, prettyprint_graph # noqa
SPDX = URIRef("https://spdx.org/licenses/")
class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
"""
dedicated class for NPM (package.json) mapping and translation
"""
name = "npm"
mapping = CROSSWALK_TABLE["NodeJS"]
filename = b"package.json"
string_fields = ["name", "version", "description", "email"]
uri_fields = ["homepage"]
_schema_shortcuts = {
"github": "git+https://github.com/%s.git",
"gist": "git+https://gist.github.com/%s.git",
"gitlab": "git+https://gitlab.com/%s.git",
# Bitbucket supports both hg and git, and the shortcut does not
# tell which one to use.
# 'bitbucket': 'https://bitbucket.org/',
}
def normalize_repository(self, d):
"""https://docs.npmjs.com/files/package.json#repository
>>> NpmMapping().normalize_repository({
... 'type': 'git',
... 'url': 'https://example.org/foo.git'
... })
rdflib.term.URIRef('git+https://example.org/foo.git')
>>> NpmMapping().normalize_repository(
... 'gitlab:foo/bar')
rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git')
>>> NpmMapping().normalize_repository(
... 'foo/bar')
rdflib.term.URIRef('git+https://github.com/foo/bar.git')
"""
if (
isinstance(d, dict)
and isinstance(d.get("type"), str)
and isinstance(d.get("url"), str)
):
url = "{type}+{url}".format(**d)
elif isinstance(d, str):
if "://" in d:
url = d
elif ":" in d:
(schema, rest) = d.split(":", 1)
if schema in self._schema_shortcuts:
url = self._schema_shortcuts[schema] % rest
else:
return None
else:
url = self._schema_shortcuts["github"] % d
else:
return None
return URIRef(url)
def normalize_bugs(self, d):
"""https://docs.npmjs.com/files/package.json#bugs
>>> NpmMapping().normalize_bugs({
... 'url': 'https://example.org/bugs/',
... 'email': 'bugs@example.org'
... })
rdflib.term.URIRef('https://example.org/bugs/')
>>> NpmMapping().normalize_bugs(
... 'https://example.org/bugs/')
rdflib.term.URIRef('https://example.org/bugs/')
"""
if isinstance(d, dict) and isinstance(d.get("url"), str):
return URIRef(d["url"])
elif isinstance(d, str):
return URIRef(d)
else:
return None
_parse_author = re.compile(
r"^ *" r"(?P.*?)" r"( +<(?P.*)>)?" r"( +\((?P.*)\))?" r" *$"
)
def translate_author(self, graph: Graph, root, d):
r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors'
>>> from pprint import pprint
>>> root = URIRef("http://example.org/test-software")
>>> graph = Graph()
>>> NpmMapping().translate_author(graph, root, {
... 'name': 'John Doe',
... 'email': 'john.doe@example.org',
... 'url': 'https://example.org/~john.doe',
... })
>>> prettyprint_graph(graph, root)
{
"@id": ...,
"http://schema.org/author": {
"@list": [
{
"@type": "http://schema.org/Person",
"http://schema.org/email": "john.doe@example.org",
"http://schema.org/name": "John Doe",
"http://schema.org/url": {
"@id": "https://example.org/~john.doe"
}
}
]
}
}
>>> graph = Graph()
>>> NpmMapping().translate_author(graph, root,
... 'John Doe (https://example.org/~john.doe)'
... )
>>> prettyprint_graph(graph, root)
{
"@id": ...,
"http://schema.org/author": {
"@list": [
{
"@type": "http://schema.org/Person",
"http://schema.org/email": "john.doe@example.org",
"http://schema.org/name": "John Doe",
"http://schema.org/url": {
"@id": "https://example.org/~john.doe"
}
}
]
}
}
>>> graph = Graph()
>>> NpmMapping().translate_author(graph, root, {
... 'name': 'John Doe',
... 'email': 'john.doe@example.org',
... 'url': 'https:\\\\example.invalid/~john.doe',
... })
>>> prettyprint_graph(graph, root)
{
"@id": ...,
"http://schema.org/author": {
"@list": [
{
"@type": "http://schema.org/Person",
"http://schema.org/email": "john.doe@example.org",
"http://schema.org/name": "John Doe"
}
]
}
}
""" # noqa
author = BNode()
graph.add((author, RDF.type, SCHEMA.Person))
if isinstance(d, dict):
name = d.get("name", None)
email = d.get("email", None)
url = d.get("url", None)
elif isinstance(d, str):
match = self._parse_author.match(d)
if not match:
return None
name = match.group("name")
email = match.group("email")
url = match.group("url")
else:
return None
if name and isinstance(name, str):
graph.add((author, SCHEMA.name, Literal(name)))
if email and isinstance(email, str):
graph.add((author, SCHEMA.email, Literal(email)))
if url and isinstance(url, str):
# Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
# URLs that are blatantly invalid early, so PyLD does not crash.
parsed_url = urllib.parse.urlparse(url)
if parsed_url.netloc:
graph.add((author, SCHEMA.url, URIRef(url)))
add_list(graph, root, SCHEMA.author, [author])
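# A standalone sketch of the "Name <email> (url)" author-string format parsed
# above; it re-compiles the same pattern for illustration instead of importing
# NpmMapping, so it can be run on its own.
import re

_author_re = re.compile(
    r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
)
_m = _author_re.match("John Doe <john.doe@example.org> (https://example.org/~john.doe)")
assert _m is not None
assert _m.group("name") == "John Doe"
assert _m.group("email") == "john.doe@example.org"
assert _m.group("url") == "https://example.org/~john.doe"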
def normalize_description(self, description):
r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
mistake that causes issues in the database because of null bytes in JSON.
>>> NpmMapping().normalize_description("foo bar")
rdflib.term.Literal('foo bar')
>>> NpmMapping().normalize_description(
... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
... )
rdflib.term.Literal('foo bar')
>>> NpmMapping().normalize_description(
... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
... )
rdflib.term.Literal('foo bar')
>>> NpmMapping().normalize_description(
... # invalid UTF-16 and meaningless UTF-8:
... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
... ) is None
True
>>> NpmMapping().normalize_description(
... # ditto (but looks like little-endian at first)
... "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
... ) is None
True
>>> NpmMapping().normalize_description(None) is None
True
"""
if not isinstance(description, str):
return None
# XXX: if this function ever needs to support more cases, consider
# switching to https://pypi.org/project/ftfy/ instead of adding more hacks
if description.startswith("\ufffd\ufffd") and "\x00" in description:
# 2 unicode replacement characters followed by '# ' encoded as UTF-16
# is a common mistake, which indicates a README.md was saved as UTF-16,
# and some NPM tool opened it as UTF-8 and used the first line as
# description.
description_bytes = description.encode()
# Strip the two unicode replacement characters
assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd")
description_bytes = description_bytes[6:]
# If the following attempts fail to recover the description, discard it
# entirely because the current indexer storage backend (postgresql) cannot
# store zero bytes in JSON columns.
description = None
if not description_bytes.startswith(b"\x00"):
# try UTF-16 little-endian (the most common) first
try:
description = description_bytes.decode("utf-16le")
except UnicodeDecodeError:
pass
if description is None:
# if it fails, try UTF-16 big-endian
try:
description = description_bytes.decode("utf-16be")
except UnicodeDecodeError:
pass
if description:
if description.startswith("# "):
description = description[2:]
return Literal(description.rstrip())
else:
return None
return Literal(description)
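# A sketch of how the mangled doctest inputs above typically arise (assumed
# failure mode): a "# ..." first line saved as UTF-16 with a BOM, then re-read
# as UTF-8 with undecodable bytes replaced.
_first_line = "# foo bar"
_utf16_bytes = _first_line.encode("utf-16")  # BOM + NUL-interleaved code units
_mangled = _utf16_bytes.decode("utf-8", errors="replace")
assert _mangled.startswith("\ufffd\ufffd")  # the two replacement characters
assert "\x00" in _mangled  # the null bytes that break JSON storage in postgresql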
def normalize_license(self, s):
"""https://docs.npmjs.com/files/package.json#license
>>> NpmMapping().normalize_license('MIT')
rdflib.term.URIRef('https://spdx.org/licenses/MIT')
"""
if isinstance(s, str):
+ if s.startswith("SEE LICENSE IN "):
+ # Very common pattern, because it is an example in the specification.
+ # It is followed by the filename; and the indexer architecture currently
+ # does not allow accessing that from metadata mappings.
+ # (Plus, a hypothetical license mapping would eventually pick it up)
+ return
+ if " " in s:
+ # Either an SPDX expression, or unusable data
+ # TODO: handle it
+ return
return SPDX + s
def normalize_keywords(self, lst):
"""https://docs.npmjs.com/files/package.json#homepage
>>> NpmMapping().normalize_keywords(['foo', 'bar'])
[rdflib.term.Literal('foo'), rdflib.term.Literal('bar')]
"""
if isinstance(lst, list):
return [Literal(x) for x in lst if isinstance(x, str)]
diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py
index c0592dc..3085bcc 100644
--- a/swh/indexer/tests/metadata_dictionary/test_github.py
+++ b/swh/indexer/tests/metadata_dictionary/test_github.py
@@ -1,142 +1,156 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.indexer.metadata_dictionary import MAPPINGS
CONTEXT = [
"https://doi.org/10.5063/schema/codemeta-2.0",
{
"as": "https://www.w3.org/ns/activitystreams#",
"forge": "https://forgefed.org/ns#",
},
]
def test_compute_metadata_none():
"""
translating empty content should return None
"""
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
result = MAPPINGS["GitHubMapping"]().translate(content)
assert declared_metadata == result
def test_supported_terms():
terms = MAPPINGS["GitHubMapping"].supported_terms()
assert {
"http://schema.org/name",
"http://schema.org/license",
"https://forgefed.org/ns#forks",
"https://www.w3.org/ns/activitystreams#totalItems",
} <= terms
def test_compute_metadata_github():
- """
- testing only computation of metadata with hard_mapping_npm
- """
content = b"""
{
"id": 80521091,
"node_id": "MDEwOlJlcG9zaXRvcnk4MDUyMTA5MQ==",
"name": "swh-indexer",
"full_name": "SoftwareHeritage/swh-indexer",
"private": false,
"owner": {
"login": "SoftwareHeritage",
"id": 18555939,
"node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5",
"avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4",
"gravatar_id": "",
"url": "https://api.github.com/users/SoftwareHeritage",
"type": "Organization",
"site_admin": false
},
"html_url": "https://github.com/SoftwareHeritage/swh-indexer",
"description": "GitHub mirror of Metadata indexer",
"fork": false,
"url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer",
"created_at": "2017-01-31T13:05:39Z",
"updated_at": "2022-06-22T08:02:20Z",
"pushed_at": "2022-06-29T09:01:08Z",
"git_url": "git://github.com/SoftwareHeritage/swh-indexer.git",
"ssh_url": "git@github.com:SoftwareHeritage/swh-indexer.git",
"clone_url": "https://github.com/SoftwareHeritage/swh-indexer.git",
"svn_url": "https://github.com/SoftwareHeritage/swh-indexer",
"homepage": "https://forge.softwareheritage.org/source/swh-indexer/",
"size": 2713,
"stargazers_count": 13,
"watchers_count": 12,
"language": "Python",
"has_issues": false,
"has_projects": false,
"has_downloads": true,
"has_wiki": false,
"has_pages": false,
"forks_count": 1,
"mirror_url": null,
"archived": false,
"disabled": false,
"open_issues_count": 0,
"license": {
"key": "gpl-3.0",
"name": "GNU General Public License v3.0",
"spdx_id": "GPL-3.0",
"url": "https://api.github.com/licenses/gpl-3.0",
"node_id": "MDc6TGljZW5zZTk="
},
"allow_forking": true,
"is_template": false,
"web_commit_signoff_required": false,
"topics": [
],
"visibility": "public",
"forks": 1,
"open_issues": 0,
"watchers": 13,
"default_branch": "master",
"temp_clone_token": null,
"organization": {
"login": "SoftwareHeritage",
"id": 18555939,
"node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5",
"avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4",
"gravatar_id": "",
"type": "Organization",
"site_admin": false
},
"network_count": 1,
"subscribers_count": 6
}
"""
result = MAPPINGS["GitHubMapping"]().translate(content)
assert result == {
"@context": CONTEXT,
"type": "forge:Repository",
"forge:forks": {
"as:totalItems": 1,
"type": "as:OrderedCollection",
},
"as:likes": {
"as:totalItems": 13,
"type": "as:Collection",
},
"as:followers": {
"as:totalItems": 12,
"type": "as:Collection",
},
"license": "https://spdx.org/licenses/GPL-3.0",
"name": "SoftwareHeritage/swh-indexer",
"description": "GitHub mirror of Metadata indexer",
"schema:codeRepository": "https://github.com/SoftwareHeritage/swh-indexer",
"schema:dateCreated": "2017-01-31T13:05:39Z",
"schema:dateModified": "2022-06-22T08:02:20Z",
}
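# The "license" value in the expected result above comes from the GitHub
# payload's license.spdx_id; a minimal sketch of the assumed derivation,
# reusing the SPDX URI prefix seen in the other mappings:
SPDX_PREFIX = "https://spdx.org/licenses/"

def spdx_uri(license_obj):
    spdx_id = (license_obj or {}).get("spdx_id")
    return SPDX_PREFIX + spdx_id if spdx_id else None

assert spdx_uri({"spdx_id": "GPL-3.0"}) == "https://spdx.org/licenses/GPL-3.0"
assert spdx_uri(None) is None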
+
+
+def test_github_topics():
+ content = b"""
+{
+ "topics": [
+ "foo",
+ "bar"
+ ]
+}
+ """
+ result = MAPPINGS["GitHubMapping"]().translate(content)
+ assert set(result.pop("keywords", [])) == {"foo", "bar"}, result
+ assert result == {
+ "@context": CONTEXT,
+ "type": "forge:Repository",
+ }
diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py
index 0267e95..afde286 100644
--- a/swh/indexer/tests/metadata_dictionary/test_maven.py
+++ b/swh/indexer/tests/metadata_dictionary/test_maven.py
@@ -1,365 +1,406 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from hypothesis import HealthCheck, given, settings
from swh.indexer.metadata_dictionary import MAPPINGS
from ..utils import xml_document_strategy
def test_compute_metadata_maven():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"),
}
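# A sketch of how the expected codeRepository above is derived from the POM
# values (an assumption for illustration: groupId dots become path segments
# and the artifactId is appended to the repository URL):
def maven_repo_url(base, group_id, artifact_id):
    return "/".join([base.rstrip("/"), group_id.replace(".", "/"), artifact_id])

assert (
    maven_repo_url("http://repo1.maven.org/maven2", "com.mycompany.app", "my-app")
    == "http://repo1.maven.org/maven2/com/mycompany/app/my-app"
)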
def test_compute_metadata_maven_empty():
raw_content = b"""
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
def test_compute_metadata_maven_almost_empty():
raw_content = b"""
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
def test_compute_metadata_maven_invalid_xml(caplog):
expected_warning = (
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error parsing XML from foo",
)
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
def test_compute_metadata_maven_unknown_encoding(caplog):
expected_warning = (
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error detecting XML encoding from foo",
)
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning], result
assert result is None
def test_compute_metadata_maven_invalid_encoding(caplog):
expected_warning = [
# libexpat1 <= 2.2.10-2+deb11u1
[
(
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error unidecoding XML from foo",
)
],
# libexpat1 >= 2.2.10-2+deb11u2
[
(
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error parsing XML from foo",
)
],
]
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples in expected_warning, result
assert result is None
def test_compute_metadata_maven_minimal():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
def test_compute_metadata_maven_empty_nodes():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
1.2.3
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"version": "1.2.3",
}
def test_compute_metadata_maven_invalid_licenses():
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
foo
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
def test_compute_metadata_maven_multiple():
"""Tests when there are multiple code repos and licenses."""
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
result = MAPPINGS["MavenMapping"]().translate(raw_content)
assert set(result.pop("license")) == {
"https://www.apache.org/licenses/LICENSE-2.0.txt",
"https://opensource.org/licenses/MIT",
}, result
assert set(result.pop("codeRepository")) == {
"http://repo1.maven.org/maven2/com/mycompany/app/my-app",
"http://example.org/maven2/com/mycompany/app/my-app",
}, result
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"schema:identifier": "com.mycompany.app",
"version": "1.2.3",
}
+def test_compute_metadata_maven_invalid_repository():
+ raw_content = b"""
+
+ Maven Default Project
+ 4.0.0
+ com.mycompany.app
+ my-app
+ 1.2.3
+
+
+ tcc-transaction-internal-releases
+ internal repository for released artifacts
+ ${repo.internal.releases.url}
+
+ false
+
+
+ true
+
+
+
+
+
+ Apache License, Version 2.0
+ https://www.apache.org/licenses/LICENSE-2.0.txt
+ repo
+ A business-friendly OSS license
+
+
+ """
+ result = MAPPINGS["MavenMapping"]().translate(raw_content)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "name": "Maven Default Project",
+ "schema:identifier": "com.mycompany.app",
+ "version": "1.2.3",
+ "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
+ }
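# The expected result above has no codeRepository because the repository URL is
# an unexpanded Maven property ("${repo.internal.releases.url}"). A sketch of a
# guard producing that behaviour (an assumption, not the mapping's actual code):
def _is_usable_repo_url(url):
    return bool(url) and "${" not in url

assert not _is_usable_repo_url("${repo.internal.releases.url}")
assert _is_usable_repo_url("http://repo1.maven.org/maven2")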
+
+
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
xml_document_strategy(
keys=list(MAPPINGS["MavenMapping"].mapping), # type: ignore
root="project",
xmlns="http://maven.apache.org/POM/4.0.0",
)
)
def test_maven_adversarial(doc):
MAPPINGS["MavenMapping"]().translate(doc)
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
index b0ead25..cdaf6b7 100644
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -1,335 +1,420 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
from hypothesis import HealthCheck, given, settings
import pytest
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.storage.model import ContentMetadataRow
from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
from ..utils import (
BASE_TEST_CONFIG,
MAPPING_DESCRIPTION_CONTENT_SHA1,
json_document_strategy,
)
def test_compute_metadata_none():
"""
translating empty content should return None
"""
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
result = MAPPINGS["NpmMapping"]().translate(content)
assert declared_metadata == result
def test_compute_metadata_npm():
"""
test metadata computation with the hard-coded NpmMapping
"""
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"author": [
{
"type": "Person",
"name": "Morane G",
"email": "moranegg@example.com",
}
],
}
result = MAPPINGS["NpmMapping"]().translate(content)
assert declared_metadata == result
def test_compute_metadata_invalid_description_npm():
"""
test that a non-string description is ignored when computing metadata with the NpmMapping
"""
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": 1234
}
"""
declared_metadata = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "test_metadata",
"version": "0.0.2",
}
result = MAPPINGS["NpmMapping"]().translate(content)
assert declared_metadata == result
def test_index_content_metadata_npm(storage, obj_storage):
"""
testing the NPM mapping with package.json files;
one sha1 points to a file that cannot be translated to metadata
and should therefore yield no metadata row
"""
sha1s = [
MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"],
MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"],
MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"],
]
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
config = BASE_TEST_CONFIG.copy()
config["tools"] = [TRANSLATOR_TOOL]
metadata_indexer = ContentMetadataTestIndexer(config=config)
metadata_indexer.run(sha1s, log_suffix="unknown content")
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
ContentMetadataRow(
id=sha1s[0],
tool=TRANSLATOR_TOOL,
metadata={
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"description": "Simple package.json test for indexer",
"name": "test_metadata",
"version": "0.0.1",
},
),
ContentMetadataRow(
id=sha1s[1],
tool=TRANSLATOR_TOOL,
metadata={
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"issueTracker": "https://github.com/npm/npm/issues",
"author": [
{
"type": "Person",
"name": "Isaac Z. Schlueter",
"email": "i@izs.me",
"url": "http://blog.izs.me",
}
],
"codeRepository": "git+https://github.com/npm/npm",
"description": "a package manager for JavaScript",
"license": "https://spdx.org/licenses/Artistic-2.0",
"version": "5.0.3",
"name": "npm",
"url": "https://docs.npmjs.com/",
},
),
]
for result in results:
del result.tool["id"]
result.metadata.pop("keywords", None)
# The assertion below sometimes fails because of ordering in nested lists
assert expected_results == results
def test_npm_null_list_item_normalization():
package_json = b"""{
"name": "foo",
"keywords": [
"foo",
null
],
"homepage": [
"http://example.org/",
null
]
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
"url": "http://example.org/",
"keywords": "foo",
}
def test_npm_bugs_normalization():
# valid dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
}
# "invalid" dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
}
# string
package_json = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
}
def test_npm_repository_normalization():
# normal
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
# missing url
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
}
# github shortcut
package_json = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
assert result == expected_result
# github shortshortcut
package_json = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == expected_result
# gitlab shortcut
package_json = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
result = MAPPINGS["NpmMapping"]().translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://gitlab.com/user/repo.git",
"type": "SoftwareSourceCode",
}
+def test_npm_invalid_uris():
+ package_json = rb"""{
+ "version": "1.0.0",
+ "homepage": "",
+ "author": {
+ "name": "foo",
+ "url": "http://example.org"
+ }
+}"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}],
+ "version": "1.0.0",
+ }
+
+ package_json = rb"""{
+ "version": "1.0.0",
+ "homepage": "http://example.org",
+ "author": {
+ "name": "foo",
+ "url": ""
+ }
+}"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"name": "foo", "type": "Person"}],
+ "url": "http://example.org",
+ "version": "1.0.0",
+ }
+
+ package_json = rb"""{
+ "version": "1.0.0",
+ "homepage": "",
+ "author": {
+ "name": "foo",
+ "url": ""
+ }
+}"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"name": "foo", "type": "Person"}],
+ "version": "1.0.0",
+ }
+
+ package_json = rb"""{
+ "version": "1.0.0",
+ "homepage": "http:example.org",
+ "author": {
+ "name": "foo",
+ "url": "http:example.com"
+ }
+}"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"name": "foo", "type": "Person"}],
+ "version": "1.0.0",
+ }
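# Why "" and "http:example.org" are dropped above: the URL validity check in
# translate_author requires urlparse() to yield a non-empty netloc (assuming
# homepage goes through a similar check).
from urllib.parse import urlparse

assert urlparse("http://example.org").netloc == "example.org"
assert urlparse("http:example.org").netloc == ""  # no authority component
assert urlparse("").netloc == ""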
+
+
+def test_npm_invalid_licenses():
+ package_json = rb"""{
+ "version": "1.0.0",
+ "license": "SEE LICENSE IN LICENSE.md",
+ "author": {
+ "name": "foo",
+ "url": "http://example.org"
+ }
+}"""
+ result = MAPPINGS["NpmMapping"]().translate(package_json)
+ assert result == {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}],
+ "version": "1.0.0",
+ }
+
+
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping))) # type: ignore
def test_npm_adversarial(doc):
raw = json.dumps(doc).encode()
MAPPINGS["NpmMapping"]().translate(raw)
@pytest.mark.parametrize(
"filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
)
def test_detect_metadata_package_json(filename):
df = [
{
"sha1_git": b"abc",
"name": b"index.js",
"target": b"abc",
"length": 897,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"bcd",
},
{
"sha1_git": b"aab",
"name": filename,
"target": b"aab",
"length": 712,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"cde",
},
]
results = detect_metadata(df)
expected_results = {"NpmMapping": [b"cde"]}
assert expected_results == results
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 20c49c0..3ba7ad8 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,283 +1,312 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from unittest.mock import call
import attr
from swh.indexer.metadata import (
ContentMetadataIndexer,
DirectoryMetadataIndexer,
ExtrinsicMetadataIndexer,
)
from swh.indexer.storage.model import (
ContentMetadataRow,
DirectoryIntrinsicMetadataRow,
OriginExtrinsicMetadataRow,
)
from swh.indexer.tests.utils import DIRECTORY2
from swh.model.model import (
Directory,
DirectoryEntry,
MetadataAuthority,
MetadataAuthorityType,
MetadataFetcher,
RawExtrinsicMetadata,
)
from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
from .utils import (
BASE_TEST_CONFIG,
MAPPING_DESCRIPTION_CONTENT_SHA1,
MAPPING_DESCRIPTION_CONTENT_SHA1GIT,
YARN_PARSER_METADATA,
fill_obj_storage,
fill_storage,
)
TRANSLATOR_TOOL = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {"type": "local", "context": "NpmMapping"},
}
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
assert False, "should not be called; the dir indexer configures it."
DIRECTORY_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
REMD = RawExtrinsicMetadata(
target=ExtendedSWHID(
object_type=ExtendedObjectType.ORIGIN,
object_id=b"\x01" * 20,
),
discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
authority=MetadataAuthority(
type=MetadataAuthorityType.FORGE,
url="https://example.org/",
),
fetcher=MetadataFetcher(
name="example-fetcher",
version="1.0.0",
),
format="application/vnd.github.v3+json",
metadata=b'{"full_name": "test software"}',
)
class TestMetadata:
"""
Tests the metadata_mock_tool for metadata detection
"""
def test_directory_metadata_indexer(self):
metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
dir_ = DIRECTORY2
assert (
dir_.entries[0].target
== MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"]
)
metadata_indexer.idx_storage.content_metadata_add(
[
ContentMetadataRow(
id=MAPPING_DESCRIPTION_CONTENT_SHA1[
"json:yarn-parser-package.json"
],
indexer_configuration_id=tool["id"],
metadata=YARN_PARSER_METADATA,
)
]
)
metadata_indexer.run([dir_.id])
results = list(
metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id])
)
expected_results = [
DirectoryIntrinsicMetadataRow(
id=dir_.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
]
for result in results:
del result.tool["id"]
assert results == expected_results
def test_directory_metadata_indexer_single_root_dir(self):
metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory whose only entry at the root is a single
# subdirectory (the original directory)
dir_ = DIRECTORY2
assert (
dir_.entries[0].target
== MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"]
)
new_dir = Directory(
entries=(
DirectoryEntry(
name=b"foobar-1.0.0",
type="dir",
target=dir_.id,
perms=16384,
),
),
)
assert new_dir.id is not None
metadata_indexer.storage.directory_add([new_dir])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
metadata_indexer.idx_storage.content_metadata_add(
[
ContentMetadataRow(
id=MAPPING_DESCRIPTION_CONTENT_SHA1[
"json:yarn-parser-package.json"
],
indexer_configuration_id=tool["id"],
metadata=YARN_PARSER_METADATA,
)
]
)
metadata_indexer.run([new_dir.id])
results = list(
metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
)
expected_results = [
DirectoryIntrinsicMetadataRow(
id=new_dir.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
]
for result in results:
del result.tool["id"]
assert results == expected_results
def test_extrinsic_metadata_indexer_unknown_format(self, mocker):
"""Should be ignored when unknown format"""
metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
remd = attr.evolve(REMD, format="unknown format")
results = metadata_indexer.index(remd.id, data=remd)
assert metadata_indexer.storage.method_calls == []
assert results == []
def test_extrinsic_metadata_indexer_github(self, mocker):
"""Nominal case, calling the mapping and storing the result"""
origin = "https://example.org/jdoe/myrepo"
metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
metadata_indexer.catch_exceptions = False
metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
assert metadata_indexer.process_journal_objects(
{"raw_extrinsic_metadata": [REMD.to_dict()]}
) == {"status": "eventful", "origin_extrinsic_metadata:add": 1}
assert metadata_indexer.storage.method_calls == [
call.origin_get_by_sha1([b"\x01" * 20])
]
results = list(
metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin])
)
assert results == [
OriginExtrinsicMetadataRow(
id="https://example.org/jdoe/myrepo",
tool={"id": tool["id"], **TRANSLATOR_TOOL},
metadata={
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "https://forgefed.org/ns#Repository",
"name": "test software",
},
from_remd_id=REMD.id,
mappings=["github"],
)
]
def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker):
"""Early abort on non-forge authorities"""
metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
remd = attr.evolve(
REMD,
authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY),
)
results = metadata_indexer.index(remd.id, data=remd)
assert metadata_indexer.storage.method_calls == []
assert results == []
def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker):
"""Should be ignored when authority URL does not match the origin"""
origin = "https://different-domain.example.org/jdoe/myrepo"
metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
metadata_indexer.catch_exceptions = False
metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
results = metadata_indexer.index(REMD.id, data=REMD)
assert metadata_indexer.storage.method_calls == [
call.origin_get_by_sha1([b"\x01" * 20])
]
assert results == []
+
+ def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker):
+ """Duplicate origins in the same journal batch are deduplicated, keeping the latest metadata object"""
+ origin = "https://example.org/jdoe/myrepo"
+
+ metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+ metadata_indexer.catch_exceptions = False
+ metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+ metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+ tool = metadata_indexer.idx_storage.indexer_configuration_get(
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+ )
+ assert tool is not None
+
+ assert metadata_indexer.process_journal_objects(
+ {
+ "raw_extrinsic_metadata": [
+ REMD.to_dict(),
+ {**REMD.to_dict(), "id": b"\x00" * 20},
+ ]
+ }
+ ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1}
+
+ results = list(
+ metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin])
+ )
+ assert len(results) == 1, results
+ assert results[0].from_remd_id == b"\x00" * 20
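# A sketch of the per-origin de-duplication the assertions above imply
# (assumed behaviour: within one journal batch, the last metadata object seen
# for a given origin wins):
def _dedup_last_per_origin(rows):
    last = {}
    for row in rows:
        last[row["origin"]] = row  # later rows overwrite earlier ones
    return list(last.values())

_rows = [{"origin": "o1", "id": b"\x01" * 20}, {"origin": "o1", "id": b"\x00" * 20}]
assert _dedup_last_per_origin(_rows) == [{"origin": "o1", "id": b"\x00" * 20}]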
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 567f479..4b7057e 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,356 +1,409 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import copy
from unittest.mock import patch
+import attr
import pytest
from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
DirectoryIntrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from swh.model.model import Origin
from swh.storage.interface import StorageInterface
from .test_metadata import TRANSLATOR_TOOL
from .utils import DIRECTORY2, YARN_PARSER_METADATA
@pytest.fixture
def swh_indexer_config(swh_indexer_config):
"""Override the default configuration to override the tools entry"""
cfg = copy.deepcopy(swh_indexer_config)
cfg["tools"] = TRANSLATOR_TOOL
return cfg
def test_origin_metadata_indexer_release(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://npm.example.org/yarn-parser"
indexer.run([origin])
tool = swh_indexer_config["tools"]
dir_id = DIRECTORY2.id
dir_metadata = DirectoryIntrinsicMetadataRow(
id=dir_id,
tool=tool,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
origin_metadata = OriginIntrinsicMetadataRow(
id=origin,
tool=tool,
from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
for dir_result in dir_results:
assert dir_result.tool
del dir_result.tool["id"]
assert dir_results == [dir_metadata]
orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
for orig_result in orig_results:
assert orig_result.tool
del orig_result.tool["id"]
assert orig_results == [origin_metadata]
def test_origin_metadata_indexer_revision(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
indexer.run([origin])
tool = swh_indexer_config["tools"]
dir_id = DIRECTORY2.id
dir_metadata = DirectoryIntrinsicMetadataRow(
id=dir_id,
tool=tool,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
origin_metadata = OriginIntrinsicMetadataRow(
id=origin,
tool=tool,
from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
for dir_result in dir_results:
assert dir_result.tool
del dir_result.tool["id"]
assert dir_results == [dir_metadata]
orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
for orig_result in orig_results:
assert orig_result.tool
del orig_result.tool["id"]
assert orig_results == [origin_metadata]
def test_origin_metadata_indexer_duplicate_origin(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.storage = storage
indexer.idx_storage = idx_storage
indexer.run(["https://github.com/librariesio/yarn-parser"])
indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
origin = "https://github.com/librariesio/yarn-parser"
dir_id = DIRECTORY2.id
dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert len(dir_results) == 1
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert len(orig_results) == 1
def test_origin_metadata_indexer_missing_head(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
storage.origin_add([Origin(url="https://example.com")])
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.run(["https://example.com"])
origin = "https://example.com"
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_partial_missing_head(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
origin1 = "https://example.com"
origin2 = "https://github.com/librariesio/yarn-parser"
storage.origin_add([Origin(url=origin1)])
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.run([origin1, origin2])
dir_id = DIRECTORY2.id
dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert dir_results == [
DirectoryIntrinsicMetadataRow(
id=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=dir_results[0].tool,
)
]
orig_results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
)
for orig_result in orig_results:
assert orig_results == [
OriginIntrinsicMetadataRow(
id=origin2,
from_directory=dir_id,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
tool=orig_results[0].tool,
)
]
def test_origin_metadata_indexer_duplicate_directory(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
indexer.storage = storage
indexer.idx_storage = idx_storage
indexer.catch_exceptions = False
origin1 = "https://github.com/librariesio/yarn-parser"
origin2 = "https://github.com/librariesio/yarn-parser.git"
indexer.run([origin1, origin2])
dir_id = DIRECTORY2.id
dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert len(dir_results) == 1
orig_results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
)
assert len(orig_results) == 2
+def test_origin_metadata_indexer_duplicate_directory_different_result(
+ swh_indexer_config,
+ idx_storage: IndexerStorageInterface,
+ storage: StorageInterface,
+ obj_storage,
+ mocker,
+) -> None:
+ """Same as above, but indexing the same directory twice results in different
+ data (because list order differs).
+ """
+ indexer = OriginMetadataIndexer(config=swh_indexer_config)
+ indexer.storage = storage
+ indexer.idx_storage = idx_storage
+ indexer.catch_exceptions = False
+ origin1 = "https://github.com/librariesio/yarn-parser"
+ origin2 = "https://github.com/librariesio/yarn-parser.git"
+
+ directory_index = indexer.directory_metadata_indexer.index
+
+ nb_calls = 0
+
+ def side_effect(dir_id):
+ nonlocal nb_calls
+ if nb_calls == 0:
+ keywords = ["foo", "bar"]
+ elif nb_calls == 1:
+ keywords = ["bar", "foo"]
+ else:
+ assert False, nb_calls
+ nb_calls += 1
+ return [
+ attr.evolve(row, metadata={**row.metadata, "keywords": keywords})
+ for row in directory_index(dir_id)
+ ]
+
+ mocker.patch.object(
+ indexer.directory_metadata_indexer, "index", side_effect=side_effect
+ )
+
+ indexer.run([origin1, origin2])
+
+ dir_id = DIRECTORY2.id
+
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert len(dir_results) == 1
+
+ orig_results = list(
+ indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
+ )
+ assert len(orig_results) == 2
+
+
def test_origin_metadata_indexer_no_metadata_file(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
indexer.run([origin])
dir_id = DIRECTORY2.id
dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
def test_origin_metadata_indexer_no_metadata(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
with patch(
"swh.indexer.metadata.DirectoryMetadataIndexer"
".translate_directory_intrinsic_metadata",
return_value=(["npm"], {"@context": "foo"}),
):
indexer.run([origin])
dir_id = DIRECTORY2.id
dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@pytest.mark.parametrize("catch_exceptions", [True, False])
def test_origin_metadata_indexer_directory_error(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
sentry_events,
catch_exceptions,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
indexer.catch_exceptions = catch_exceptions
with patch(
"swh.indexer.metadata.DirectoryMetadataIndexer"
".translate_directory_intrinsic_metadata",
return_value=None,
):
indexer.run([origin])
assert len(sentry_events) == 1
sentry_event = sentry_events.pop()
assert sentry_event.get("tags") == {
"swh-indexer-origin-head-swhid": (
"swh:1:rev:a78410ce2f78f5078fd4ee7edb8c82c02a4a712c"
),
"swh-indexer-origin-url": origin,
}
assert "'TypeError'" in str(sentry_event)
dir_id = DIRECTORY2.id
dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
@pytest.mark.parametrize("catch_exceptions", [True, False])
def test_origin_metadata_indexer_content_exception(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
sentry_events,
catch_exceptions,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
origin = "https://github.com/librariesio/yarn-parser"
indexer.catch_exceptions = catch_exceptions
class TestException(Exception):
pass
with patch(
"swh.indexer.metadata.ContentMetadataRow",
side_effect=TestException(),
):
indexer.run([origin])
assert len(sentry_events) == 1
sentry_event = sentry_events.pop()
assert sentry_event.get("tags") == {
"swh-indexer-content-sha1": "df9d3bcc0158faa446bd1af225f8e2e4afa576d7",
"swh-indexer-origin-head-swhid": (
"swh:1:rev:a78410ce2f78f5078fd4ee7edb8c82c02a4a712c"
),
"swh-indexer-origin-url": origin,
}
assert ".TestException'" in str(sentry_event), sentry_event
dir_id = DIRECTORY2.id
dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
assert dir_results == []
orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert orig_results == []
def test_origin_metadata_indexer_unknown_origin(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
storage: StorageInterface,
obj_storage,
) -> None:
indexer = OriginMetadataIndexer(config=swh_indexer_config)
result = indexer.index_list([Origin("https://unknown.org/foo")])
assert not result