diff --git a/swh/indexer/data/package-json/CITATION b/swh/indexer/data/package-json/CITATION
new file mode 100644
index 0000000..52a13c0
--- /dev/null
+++ b/swh/indexer/data/package-json/CITATION
@@ -0,0 +1 @@
+swh:1:dir:49dd6f75450a37243dfcc4b418ca5bf5e0010748;origin=https://github.com/Bartvds/package.json-schema
diff --git a/swh/indexer/data/package-json/LICENSE b/swh/indexer/data/package-json/LICENSE
new file mode 100644
index 0000000..3651abe
--- /dev/null
+++ b/swh/indexer/data/package-json/LICENSE
@@ -0,0 +1,22 @@
+Copyright (c) 2014 Bart van der Schoor
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/swh/indexer/data/package-json/schema.json b/swh/indexer/data/package-json/schema.json
new file mode 100644
index 0000000..e5f799f
--- /dev/null
+++ b/swh/indexer/data/package-json/schema.json
@@ -0,0 +1,377 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema",
+ "id": "lib://package.json",
+ "title": "package.json-schema",
+ "description": "JSON Schema for node/npm package.json",
+ "$ref": "lib://package.json#/definitions/standard",
+ "definitions": {
+ "minimal": {
+ "allOf": [
+ {
+ "$ref": "lib://package.json#/definitions/structure"
+ },
+ {
+ "required": [
+ "name",
+ "version"
+ ]
+ }
+ ]
+ },
+ "standard": {
+ "allOf": [
+ {
+ "$ref": "lib://package.json#/definitions/structure"
+ },
+ {
+ "required": [
+ "name",
+ "version",
+ "description",
+ "keywords",
+ "author",
+ "homepage",
+ "repository",
+ "bugs",
+ "licenses",
+ "engines",
+ "main",
+ "scripts",
+ "dependencies",
+ "devDependencies"
+ ],
+ "properties": {
+ "scripts": {
+ "type": "object",
+ "properties": {
+ "test": {
+ "type" : "string",
+ "pattern": "[a-zA-Z]"
+ }
+ }
+ },
+ "author": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ },
+ "contributors": {
+ "type": "array",
+ "items": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ },
+ "maintainers": {
+ "type": "array",
+ "items": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ }
+ }
+ }
+ ]
+ },
+ "structure": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "$ref": "lib://package.json#/definitions/name"
+ },
+ "version": {
+ "$ref": "lib://package.json#/definitions/semver"
+ },
+ "description": {
+ "type": "string",
+ "minLength": 1
+ },
+ "keywords": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/name"
+ }
+ },
+ "author": {
+ "$ref": "lib://package.json#/definitions/person"
+ },
+ "contributors": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/person"
+ }
+ },
+ "maintainers": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/person"
+ }
+ },
+ "homepage": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ "repository": {
+ "$ref": "lib://package.json#/definitions/repository"
+ },
+ "man": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ }
+ ]
+ },
+ "bugs": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ {
+ "type": "object",
+ "required": [
+ "url"
+ ],
+ "properties": {
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ "email": {
+ "$ref": "lib://package.json#/definitions/email"
+ }
+ }
+ }
+ ]
+ },
+ "license": {
+ "$ref": "lib://package.json#/definitions/licence"
+ },
+ "licenses": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/licence"
+ }
+ },
+ "private": {
+ "type": "boolean"
+ },
+ "preferGlobal": {
+ "type": "boolean"
+ },
+ "engines": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ },
+ "engineStrict": {
+ "type": "boolean"
+ },
+ "main": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "bin": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ {
+
+ "$ref": "lib://package.json#/definitions/path-map"
+ }
+ ]
+ },
+ "files": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ },
+ "os": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/identifier"
+ }
+ },
+ "cpu": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/identifier"
+ }
+ },
+ "config": {
+ "type": "object"
+ },
+ "publishConfig": {
+ "type": "object"
+ },
+ "directories": {
+ "type": "object",
+ "properties": {
+ "lib": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "bin": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "man": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "doc": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "example": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ }
+ },
+ "scripts": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ },
+ "dependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "devDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "bundledDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "bundleDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "optionalDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "peerDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ }
+ }
+ },
+ "uri-http": {
+ "type": "string",
+ "pattern": "^https?:\/\/"
+ },
+ "email": {
+ "type": "string",
+ "pattern": "^([0-9a-zA-Z]([-\\.\\w]*[0-9a-zA-Z])*@([0-9a-zA-Z][-\\w]*[0-9a-zA-Z]\\.)+[a-zA-Z]{2,9})$"
+ },
+ "path": {
+ "type": "string",
+ "minLength": 1
+ },
+ "name": {
+ "type": "string",
+ "pattern": "^[A-Za-z](?:[_\\.-]?[A-Za-z0-9]+)*$"
+ },
+ "identifier": {
+ "type": "string",
+ "pattern": "^[A-Za-z](?:[_-]?[A-Za-z0-9]+)*$"
+ },
+ "semver": {
+ "type": "string",
+ "pattern": "^\\d+\\.\\d+\\.\\d+(?:-[a-z]+(?:[_\\.-]*[a-z0-9]+)*)*$"
+ },
+ "type-url": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "url"
+ ],
+ "properties": {
+ "type": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+
+ },
+ "repository": {
+ "$ref": "lib://package.json#/definitions/type-url"
+ },
+ "licence": {
+ "oneOf": [
+ {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ {
+ "$ref": "lib://package.json#/definitions/licence-object"
+ }
+ ]
+ },
+ "licence-object": {
+ "type": "object",
+ "additionalProperties": false,
+ "properties": {
+ "type": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+ },
+ "person": {
+ "oneOf": [
+ {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ ]
+ },
+ "person-object": {
+ "type": "object",
+ "required": [
+ "name"
+ ],
+ "properties": {
+ "name": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "email": {
+ "$ref": "lib://package.json#/definitions/email"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+ },
+ "string-map": {
+ "type": "object",
+ "additionalProperties": false,
+ "patternProperties": {
+ ".+": {
+ "type": "string"
+ }
+ }
+ },
+ "path-map": {
+ "type": "object",
+ "additionalProperties": false,
+ "patternProperties": {
+ ".+": {
+ "$ref": "lib://package.json#/definitions/path",
+ "pattern": "[a-zA-Z]"
+ }
+ }
+ },
+ "dependency-map": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ }
+ }
+}
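The schema above targets JSON Schema draft-04 and exposes its named definitions under the "lib://package.json" id. A minimal usage sketch (not part of the patch) with the Python jsonschema library; the RefResolver API is assumed here, which exists in older jsonschema releases but is deprecated in recent ones:

    import json
    import jsonschema

    # Load the schema bundled by this diff.
    with open("swh/indexer/data/package-json/schema.json") as f:
        schema = json.load(f)

    # Resolve "lib://package.json#/definitions/..." refs against the schema's
    # own "id", then validate a package.json document against the "minimal"
    # definition (only "name" and "version" are required there).
    resolver = jsonschema.RefResolver.from_schema(schema)
    validator = jsonschema.Draft4Validator(
        {"$ref": "lib://package.json#/definitions/minimal"}, resolver=resolver)
    validator.validate({"name": "example-package", "version": "1.0.0"})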
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 836d77d..8b2df2d 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,335 +1,335 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import logging
from copy import deepcopy
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
class ContentMetadataIndexer(ContentIndexer):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing translated_metadata by given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- store result in content_metadata table
"""
# Note: This is used when the content metadata indexer is run alone
# (which is not the case when it is driven by the RevisionMetadataIndexer)
CONFIG_BASE_FILENAME = 'indexer/content_metadata'
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
def index(self, id, data, log_suffix='unknown revision'):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful the translated_metadata keys will
be returned as None
"""
result = {
'id': id,
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
mapping_name = self.tool['tool_configuration']['context']
log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
result['translated_metadata'] = \
MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
"for content %s" % hashutil.hash_to_hex(id))
if result['translated_metadata'] is None:
return None
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
class RevisionMetadataIndexer(RevisionIndexer):
"""Revision-level indexer
This indexer is in charge of:
- filtering revisions already indexed in revision_metadata table with
defined computation tool
- retrieve all entry_files in root directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- store the results for revision
"""
CONFIG_BASE_FILENAME = 'indexer/revision_metadata'
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': list(MAPPINGS),
},
}),
}
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_metadata_missing((
{
'id': sha1_git,
'indexer_configuration_id': self.tool['id'],
} for sha1_git in sha1_gits
))
def index(self, rev):
"""Index rev by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- if multiple file detected -> translation needed at revision level
Args:
rev (dict): revision artifact from storage
Returns:
dict: dictionary representing a revision_metadata, with keys:
- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- translated_metadata: dict of retrieved metadata
"""
result = {
'id': rev['id'],
'indexer_configuration_id': self.tool['id'],
'mappings': None,
'translated_metadata': None
}
try:
root_dir = rev['directory']
dir_ls = self.storage.directory_ls(root_dir, recursive=False)
files = [entry for entry in dir_ls if entry['type'] == 'file']
detected_files = detect_metadata(files)
(mappings, metadata) = self.translate_revision_metadata(
detected_files,
log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
result['mappings'] = mappings
result['translated_metadata'] = metadata
except Exception as e:
self.log.exception(
'Problem when indexing rev: %r', e)
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of revision_metadata, dict with the
following keys:
- id (bytes): revision's identifier (sha1_git)
- mappings ([str]): list of mappings used to translate the metadata
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in revision_metadata
self.idx_storage.revision_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def translate_revision_metadata(self, detected_files, log_suffix):
"""
Determine plan of action to translate metadata when containing
one or multiple detected files:
Args:
detected_files (dict): dictionary mapping context names (e.g.,
"npm", "authors") to list of sha1
Returns:
(List[str], dict): list of mappings used and dict with
translated metadata according to the CodeMeta vocabulary
"""
used_mappings = [MAPPINGS[context].name for context in detected_files]
translated_metadata = []
tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': None
},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {
k: self.config[k]
for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
}
config['tools'] = [tool]
for context in detected_files.keys():
cfg = deepcopy(config)
cfg['tools'][0]['configuration']['context'] = context
c_metadata_indexer = ContentMetadataIndexer(config=cfg)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context])
for c in metadata_generator:
# extracting translated_metadata
sha1 = c['id']
sha1s_in_storage.append(sha1)
local_metadata = c['translated_metadata']
# local metadata is aggregated
if local_metadata:
translated_metadata.append(local_metadata)
sha1s_filtered = [item for item in detected_files[context]
if item not in sha1s_in_storage]
if sha1s_filtered:
# content indexing
try:
c_metadata_indexer.run(sha1s_filtered,
policy_update='ignore-dups',
log_suffix=log_suffix)
# on the fly possibility:
for result in c_metadata_indexer.results:
local_metadata = result['translated_metadata']
translated_metadata.append(local_metadata)
except Exception:
self.log.exception(
"Exception while indexing metadata on contents")
# transform translated_metadata into min set with swh-metadata-detector
min_metadata = extract_minimal_metadata_dict(translated_metadata)
return (used_mappings, min_metadata)
class OriginMetadataIndexer(OriginIndexer):
CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'
ADDITIONAL_CONFIG = {
'tools': ('list', [])
}
USE_TOOLS = False
def __init__(self):
super().__init__()
self.origin_head_indexer = OriginHeadIndexer()
self.revision_metadata_indexer = RevisionMetadataIndexer()
def index_list(self, origins):
head_rev_ids = []
for origin in origins:
head_result = self.origin_head_indexer.index(origin)
if not head_result:
continue
head_rev_ids.append(head_result['revision_id'])
head_revs = list(self.storage.revision_get(head_rev_ids))
assert len(head_revs) == len(head_rev_ids)
results = []
- for (orig, rev) in zip(origins, head_revs):
+ for (origin, rev) in zip(origins, head_revs):
if not rev:
self.warning('Missing head revision %s of origin %r',
(hashutil.hash_to_bytes(rev['id']), origin))
continue
rev_metadata = self.revision_metadata_indexer.index(rev)
orig_metadata = {
'from_revision': rev_metadata['id'],
'origin_id': origin['id'],
'metadata': rev_metadata['translated_metadata'],
'mappings': rev_metadata['mappings'],
'indexer_configuration_id':
rev_metadata['indexer_configuration_id'],
}
results.append((orig_metadata, rev_metadata))
return results
def persist_index_computations(self, results, policy_update):
conflict_update = (policy_update == 'update-dups')
# Deduplicate revisions
rev_metadata = []
orig_metadata = []
for (orig_item, rev_item) in results:
if rev_item not in rev_metadata:
rev_metadata.append(rev_item)
if rev_item not in orig_metadata:
orig_metadata.append(orig_item)
self.idx_storage.revision_metadata_add(
rev_metadata, conflict_update=conflict_update)
self.idx_storage.origin_intrinsic_metadata_add(
orig_metadata, conflict_update=conflict_update)
@click.command()
@click.option('--revs', '-i',
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
rev_metadata_indexer = RevisionMetadataIndexer()
rev_metadata_indexer.run(_git_sha1s, 'update-dups')
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
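As ContentMetadataIndexer.index above shows, translation is delegated to the mapping selected by the tool configuration's 'context'. A hedged standalone sketch of that call pattern, assuming swh.indexer is installed; the positional log_suffix argument mirrors the MAPPINGS[mapping_name](log_suffix) call visible in index():

    from swh.indexer.metadata_dictionary import MAPPINGS

    # Raw bytes of a (truncated) package.json, as the content indexer would
    # read them from the objstorage.
    raw = b'{"name": "yarn-parser", "version": "1.0.0", "license": "AGPL-3.0"}'

    # Same call pattern as ContentMetadataIndexer.index: pick the 'npm' mapping
    # and translate the raw content into a CodeMeta-style dict.
    translated = MAPPINGS['npm']('example content').translate(raw)
    print(translated.get('name'), translated.get('license'))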
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 2d5e653..fb72a5a 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,163 +1,188 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from unittest.mock import patch
from swh.model.hashutil import hash_to_bytes
from swh.indexer.metadata import OriginMetadataIndexer
from .utils import BASE_TEST_CONFIG, YARN_PARSER_METADATA
from .test_metadata import REVISION_METADATA_CONFIG
ORIGIN_HEAD_CONFIG = {
**BASE_TEST_CONFIG,
'tools': {
'name': 'origin-metadata',
'version': '0.0.1',
'configuration': {},
},
'tasks': {
'revision_metadata': 'revision_metadata',
'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
}
}
@pytest.fixture
def origin_metadata_indexer():
prefix = 'swh.indexer.'
suffix = '.parse_config_file'
with patch(prefix + 'metadata.OriginMetadataIndexer' + suffix) as omi, \
patch(prefix + 'origin_head.OriginHeadIndexer' + suffix) as ohi, \
patch(prefix + 'metadata.RevisionMetadataIndexer' + suffix) as rmi:
omi.return_value = BASE_TEST_CONFIG
ohi.return_value = ORIGIN_HEAD_CONFIG
rmi.return_value = REVISION_METADATA_CONFIG
yield OriginMetadataIndexer()
def test_origin_metadata_indexer(
idx_storage, storage, obj_storage, origin_metadata_indexer):
indexer = OriginMetadataIndexer()
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
origin = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
'id': rev_id,
'translated_metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
origin_metadata = {
'origin_id': origin['id'],
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
for result in results:
del result['tool']
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
for result in results:
del result['tool']
assert results == [origin_metadata]
-def test_origin_metadata_indexer_duplicates(
+def test_origin_metadata_indexer_duplicate_origin(
idx_storage, storage, obj_storage, origin_metadata_indexer):
indexer = OriginMetadataIndexer()
indexer.storage = storage
indexer.idx_storage = idx_storage
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2)
origin = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
assert len(results) == 1
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
assert len(results) == 1
def test_origin_metadata_indexer_missing_head(
idx_storage, storage, obj_storage, origin_metadata_indexer):
storage.origin_add([{
'type': 'git',
'url': 'https://example.com'
}])
indexer = OriginMetadataIndexer()
indexer.run(["git+https://example.com"])
origin = storage.origin_get({
'type': 'git',
'url': 'https://example.com'})
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
assert results == []
def test_origin_metadata_indexer_partial_missing_head(
idx_storage, storage, obj_storage, origin_metadata_indexer):
storage.origin_add([{
'type': 'git',
'url': 'https://example.com'
}])
indexer = OriginMetadataIndexer()
indexer.run(["git+https://example.com",
"git+https://github.com/librariesio/yarn-parser"])
origin1 = storage.origin_get({
'type': 'git',
'url': 'https://example.com'})
origin2 = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
'id': rev_id,
'translated_metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
origin_metadata = {
'origin_id': origin2['id'],
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
for result in results:
del result['tool']
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin1['id'], origin2['id']]))
for result in results:
del result['tool']
assert results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_duplicate_revision(
+ idx_storage, storage, obj_storage, origin_metadata_indexer):
+ indexer = OriginMetadataIndexer()
+ indexer.storage = storage
+ indexer.idx_storage = idx_storage
+ indexer.run(["git+https://github.com/librariesio/yarn-parser",
+ "git+https://github.com/librariesio/yarn-parser.git"])
+
+ origin1 = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin2 = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser.git'})
+ assert origin1['id'] != origin2['id']
+ rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+ results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+ assert len(results) == 1
+
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+ origin1['id'], origin2['id']]))
+ assert len(results) == 2
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index c19bb7e..2cfc437 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -1,666 +1,678 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import datetime
import hashlib
import random
from hypothesis import strategies
from swh.model import hashutil
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
BASE_TEST_CONFIG = {
'storage': {
'cls': 'memory',
'args': {
},
},
'objstorage': {
'cls': 'memory',
'args': {
},
},
INDEXER_CFG_KEY: {
'cls': 'memory',
'args': {
},
},
}
ORIGINS = [
{
'id': 52189575,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/SoftwareHeritage/swh-storage'},
{
'id': 4423668,
'lister': None,
'project': None,
'type': 'ftp',
'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
{
'id': 77775770,
'lister': None,
'project': None,
'type': 'deposit',
'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
{
'id': 85072327,
'lister': None,
'project': None,
'type': 'pypi',
'url': 'https://pypi.org/project/limnoria/'},
{
'id': 49908349,
'lister': None,
'project': None,
'type': 'svn',
'url': 'http://0-512-md.googlecode.com/svn/'},
{
'id': 54974445,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'},
+ {
+ 'id': 54974446,
+ 'lister': None,
+ 'project': None,
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser.git'},
]
SNAPSHOTS = {
52189575: {
'branches': {
b'refs/heads/add-revision-origin-cache': {
'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
b's\xe7/\xe9l\x1e',
'target_type': 'revision'},
b'HEAD': {
'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
b'\xac\xefrm',
'target_type': 'revision'},
b'refs/tags/v0.0.103': {
'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
b'\x0f\xdd',
'target_type': 'release'},
}},
4423668: {
'branches': {
b'3DLDF-1.1.4.tar.gz': {
'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
b'"G\x99\x11',
'target_type': 'revision'},
b'3DLDF-2.0.2.tar.gz': {
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
'target_type': 'revision'},
b'3DLDF-2.0.3-examples.tar.gz': {
'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
b'\xfe\xadZ\x80\x80\xc1\x83\xff',
'target_type': 'revision'},
b'3DLDF-2.0.3.tar.gz': {
'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
b'\xcc\x1a\xb4`\x8c\x8by',
'target_type': 'revision'},
b'3DLDF-2.0.tar.gz': {
'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
b'\xd3\xd1m',
'target_type': 'revision'}
}},
77775770: {
'branches': {
b'master': {
'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
b'\xa6\xe9\x99\xb1\x9e]q\xeb',
'target_type': 'revision'}
},
'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
b"\x1d\r "},
85072327: {
'branches': {
b'HEAD': {
'target': b'releases/2018.09.09',
'target_type': 'alias'},
b'releases/2018.09.01': {
'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
b'\xbb\xdfF\xfdw\xcf',
'target_type': 'revision'},
b'releases/2018.09.09': {
'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
b'A\x10\x9d\xc5\xfa2\xf8t',
'target_type': 'revision'}},
'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
b'\x12\x9e\xd6\xb3'},
49908349: {
'branches': {
b'master': {
'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
b'\xc9\xad#.\x1bw=\x18',
'target_type': 'revision'}},
'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
b'\x05\xea\xb8\x1f\xc4H\xf4s'},
54974445: {
'branches': {
b'HEAD': {
'target': hash_to_bytes(
'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
- 'target_type': 'revision'}}}
+ 'target_type': 'revision'}}},
+ 54974446: {
+ 'branches': {
+ b'HEAD': {
+ 'target': hash_to_bytes(
+ '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
+ 'target_type': 'revision'}}},
}
REVISIONS = [{
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'author': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'committer': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'synthetic': False,
'date': {
'negative_utc': False,
'timestamp': {
'seconds': 1487596456,
'microseconds': 0
},
'offset': 0
},
'directory': b'10'
}]
DIRECTORY_ID = b'10'
DIRECTORY = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'sha1': b'cde'
},
{
'target': b'11',
'type': 'dir',
'length': None,
'name': b'.github',
'sha1': None,
'perms': 16384,
'sha1_git': None,
'status': None,
'sha256': None
}
]
SHA1_TO_LICENSES = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
'02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
'103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
'688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
}
SHA1_TO_CTAGS = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{
'name': 'foo',
'kind': 'str',
'line': 10,
'lang': 'bar',
}],
'd4c647f0fc257591cc9ba1722484229780d1c607': [{
'name': 'let',
'kind': 'int',
'line': 100,
'lang': 'haskell',
}],
'688a5ef812c53907562fe379d4b3851e69c7cb15': [{
'name': 'symbol',
'kind': 'float',
'line': 99,
'lang': 'python',
}],
}
OBJ_STORAGE_DATA = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
'688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
'8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
'02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
import unittest
import logging
from swh.indexer.mimetype import MimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
'103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
""",
'26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
""",
'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
'636465': b"""
{
"name": "yarn-parser",
"version": "1.0.0",
"description": "Tiny web service for parsing yarn.lock files",
"main": "index.js",
"scripts": {
"start": "node index.js",
"test": "mocha"
},
"engines": {
"node": "9.8.0"
},
"repository": {
"type": "git",
"url": "git+https://github.com/librariesio/yarn-parser.git"
},
"keywords": [
"yarn",
"parse",
"lock",
"dependencies"
],
"author": "Andrew Nesbitt",
"license": "AGPL-3.0",
"bugs": {
"url": "https://github.com/librariesio/yarn-parser/issues"
},
"homepage": "https://github.com/librariesio/yarn-parser#readme",
"dependencies": {
"@yarnpkg/lockfile": "^1.0.0",
"body-parser": "^1.15.2",
"express": "^4.14.0"
},
"devDependencies": {
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
"test": "^0.6.0"
}
}
"""
}
YARN_PARSER_METADATA = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'codeRepository':
'git+git+https://github.com/librariesio/yarn-parser.git',
'author': [{
'type': 'Person',
'name': 'Andrew Nesbitt'
}],
'license': 'https://spdx.org/licenses/AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
}
json_dict_keys = strategies.one_of(
strategies.characters(),
*map(strategies.just, ['type', 'url', 'name', 'email', '@id',
'@context', 'repository', 'license',
]),
)
"""Hypothesis strategy that generates strings, with an emphasis on those
that are often used as dictionary keys in metadata files."""
generic_json_document = strategies.recursive(
strategies.none() | strategies.booleans() | strategies.floats() |
strategies.characters(),
lambda children: (
strategies.lists(children, 1) |
strategies.dictionaries(json_dict_keys, children, min_size=1)
)
)
"""Hypothesis strategy that generates possible values for values of JSON
metadata files."""
def json_document_strategy(keys=None):
"""Generates an hypothesis strategy that generates metadata files
for a format that uses the given keys."""
if keys is None:
keys = strategies.characters()
else:
keys = strategies.one_of(map(strategies.just, keys))
return strategies.dictionaries(keys, generic_json_document, min_size=1)
def filter_dict(d, keys):
'return a copy of the dict with keys deleted'
if not isinstance(keys, (list, tuple)):
keys = (keys, )
return dict((k, v) for (k, v) in d.items() if k not in keys)
def fill_obj_storage(obj_storage):
"""Add some content in an object storage."""
for (obj_id, content) in OBJ_STORAGE_DATA.items():
obj_storage.add(content, obj_id=hash_to_bytes(obj_id))
def fill_storage(storage):
for origin in ORIGINS:
origin = origin.copy()
del origin['id']
storage.origin_add_one(origin)
for (orig_pseudo_id, snap) in SNAPSHOTS.items():
for orig in ORIGINS:
if orig_pseudo_id == orig['id']:
origin_id = storage.origin_get(
{'type': orig['type'], 'url': orig['url']})['id']
break
else:
assert False
visit = storage.origin_visit_add(origin_id, datetime.datetime.now())
snap_id = snap.get('id') or \
bytes([random.randint(0, 255) for _ in range(32)])
storage.snapshot_add(origin_id, visit['visit'], {
'id': snap_id,
'branches': snap['branches']
})
storage.revision_add(REVISIONS)
storage.directory_add([{
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
for (obj_id, content) in OBJ_STORAGE_DATA.items():
# TODO: use MultiHash
if hasattr(hashlib, 'blake2s'):
blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
else:
# fallback for Python <3.6
blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
storage.content_add([{
'data': content,
'length': len(content),
'status': 'visible',
'sha1': hash_to_bytes(obj_id),
'sha1_git': hash_to_bytes(obj_id),
'sha256': hashlib.sha256(content).digest(),
'blake2s256': blake2s256
}])
class CommonContentIndexerTest(metaclass=abc.ABCMeta):
legacy_get_format = False
"""True if and only if the tested indexer uses the legacy format.
see: https://forge.softwareheritage.org/T1433
"""
def get_indexer_results(self, ids):
"""Override this for indexers that don't have a mock storage."""
return self.indexer.idx_storage.state
def assert_legacy_results_ok(self, sha1s, expected_results=None):
# XXX old format, remove this when all endpoints are
# updated to the new one
# see: https://forge.softwareheritage.org/T1433
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
for sha1 in sha1s]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(len(expected_results), len(actual_results),
(expected_results, actual_results))
for indexed_data in actual_results:
_id = indexed_data['id']
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data['id'] = _id
self.assertEqual(indexed_data, expected_data)
def assert_results_ok(self, sha1s, expected_results=None):
if self.legacy_get_format:
self.assert_legacy_results_ok(sha1s, expected_results)
return
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
for sha1 in sha1s]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(len(expected_results), len(actual_results),
(expected_results, actual_results))
for indexed_data in actual_results:
(_id, indexed_data) = list(indexed_data.items())[0]
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data = [expected_data]
self.assertEqual(indexed_data, expected_data)
def test_index(self):
"""Known sha1 have their data indexed
"""
sha1s = [self.id0, self.id1, self.id2]
# when
self.indexer.run(sha1s, policy_update='update-dups')
self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
sha1s = [self.id1,
'799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
'800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
# when
self.indexer.run(sha1s, policy_update='update-dups')
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def setUp(self):
self.contents = sorted(OBJ_STORAGE_DATA)
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
assert isinstance(_id, bytes)
indexed_data = indexed_data.copy()
indexed_data['id'] = hash_to_hex(indexed_data['id'])
self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(
start, end, actual_results, expected_results)
def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
# then
self.assertTrue(actual_results)
def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Input are in bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run( # checks the bytes input this time
start, end, skip_existing=False)
# no already indexed data so same result as prior test
# then
self.assertTrue(actual_results)
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
_start, _end = ['0000000000000000000000000000000000000000',
'0000000000000000000000000000000000000001']
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)
# then
self.assertFalse(actual_results)
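The helpers above (fill_storage, fill_obj_storage) load the fixture data into whatever backends the tests configure. A hedged wiring sketch using the in-memory backends that BASE_TEST_CONFIG declares; the exact get_storage/get_objstorage signatures are assumed from the swh APIs of that period:

    from swh.storage import get_storage
    from swh.objstorage import get_objstorage
    from swh.indexer.tests.utils import fill_storage, fill_obj_storage

    # In-memory backends, matching the 'cls': 'memory' entries of BASE_TEST_CONFIG.
    storage = get_storage('memory', {})
    objstorage = get_objstorage('memory', {})

    fill_storage(storage)         # origins, snapshots, revisions, directories
    fill_obj_storage(objstorage)  # raw file contents (OBJ_STORAGE_DATA)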
