
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 0a5f0c7..1ee09bb 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,340 +1,340 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import logging
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
class ContentMetadataIndexer(ContentIndexer):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing translated_metadata for the given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- storing the result in the content_metadata table
"""
CONFIG_BASE_FILENAME = 'indexer/content_metadata'
def __init__(self, tool, config):
# somewhat twisted way to reuse the exact same config as the
# RevisionMetadataIndexer object, which internally uses ContentMetadataIndexer
self.config = config
self.config['tools'] = tool
super().__init__()
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful, the translated_metadata key
will be None
"""
result = {
'id': id,
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
mapping_name = self.tool['tool_configuration']['context']
result['translated_metadata'] = MAPPINGS[mapping_name] \
.translate(data)
# a twisted way to keep result with indexer object for get_results
self.results.append(result)
except Exception:
self.log.exception(
"Problem during tool retrieval of metadata translation")
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def get_results(self):
"""can be called only if run method was called before
Returns:
list: list of content_metadata entries calculated by
current indexer
"""
return self.results
class RevisionMetadataIndexer(RevisionIndexer):
"""Revision-level indexer
This indexer is in charge of:
- filtering revisions already indexed in the revision_metadata table with
the defined computation tool
- retrieving all entry_files in the root directory
- using metadata_detector on file names that may contain metadata
- computing the metadata translation if necessary and possible (depends on the tool)
- sending sha1s to content indexing if possible
- storing the results for the revision
"""
CONFIG_BASE_FILENAME = 'indexer/revision_metadata'
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': ['NpmMapping', 'CodemetaMapping']
},
}),
}
ContentMetadataIndexer = ContentMetadataIndexer
def prepare(self):
super().prepare()
self.tool = self.tools[0]
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_metadata_missing((
{
'id': sha1_git,
'indexer_configuration_id': self.tool['id'],
} for sha1_git in sha1_gits
))
def index(self, rev):
"""Index rev by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- if multiple file detected -> translation needed at revision level
Args:
rev (bytes): revision artifact from storage
Returns:
dict: dictionary representing a revision_metadata, with keys:
- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- translated_metadata: dict of retrieved metadata
"""
result = {
- 'id': rev['id'].decode(),
+ 'id': rev['id'],
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
root_dir = rev['directory']
dir_ls = self.storage.directory_ls(root_dir, recursive=False)
files = [entry for entry in dir_ls if entry['type'] == 'file']
detected_files = detect_metadata(files)
result['translated_metadata'] = self.translate_revision_metadata(
detected_files)
except Exception as e:
self.log.exception(
'Problem when indexing rev: %r', e)
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of revision_metadata, dict with the
following keys:
- id (bytes): revision's identifier (sha1_git)
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in revision_metadata
self.idx_storage.revision_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def translate_revision_metadata(self, detected_files):
"""
Determine the plan of action to translate metadata, given one or
multiple detected metadata files:
Args:
detected_files (dict): dictionary mapping context names (e.g.,
"npm", "authors") to lists of sha1s
Returns:
dict: dict with translated metadata according to the CodeMeta
vocabulary
"""
translated_metadata = []
tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': None
},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {
INDEXER_CFG_KEY: self.idx_storage,
'objstorage': self.objstorage
}
for context in detected_files.keys():
tool['configuration']['context'] = context
c_metadata_indexer = self.ContentMetadataIndexer(tool, config)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context])
for c in metadata_generator:
# extracting translated_metadata
sha1 = c['id']
sha1s_in_storage.append(sha1)
local_metadata = c['translated_metadata']
# local metadata is aggregated
if local_metadata:
translated_metadata.append(local_metadata)
sha1s_filtered = [item for item in detected_files[context]
if item not in sha1s_in_storage]
if sha1s_filtered:
# schedule indexation of content
try:
c_metadata_indexer.run(sha1s_filtered,
policy_update='ignore-dups')
# on the fly possibility:
results = c_metadata_indexer.get_results()
for result in results:
local_metadata = result['translated_metadata']
translated_metadata.append(local_metadata)
except Exception as e:
self.log.warning("""Exception while indexing content""", e)
# transform translated_metadata into min set with swh-metadata-detector
min_metadata = extract_minimal_metadata_dict(translated_metadata)
return min_metadata
class OriginMetadataIndexer(OriginIndexer):
CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'
ADDITIONAL_CONFIG = {
'tools': ('list', [])
}
def check(self, **kwargs):
kwargs['check_tools'] = False
super().check(**kwargs)
def filter(self, ids):
return ids
def run(self, origin_head, policy_update):
"""Expected to be called with the result of RevisionMetadataIndexer
as first argument; ie. not a list of ids as other indexers would.
Args:
- * `origin_head` (dict): {str(origin_id): rev_id.encode()}
+ * `origin_head` (dict): {str(origin_id): rev_id}
mapping textual origin ids to hex revision ids, as produced
by OriginHeadIndexer.
* `policy_update`: `'ignore-dups'` or `'update-dups'`
"""
- origin_head_map = {int(origin_id): rev_id
+ origin_head_map = {int(origin_id): hashutil.hash_to_bytes(rev_id)
for (origin_id, rev_id) in origin_head.items()}
# Fix up the argument order. revisions_metadata has to be the
# first argument because of celery.chain; the next line calls
# run() with the usual order, ie. origin ids first.
return super().run(ids=list(origin_head_map),
policy_update=policy_update,
parse_ids=False,
origin_head_map=origin_head_map)
def index(self, origin, *, origin_head_map):
# Get the last revision of the origin.
revision_id = origin_head_map[origin['id']]
revision_metadata = self.idx_storage \
.revision_metadata_get([revision_id])
for item in revision_metadata:
assert item['id'] == revision_id
# Get the metadata of that revision, and return it
return {
'origin_id': origin['id'],
'metadata': item['translated_metadata'],
'from_revision': revision_id,
'indexer_configuration_id':
item['indexer_configuration_id'],
}
def persist_index_computations(self, results, policy_update):
self.idx_storage.origin_intrinsic_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
@click.command()
@click.option('--revs', '-i',
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
rev_metadata_indexer = RevisionMetadataIndexer()
rev_metadata_indexer.run(_git_sha1s, 'update-dups')
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
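Note on the change above: revision identifiers are now kept as raw bytes inside the indexers and only rendered as hexadecimal strings at task boundaries. A minimal sketch of that round-trip with swh.model.hashutil (the identifier is the one used in the tests further down):

from swh.model.hashutil import hash_to_bytes, hash_to_hex

hex_id = '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'  # 40 hex characters
raw_id = hash_to_bytes(hex_id)                       # 20-byte sha1_git

assert len(raw_id) == 20
assert hash_to_hex(raw_id) == hex_id
# The indexer storage expects the bytes form; JSON task payloads carry the hex form.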
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
index 424fb57..35ea767 100644
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -1,219 +1,221 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import click
import logging
from swh.scheduler import get_scheduler
from swh.scheduler.utils import create_task_dict
from swh.indexer.indexer import OriginIndexer
+from swh.model.hashutil import hash_to_hex
+
class OriginHeadIndexer(OriginIndexer):
"""Origin-level indexer.
This indexer is in charge of looking up the revision that acts as the
"head" of an origin.
In git, this is usually the commit pointed to by the 'master' branch."""
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'origin-metadata',
'version': '0.0.1',
'configuration': {},
}),
'tasks': ('dict', {
'revision_metadata': 'revision_metadata',
'origin_intrinsic_metadata': 'origin_metadata',
})
}
CONFIG_BASE_FILENAME = 'indexer/origin_head'
def filter(self, ids):
yield from ids
def persist_index_computations(self, results, policy_update):
"""Do nothing. The indexer's results are not persistent, they
should only be piped to another indexer."""
pass
def next_step(self, results, task):
"""Once the head is found, call the RevisionMetadataIndexer
on these revisions, then call the OriginMetadataIndexer with
both the origin_id and the revision metadata, so it can copy the
revision metadata to the origin's metadata.
Args:
results (Iterable[dict]): Iterable of return values from `index`.
"""
super().next_step(results, task)
revision_metadata_task = self.config['tasks']['revision_metadata']
origin_intrinsic_metadata_task = self.config['tasks'][
'origin_intrinsic_metadata']
if revision_metadata_task is None and \
origin_intrinsic_metadata_task is None:
return
assert revision_metadata_task is not None
assert origin_intrinsic_metadata_task is not None
# Second task to run after this one: copy the revision's metadata
# to the origin
sub_task = create_task_dict(
origin_intrinsic_metadata_task,
'oneshot',
origin_head={
str(result['origin_id']):
- result['revision_id'].decode()
+ hash_to_hex(result['revision_id'])
for result in results},
policy_update='update-dups',
)
del sub_task['next_run'] # Not json-serializable
# First task to run after this one: index the metadata of the
# revision
task = create_task_dict(
revision_metadata_task,
'oneshot',
- ids=[res['revision_id'].decode() for res in results],
+ ids=[hash_to_hex(res['revision_id']) for res in results],
policy_update='update-dups',
next_step=sub_task,
)
if getattr(self, 'scheduler', None):
scheduler = self.scheduler
else:
scheduler = get_scheduler(**self.config['scheduler'])
scheduler.create_tasks([task])
# Dispatch
def index(self, origin):
origin_id = origin['id']
latest_snapshot = self.storage.snapshot_get_latest(origin_id)
method = getattr(self, '_try_get_%s_head' % origin['type'], None)
if method is None:
method = self._try_get_head_generic
rev_id = method(latest_snapshot)
if rev_id is None:
return None
result = {
'origin_id': origin_id,
'revision_id': rev_id,
}
return result
# VCSs
def _try_get_vcs_head(self, snapshot):
try:
if isinstance(snapshot, dict):
branches = snapshot['branches']
if branches[b'HEAD']['target_type'] == 'revision':
return branches[b'HEAD']['target']
except KeyError:
return None
_try_get_hg_head = _try_get_git_head = _try_get_vcs_head
# Tarballs
_archive_filename_re = re.compile(
rb'^'
rb'(?P<pkgname>.*)[-_]'
rb'(?P<version>[0-9]+(\.[0-9])*)'
rb'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?'
rb'(?P<extension>(\.[a-zA-Z0-9]+)+)'
rb'$')
@classmethod
def _parse_version(cls, filename):
"""Extracts the release version from an archive filename,
to get an ordering whose maximum is likely to be the last
version of the software
>>> OriginHeadIndexer._parse_version(b'foo')
(-inf,)
>>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
(-inf,)
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
(0, 0, 1, 0)
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
(0, 0, 1, -1, 'beta2')
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
(0, 0, 1, 1, 'foobar')
"""
res = cls._archive_filename_re.match(filename)
if res is None:
return (float('-infinity'),)
version = [int(n) for n in res.group('version').decode().split('.')]
if res.group('preversion') is None:
version.append(0)
else:
preversion = res.group('preversion').decode()
if preversion.startswith('-'):
version.append(-1)
version.append(preversion[1:])
elif preversion.startswith('+'):
version.append(1)
version.append(preversion[1:])
else:
assert False, res.group('preversion')
return tuple(version)
def _try_get_ftp_head(self, snapshot):
archive_names = list(snapshot['branches'])
max_archive_name = max(archive_names, key=self._parse_version)
r = self._try_resolve_target(snapshot['branches'], max_archive_name)
return r
# Generic
def _try_get_head_generic(self, snapshot):
# Works on 'deposit', 'svn', and 'pypi'.
try:
if isinstance(snapshot, dict):
branches = snapshot['branches']
except KeyError:
return None
else:
return (
self._try_resolve_target(branches, b'HEAD') or
self._try_resolve_target(branches, b'master')
)
def _try_resolve_target(self, branches, target_name):
try:
target = branches[target_name]
while target['target_type'] == 'alias':
target = branches[target['target']]
if target['target_type'] == 'revision':
return target['target']
elif target['target_type'] == 'content':
return None # TODO
elif target['target_type'] == 'directory':
return None # TODO
elif target['target_type'] == 'release':
return None # TODO
else:
assert False
except KeyError:
return None
@click.command()
@click.option('--origins', '-i',
help='Origins to lookup, in the "type+url" format',
multiple=True)
def main(origins):
rev_metadata_indexer = OriginHeadIndexer()
rev_metadata_indexer.run(origins)
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
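As a side note on _try_get_ftp_head above: it relies on _parse_version to order tarball branch names and takes the maximum. A small sketch of that ordering, with hypothetical filenames:

from swh.indexer.origin_head import OriginHeadIndexer

names = [b'pkg-1.0.tar.gz', b'pkg-2.0-beta1.tar.gz', b'pkg-2.0.tar.gz']
# _parse_version maps these to (1, 0, 0), (2, 0, -1, 'beta1') and (2, 0, 0);
# the pre-release sorts below the final release, so max() picks the 2.0 tarball.
assert max(names, key=OriginHeadIndexer._parse_version) == b'pkg-2.0.tar.gz'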
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
index c60692c..0fea30c 100644
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -1,396 +1,396 @@
# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.model import hashutil
from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes
from swh.storage.db import line_to_bytes, execute_values_to_bytes
class Db(BaseDb):
"""Proxy to the SWH Indexer DB, with wrappers around stored procedures
"""
content_mimetype_hash_keys = ['id', 'indexer_configuration_id']
def _missing_from_list(self, table, data, hash_keys, cur=None):
"""Read from table the data with hash_keys that are missing.
Args:
table (str): Table name (e.g content_mimetype, content_language,
etc...)
data (dict): Dict of data to read from
hash_keys ([str]): List of keys to read in the data dict.
Yields:
The data which is missing from the db.
"""
cur = self._cursor(cur)
keys = ', '.join(hash_keys)
equality = ' AND '.join(
('t.%s = c.%s' % (key, key)) for key in hash_keys
)
yield from execute_values_to_bytes(
cur, """
select %s from (values %%s) as t(%s)
where not exists (
select 1 from %s c
where %s
)
""" % (keys, keys, table, equality),
(tuple(m[k] for k in hash_keys) for m in data)
)
def content_mimetype_missing_from_list(self, mimetypes, cur=None):
"""List missing mimetypes.
"""
yield from self._missing_from_list(
'content_mimetype', mimetypes, self.content_mimetype_hash_keys,
cur=cur)
content_mimetype_cols = [
'id', 'mimetype', 'encoding',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_mimetype')
def mktemp_content_mimetype(self, cur=None): pass
def content_mimetype_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)",
(conflict_update, ))
def _convert_key(self, key, main_table='c'):
"""Convert keys according to specific use in the module.
Args:
key (str): Key expression to change according to the alias
used in the query
main_table (str): Alias to use for the main table. Default
to c for content_{something}.
Expected:
Tables content_{something} being aliased as 'c' (something
in {language, mimetype, ...}), table indexer_configuration
being aliased as 'i'.
"""
if key == 'id':
return '%s.id' % main_table
elif key == 'tool_id':
return 'i.id as tool_id'
elif key == 'licenses':
return '''
array(select name
from fossology_license
where id = ANY(
array_agg(%s.license_id))) as licenses''' % main_table
return key
def _get_from_list(self, table, ids, cols, cur=None, id_col='id'):
"""Fetches entries from the `table` such that their `id` field
(or whatever is given to `id_col`) is in `ids`.
Returns the columns `cols`.
The `cur`sor is used to connect to the database.
"""
cur = self._cursor(cur)
keys = map(self._convert_key, cols)
query = """
select {keys}
from (values %s) as t(id)
inner join {table} c
on c.{id_col}=t.id
inner join indexer_configuration i
on c.indexer_configuration_id=i.id;
""".format(
keys=', '.join(keys),
id_col=id_col,
table=table)
yield from execute_values_to_bytes(
cur, query,
((_id,) for _id in ids)
)
content_indexer_names = {
'mimetype': 'content_mimetype',
'fossology_license': 'content_fossology_license',
}
def content_get_range(self, content_type, start, end,
indexer_configuration_id, limit=1000,
with_textual_data=False, cur=None):
"""Retrieve contents with content_type, within range [start, end]
bound by limit and associated to the given indexer
configuration id.
When asking to work on textual content, this additionally filters on
the content_mimetype table, keeping only text/* mimetypes.
"""
cur = self._cursor(cur)
table = self.content_indexer_names[content_type]
if with_textual_data:
extra = """inner join content_mimetype cm
on (t.id=cm.id and cm.mimetype like 'text/%%')"""
else:
extra = ""
query = """select t.id
from %s t
inner join indexer_configuration ic
on t.indexer_configuration_id=ic.id
%s
where ic.id=%%s and
%%s <= t.id and t.id <= %%s
order by t.indexer_configuration_id, t.id
limit %%s""" % (table, extra)
cur.execute(query, (indexer_configuration_id, start, end, limit))
yield from cursor_to_bytes(cur)
def content_mimetype_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'content_mimetype', ids, self.content_mimetype_cols, cur=cur)
content_language_hash_keys = ['id', 'indexer_configuration_id']
def content_language_missing_from_list(self, languages, cur=None):
"""List missing languages.
"""
yield from self._missing_from_list(
'content_language', languages, self.content_language_hash_keys,
cur=cur)
content_language_cols = [
'id', 'lang',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_language')
def mktemp_content_language(self, cur=None): pass
def content_language_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_language_add(%s)",
(conflict_update, ))
def content_language_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'content_language', ids, self.content_language_cols, cur=cur)
content_ctags_hash_keys = ['id', 'indexer_configuration_id']
def content_ctags_missing_from_list(self, ctags, cur=None):
"""List missing ctags.
"""
yield from self._missing_from_list(
'content_ctags', ctags, self.content_ctags_hash_keys,
cur=cur)
content_ctags_cols = [
'id', 'name', 'kind', 'line', 'lang',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_ctags')
def mktemp_content_ctags(self, cur=None): pass
def content_ctags_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)",
(conflict_update, ))
def content_ctags_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
keys = map(self._convert_key, self.content_ctags_cols)
yield from execute_values_to_bytes(
cur, """
select %s
from (values %%s) as t(id)
inner join content_ctags c
on c.id=t.id
inner join indexer_configuration i
on c.indexer_configuration_id=i.id
order by line
""" % ', '.join(keys),
((_id,) for _id in ids)
)
def content_ctags_search(self, expression, last_sha1, limit, cur=None):
cur = self._cursor(cur)
if not last_sha1:
query = """SELECT %s
FROM swh_content_ctags_search(%%s, %%s)""" % (
','.join(self.content_ctags_cols))
cur.execute(query, (expression, limit))
else:
if last_sha1 and isinstance(last_sha1, bytes):
last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1)
elif last_sha1:
last_sha1 = '\\x%s' % last_sha1
query = """SELECT %s
FROM swh_content_ctags_search(%%s, %%s, %%s)""" % (
','.join(self.content_ctags_cols))
cur.execute(query, (expression, limit, last_sha1))
yield from cursor_to_bytes(cur)
content_fossology_license_cols = [
'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration',
'licenses']
@stored_procedure('swh_mktemp_content_fossology_license')
def mktemp_content_fossology_license(self, cur=None): pass
def content_fossology_license_add_from_temp(self, conflict_update,
cur=None):
"""Add new licenses per content.
"""
self._cursor(cur).execute(
"SELECT swh_content_fossology_license_add(%s)",
(conflict_update, ))
def content_fossology_license_get_from_list(self, ids, cur=None):
"""Retrieve licenses per id.
"""
cur = self._cursor(cur)
keys = map(self._convert_key, self.content_fossology_license_cols)
yield from execute_values_to_bytes(
cur, """
select %s
from (values %%s) as t(id)
inner join content_fossology_license c on t.id=c.id
inner join indexer_configuration i
on i.id=c.indexer_configuration_id
group by c.id, i.id, i.tool_name, i.tool_version,
i.tool_configuration;
""" % ', '.join(keys),
((_id,) for _id in ids)
)
content_metadata_hash_keys = ['id', 'indexer_configuration_id']
def content_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata.
"""
yield from self._missing_from_list(
'content_metadata', metadata, self.content_metadata_hash_keys,
cur=cur)
content_metadata_cols = [
'id', 'translated_metadata',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_metadata')
def mktemp_content_metadata(self, cur=None): pass
def content_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
(conflict_update, ))
def content_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'content_metadata', ids, self.content_metadata_cols, cur=cur)
revision_metadata_hash_keys = ['id', 'indexer_configuration_id']
def revision_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata.
"""
yield from self._missing_from_list(
'revision_metadata', metadata, self.revision_metadata_hash_keys,
cur=cur)
revision_metadata_cols = [
'id', 'translated_metadata',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_revision_metadata')
def mktemp_revision_metadata(self, cur=None): pass
def revision_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)",
(conflict_update, ))
def revision_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'revision_metadata', ids, self.revision_metadata_cols, cur=cur)
origin_intrinsic_metadata_cols = [
'origin_id', 'metadata', 'from_revision',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
origin_intrinsic_metadata_regconfig = 'pg_catalog.simple'
"""The dictionary used to normalize 'metadata' and queries.
'pg_catalog.simple' provides no stopwords, so it should be suitable
for proper names and non-English content.
When updating this value, make sure to add a new index on
origin_intrinsic_metadata.metadata."""
@stored_procedure('swh_mktemp_origin_intrinsic_metadata')
def mktemp_origin_intrinsic_metadata(self, cur=None): pass
def origin_intrinsic_metadata_add_from_temp(
self, conflict_update, cur=None):
cur = self._cursor(cur)
cur.execute(
"SELECT swh_origin_intrinsic_metadata_add(%s)",
(conflict_update, ))
def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None):
yield from self._get_from_list(
'origin_intrinsic_metadata', orig_ids,
self.origin_intrinsic_metadata_cols, cur=cur,
id_col='origin_id')
def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit,
cur=None):
regconfig = self.origin_intrinsic_metadata_regconfig
tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig
for _ in terms)
tsquery_args = [(term,) for term in terms]
keys = map(self._convert_key, self.origin_intrinsic_metadata_cols)
query = ("SELECT {keys} FROM origin_intrinsic_metadata AS oim "
"INNER JOIN indexer_configuration AS i "
"ON oim.indexer_configuration_id=i.id "
"JOIN LATERAL (SELECT {tsquery_template}) AS s(tsq) ON true "
"WHERE to_tsvector('{regconfig}', metadata) @@ tsq "
"ORDER BY ts_rank(oim.metadata_tsvector, tsq, 1) DESC "
"LIMIT %s;"
).format(keys=', '.join(keys),
regconfig=regconfig,
tsquery_template=tsquery_template)
cur.execute(query, tsquery_args + [limit])
- yield from cur
+ yield from cursor_to_bytes(cur)
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
@stored_procedure('swh_mktemp_indexer_configuration')
def mktemp_indexer_configuration(self, cur=None):
pass
def indexer_configuration_add_from_temp(self, cur=None):
cur = self._cursor(cur)
cur.execute("SELECT %s from swh_indexer_configuration_add()" % (
','.join(self.indexer_configuration_cols), ))
yield from cursor_to_bytes(cur)
def indexer_configuration_get(self, tool_name,
tool_version, tool_configuration, cur=None):
cur = self._cursor(cur)
cur.execute('''select %s
from indexer_configuration
where tool_name=%%s and
tool_version=%%s and
tool_configuration=%%s''' % (
','.join(self.indexer_configuration_cols)),
(tool_name, tool_version, tool_configuration))
data = cur.fetchone()
if not data:
return None
return line_to_bytes(data)
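For reference, _missing_from_list builds the same anti-join for every table; assuming the content_metadata case, the string formatting alone produces the query below (a sketch only, the actual values are bound separately by execute_values_to_bytes):

table = 'content_metadata'
hash_keys = ['id', 'indexer_configuration_id']
keys = ', '.join(hash_keys)
equality = ' AND '.join('t.%s = c.%s' % (key, key) for key in hash_keys)
query = """
    select %s from (values %%s) as t(%s)
    where not exists (
        select 1 from %s c
        where %s
    )
""" % (keys, keys, table, equality)
print(query)
# After substitution the statement reads:
#     select id, indexer_configuration_id from (values %s)
#         as t(id, indexer_configuration_id)
#     where not exists (
#         select 1 from content_metadata c
#         where t.id = c.id AND t.indexer_configuration_id = c.indexer_configuration_id
#     )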
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index f36cd1d..174c73c 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,478 +1,480 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
import logging
from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.metadata import ContentMetadataIndexer
from swh.indexer.metadata import RevisionMetadataIndexer
from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
from swh.indexer.tests.test_utils import MockIndexerStorage
+from swh.model.hashutil import hash_to_bytes
+
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def prepare(self):
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
self.results = []
class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
ContentMetadataIndexer = ContentMetadataTestIndexer
def prepare(self):
self.config = {
'storage': {
'cls': 'remote',
'args': {
'url': 'http://localhost:9999',
}
},
'tools': {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
}
self.storage = MockStorage()
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
self.results = []
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
"""
shows the entire diff in the results
"""
self.maxDiff = None
self.content_tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
MockIndexerStorage.added_data = []
def test_crosstable(self):
self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
'repository': 'http://schema.org/codeRepository',
'os': 'http://schema.org/operatingSystem',
'cpu': 'http://schema.org/processorRequirements',
'engines':
'http://schema.org/processorRequirements',
'author': 'http://schema.org/author',
'author.email': 'http://schema.org/email',
'author.name': 'http://schema.org/name',
'contributor': 'http://schema.org/contributor',
'keywords': 'http://schema.org/keywords',
'license': 'http://schema.org/license',
'version': 'http://schema.org/version',
'description': 'http://schema.org/description',
'name': 'http://schema.org/name',
'bugs': 'https://codemeta.github.io/terms/issueTracker',
'homepage': 'http://schema.org/url'
})
def test_compute_metadata_none(self):
"""
Translating empty content should return None.
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = MAPPINGS["NpmMapping"].translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'test_metadata',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
'schema:author': {
'type': 'Person',
'name': 'Morane G',
'email': 'moranegg@example.com',
},
}
# when
result = MAPPINGS["NpmMapping"].translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_extract_minimal_metadata_dict(self):
"""
Test the creation of a coherent minimal metadata set
"""
# given
metadata_list = [{
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_1',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
}, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_0_1',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test'
}, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_metadata',
'version': '0.0.2',
'schema:author': 'moranegg',
}]
# when
results = extract_minimal_metadata_dict(metadata_list)
# then
expected_results = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
"version": '0.0.2',
"description": 'Simple package.json test for indexer',
"name": ['test_1', 'test_0_1', 'test_metadata'],
"schema:author": 'moranegg',
"schema:codeRepository":
'git+https://github.com/moranegg/metadata_test',
}
self.assertEqual(expected_results, results)
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
'd4c647f0fc257591cc9ba1722484229780d1c607',
'02fb2c89e14f7fab46701478c83779c7beb7b069']
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
metadata_indexer = ContentMetadataTestIndexer(
tool=self.content_tool, config={})
# when
metadata_indexer.run(sha1s, policy_update='ignore-dups')
results = metadata_indexer.idx_storage.added_data
expected_results = [('content_metadata', False, [{
'indexer_configuration_id': 30,
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
'description': 'Simple package.json test for indexer',
'name': 'test_metadata',
'version': '0.0.1'
},
'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
}, {
'indexer_configuration_id': 30,
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codemeta:issueTracker':
'https://github.com/npm/npm/issues',
'schema:author': {
'type': 'Person',
'name': 'Isaac Z. Schlueter',
'email': 'i@izs.me',
'schema:url': 'http://blog.izs.me',
},
'schema:codeRepository':
'git+https://github.com/npm/npm',
'description': 'a package manager for JavaScript',
'schema:license': 'Artistic-2.0',
'version': '5.0.3',
'name': 'npm',
'keywords': [
'install',
'modules',
'package manager',
'package.json'
],
'schema:url': 'https://docs.npmjs.com/'
},
'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
}, {
'indexer_configuration_id': 30,
'translated_metadata': None,
'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
}])]
# The assertion below returns False sometimes because of nested lists
self.assertEqual(expected_results, results)
def test_detect_metadata_package_json(self):
# given
df = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'cde'
}]
# when
results = detect_metadata(df)
expected_results = {
'NpmMapping': [
b'cde'
]
}
# then
self.assertEqual(expected_results, results)
def test_compute_metadata_valid_codemeta(self):
raw_content = (
b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""") # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description":
"CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name":
"CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl":
"https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation"
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD"
}
result = MAPPINGS["CodemetaMapping"].translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
raw_content = b"""
<project>
<name>Maven Default Project</name>
<modelVersion>4.0.0</modelVersion>
<groupId>com.mycompany.app</groupId>
<artifactId>my-app</artifactId>
<version>1.2.3</version>
<repositories>
<repository>
<id>central</id>
<name>Maven Repository Switchboard</name>
<layout>default</layout>
<url>http://repo1.maven.org/maven2</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
</project>"""
result = MAPPINGS["MavenMapping"].translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'Maven Default Project',
'schema:identifier': 'com.mycompany.app',
'version': '1.2.3',
'schema:codeRepository':
'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
})
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataTestIndexer()
sha1_gits = [
- b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
]
metadata_indexer.run(sha1_gits, 'update-dups')
results = metadata_indexer.idx_storage.added_data
expected_results = [('revision_metadata', True, [{
- 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'schema:author': 'Andrew Nesbitt',
'license': 'AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
},
'indexer_configuration_id': 7
}])]
# then
self.assertEqual(expected_results, results)
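The mappings exercised by these tests are usable on their own; a minimal sketch mirroring test_compute_metadata_npm, with a trimmed, hypothetical package.json (assuming the mapping handles it like the fuller examples above):

from swh.indexer.metadata_dictionary import MAPPINGS

raw_content = b'{"name": "test_metadata", "version": "0.0.2"}'
result = MAPPINGS['NpmMapping'].translate(raw_content)
# translate() returns a CodeMeta dict, or None when nothing could be parsed
assert result['name'] == 'test_metadata'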
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 510ae1a..2b651cc 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,122 +1,125 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import time
import logging
import unittest
from celery import task
from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
from swh.indexer.tests.test_utils import MockIndexerStorage
from swh.indexer.tests.test_origin_head import OriginHeadTestIndexer
from swh.indexer.tests.test_metadata import RevisionMetadataTestIndexer
from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture
+from swh.model.hashutil import hash_to_bytes
+
class OriginMetadataTestIndexer(OriginMetadataIndexer):
def prepare(self):
self.config = {
'storage': {
'cls': 'remote',
'args': {
'url': 'http://localhost:9999',
}
},
'tools': [],
}
self.storage = MockStorage()
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.results = []
@task
def revision_metadata_test_task(*args, **kwargs):
indexer = RevisionMetadataTestIndexer()
indexer.run(*args, **kwargs)
return indexer.results
@task
def origin_intrinsic_metadata_test_task(*args, **kwargs):
indexer = OriginMetadataTestIndexer()
indexer.run(*args, **kwargs)
return indexer.results
class OriginHeadTestIndexer(OriginHeadTestIndexer):
def prepare(self):
super().prepare()
self.config['tasks'] = {
'revision_metadata': 'revision_metadata_test_task',
'origin_intrinsic_metadata': 'origin_intrinsic_metadata_test_task',
}
class TestOriginMetadata(SchedulerTestFixture, unittest.TestCase):
def setUp(self):
super().setUp()
self.maxDiff = None
MockIndexerStorage.added_data = []
self.add_scheduler_task_type(
'revision_metadata_test_task',
'swh.indexer.tests.test_origin_metadata.'
'revision_metadata_test_task')
self.add_scheduler_task_type(
'origin_intrinsic_metadata_test_task',
'swh.indexer.tests.test_origin_metadata.'
'origin_intrinsic_metadata_test_task')
RevisionMetadataTestIndexer.scheduler = self.scheduler
def tearDown(self):
del RevisionMetadataTestIndexer.scheduler
super().tearDown()
def test_pipeline(self):
indexer = OriginHeadTestIndexer()
indexer.scheduler = self.scheduler
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
self.run_ready_tasks() # Run the first task
time.sleep(0.1) # Give it time to complete and schedule the 2nd one
self.run_ready_tasks() # Run the second task
metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'schema:author': 'Andrew Nesbitt',
'license': 'AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
}
rev_metadata = {
- 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'translated_metadata': metadata,
'indexer_configuration_id': 7,
}
origin_metadata = {
'origin_id': 54974445,
- 'from_revision': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'from_revision': hash_to_bytes(
+ '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'metadata': metadata,
'indexer_configuration_id': 7,
}
expected_results = [
('origin_intrinsic_metadata', True, [origin_metadata]),
('revision_metadata', True, [rev_metadata])]
results = list(indexer.idx_storage.added_data)
self.assertCountEqual(expected_results, results)
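The hand-off tested here is the one introduced above: OriginHeadIndexer.next_step ships {str(origin_id): hex_rev_id} in the task payload, and OriginMetadataIndexer.run converts it back to bytes before querying the indexer storage. A minimal sketch of that conversion, reusing the ids from this test:

from swh.model.hashutil import hash_to_bytes

origin_head = {'54974445': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'}
origin_head_map = {int(origin_id): hash_to_bytes(rev_id)
                   for origin_id, rev_id in origin_head.items()}

assert list(origin_head_map) == [54974445]
assert origin_head_map[54974445] == hash_to_bytes(
    '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')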
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
index da1f11d..9ccefc6 100644
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,728 +1,732 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
+from swh.model.hashutil import hash_to_bytes
ORIGINS = [
{
'id': 52189575,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/SoftwareHeritage/swh-storage'},
{
'id': 4423668,
'lister': None,
'project': None,
'type': 'ftp',
'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
{
'id': 77775770,
'lister': None,
'project': None,
'type': 'deposit',
'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
{
'id': 85072327,
'lister': None,
'project': None,
'type': 'pypi',
'url': 'https://pypi.org/project/limnoria/'},
{
'id': 49908349,
'lister': None,
'project': None,
'type': 'svn',
'url': 'http://0-512-md.googlecode.com/svn/'},
{
'id': 54974445,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'},
]
SNAPSHOTS = {
52189575: {
'branches': {
b'refs/heads/add-revision-origin-cache': {
'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
b's\xe7/\xe9l\x1e',
'target_type': 'revision'},
b'HEAD': {
'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
b'\xac\xefrm',
'target_type': 'revision'},
b'refs/tags/v0.0.103': {
'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
b'\x0f\xdd',
'target_type': 'release'},
}},
4423668: {
'branches': {
b'3DLDF-1.1.4.tar.gz': {
'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
b'"G\x99\x11',
'target_type': 'revision'},
b'3DLDF-2.0.2.tar.gz': {
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
'target_type': 'revision'},
b'3DLDF-2.0.3-examples.tar.gz': {
'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
b'\xfe\xadZ\x80\x80\xc1\x83\xff',
'target_type': 'revision'},
b'3DLDF-2.0.3.tar.gz': {
'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
b'\xcc\x1a\xb4`\x8c\x8by',
'target_type': 'revision'},
b'3DLDF-2.0.tar.gz': {
'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
b'\xd3\xd1m',
'target_type': 'revision'}
}},
77775770: {
'branches': {
b'master': {
'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
b'\xa6\xe9\x99\xb1\x9e]q\xeb',
'target_type': 'revision'}
},
'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
b"\x1d\r "},
85072327: {
'branches': {
b'HEAD': {
'target': b'releases/2018.09.09',
'target_type': 'alias'},
b'releases/2018.09.01': {
'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
b'\xbb\xdfF\xfdw\xcf',
'target_type': 'revision'},
b'releases/2018.09.09': {
'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
b'A\x10\x9d\xc5\xfa2\xf8t',
'target_type': 'revision'}},
'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
b'\x12\x9e\xd6\xb3'},
49908349: {
'branches': {
b'master': {
'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
b'\xc9\xad#.\x1bw=\x18',
'target_type': 'revision'}},
'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
b'\x05\xea\xb8\x1f\xc4H\xf4s'},
54974445: {
'branches': {
b'HEAD': {
- 'target': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'target': hash_to_bytes(
+ '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'target_type': 'revision'}}}
}
SHA1_TO_LICENSES = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
'02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
'103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
'688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
}
SHA1_TO_CTAGS = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{
'name': 'foo',
'kind': 'str',
'line': 10,
'lang': 'bar',
}],
'd4c647f0fc257591cc9ba1722484229780d1c607': [{
'name': 'let',
'kind': 'int',
'line': 100,
'lang': 'haskell',
}],
'688a5ef812c53907562fe379d4b3851e69c7cb15': [{
'name': 'symbol',
'kind': 'float',
'line': 99,
'lang': 'python',
}],
}
class MockObjStorage:
"""Mock an swh-objstorage objstorage with predefined contents.
"""
data = {}
def __init__(self):
self.data = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
'688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
'8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
'02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
import unittest
import logging
from swh.indexer.mimetype import ContentMimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
'103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
""",
'26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
""",
'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
}
def __iter__(self):
yield from self.data.keys()
def __contains__(self, sha1):
return self.data.get(sha1) is not None
def get(self, sha1):
raw_content = self.data.get(sha1)
if raw_content is None:
raise ObjNotFoundError(sha1)
return raw_content
class MockIndexerStorage():
"""Mock an swh-indexer storage.
"""
added_data = []
revision_metadata = {}
def indexer_configuration_add(self, tools):
results = []
for tool in tools:
results.append(self._indexer_configuration_add_one(tool))
return results
def _indexer_configuration_add_one(self, tool):
if tool['tool_name'] == 'swh-metadata-translator':
return {
'id': 30,
'tool_name': 'swh-metadata-translator',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}
elif tool['tool_name'] == 'swh-metadata-detector':
return {
'id': 7,
'tool_name': 'swh-metadata-detector',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}
elif tool['tool_name'] == 'origin-metadata':
return {
'id': 8,
'tool_name': 'origin-metadata',
'tool_version': '0.0.1',
'tool_configuration': {},
}
else:
assert False, 'Unknown tool {tool_name}'.format(**tool)
def content_metadata_missing(self, sha1s):
yield from []
def content_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('content_metadata', conflict_update, metadata))
def revision_metadata_add(self, metadata, conflict_update=None):
assert conflict_update
self.added_data.append(
('revision_metadata', conflict_update, metadata))
for item in metadata:
+ assert isinstance(item['id'], bytes)
self.revision_metadata.setdefault(item['id'], []).append(item)
def revision_metadata_get(self, ids):
for id_ in ids:
+ assert isinstance(id_, bytes)
yield from self.revision_metadata.get(id_)
def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('origin_intrinsic_metadata', conflict_update, metadata))
def content_metadata_get(self, sha1s):
return [{
'tool': {
'configuration': {
'type': 'local',
'context': 'NpmMapping'
},
'version': '0.0.1',
'id': 6,
'name': 'swh-metadata-translator'
},
'id': b'cde',
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'version': '1.0.0',
'name': 'yarn-parser',
'schema:author': 'Andrew Nesbitt',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'processorRequirements': {'node': '7.5'},
'license': 'AGPL-3.0',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'description':
'Tiny web service for parsing yarn.lock files',
}
}]
class MockStorage():
"""Mock a real swh-storage storage to simplify reading indexers'
outputs.
"""
def origin_get(self, id_):
for origin in ORIGINS:
for (k, v) in id_.items():
if origin[k] != v:
break
else:
# This block is run iff we didn't break, ie. if all supplied
# parts of the id are set to the expected value.
return origin
assert False, id_
def snapshot_get_latest(self, origin_id):
if origin_id in SNAPSHOTS:
return SNAPSHOTS[origin_id]
else:
assert False, origin_id
def revision_get(self, revisions):
return [{
- 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'committer': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'synthetic': False,
'date': {
'negative_utc': False,
'timestamp': {
'seconds': 1487596456,
'microseconds': 0
},
'offset': 0
},
'directory': b'10'
}]
def directory_ls(self, directory, recursive=False, cur=None):
# with directory: b'\x9d',
return [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'cde'
},
{
'dir_id': b'10',
'target': b'11',
'type': 'dir',
'length': None,
'name': b'.github',
'sha1': None,
'perms': 16384,
'sha1_git': None,
'status': None,
'sha256': None
}]
class BasicMockStorage():
"""In memory implementation to fake the content_get_range api.
FIXME: To remove when the actual in-memory lands.
"""
contents = []
def __init__(self, contents):
self.contents = contents
def content_get_range(self, start, end, limit=1000):
# to make input test data consistent with the actual runtime; the
# proper way of doing things would be to rewrite all
# tests (that's another task entirely, so not right now)
if isinstance(start, bytes):
start = hashutil.hash_to_hex(start)
if isinstance(end, bytes):
end = hashutil.hash_to_hex(end)
results = []
_next_id = None
counter = 0
for c in self.contents:
_id = c['sha1']
if start <= _id and _id <= end:
results.append(c)
if counter >= limit:
break
counter += 1
return {
'contents': results,
'next': _next_id
}
class BasicMockIndexerStorage():
"""Mock Indexer storage to simplify reading indexers' outputs.
"""
state = []
def _internal_add(self, data, conflict_update=None):
"""All content indexer have the same structure. So reuse `data` as the
same data. It's either mimetype, language,
fossology_license, etc...
"""
self.state = data
self.conflict_update = conflict_update
def content_mimetype_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def content_fossology_license_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def content_language_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def content_ctags_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def _internal_get_range(self, start, end,
indexer_configuration_id, limit=1000):
"""Same logic as _internal_add, we retrieve indexed data given an
identifier. So the code here does not change even though
the underlying data does.
"""
# to make input test data consistent with the actual runtime; the
# proper way of doing things would be to rewrite all
# tests (that's another task entirely, so not right now)
if isinstance(start, bytes):
start = hashutil.hash_to_hex(start)
if isinstance(end, bytes):
end = hashutil.hash_to_hex(end)
results = []
_next = None
counter = 0
for m in self.state:
_id = m['id']
_tool_id = m['indexer_configuration_id']
if (start <= _id and _id <= end and
_tool_id == indexer_configuration_id):
results.append(_id)
if counter >= limit:
break
counter += 1
return {
'ids': results,
'next': _next
}
def content_mimetype_get_range(
self, start, end, indexer_configuration_id, limit=1000):
return self._internal_get_range(
start, end, indexer_configuration_id, limit=limit)
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id, limit=1000):
return self._internal_get_range(
start, end, indexer_configuration_id, limit=limit)
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
class CommonIndexerNoTool:
"""Mixin to wronly initialize content indexer"""
def prepare(self):
super().prepare()
self.tools = None
class CommonIndexerWithErrorsTest:
"""Test indexer configuration checks.
"""
Indexer = None
RangeIndexer = None
def test_wrong_unknown_configuration_tool(self):
"""Indexer with unknown configuration tool fails check"""
with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
print('indexer: %s' % self.Indexer)
self.Indexer()
def test_wrong_unknown_configuration_tool_range(self):
"""Range Indexer with unknown configuration tool fails check"""
if self.RangeIndexer is not None:
with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
self.RangeIndexer()
class CommonContentIndexerTest:
def assert_results_ok(self, actual_results, expected_results=None):
if expected_results is None:
expected_results = self.expected_results
for indexed_data in actual_results:
_id = indexed_data['id']
self.assertEqual(indexed_data, expected_results[_id])
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test_index(self):
"""Known sha1 have their data indexed
"""
sha1s = [self.id0, self.id1, self.id2]
# when
self.indexer.run(sha1s, policy_update='update-dups')
actual_results = self.indexer.idx_storage.state
self.assertTrue(self.indexer.idx_storage.conflict_update)
self.assert_results_ok(actual_results)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
self.assertFalse(self.indexer.idx_storage.conflict_update)
self.assert_results_ok(actual_results)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
sha1s = [self.id1,
'799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
'800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
# when
self.indexer.run(sha1s, policy_update='update-dups')
actual_results = self.indexer.idx_storage.state
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
self.assert_results_ok(actual_results, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
for indexed_data in actual_results:
_id = indexed_data['id']
self.assertEqual(indexed_data, expected_results[_id])
self.assertTrue(start <= _id and _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(data_indexed))
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(
start, end, actual_results, expected_results)
def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
# given
actual_results = self.indexer.run(start, end)
# then
self.assertTrue(actual_results)
def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Input are in bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run( # checks the bytes input this time
start, end, skip_existing=False) # no data so same result
# then
self.assertTrue(actual_results)
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
start, end = ['0000000000000000000000000000000000000000',
'0000000000000000000000000000000000000001']
# given
actual_results = self.indexer.run(
start, end, incremental=False)
# then
self.assertFalse(actual_results)
class NoDiskIndexer:
"""Mixin to override the DiskIndexer behavior avoiding side-effects in
tests.
"""
def write_to_temp(self, filename, data): # noop
return filename
def cleanup(self, content_path): # noop
return None
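MockObjStorage mimics just enough of the objstorage API for the content indexers; a minimal usage sketch based on the fixture data above:

from swh.objstorage.exc import ObjNotFoundError
from swh.indexer.tests.test_utils import MockObjStorage

objstorage = MockObjStorage()
# known ids return their raw content
assert objstorage.get('01c9379dfc33803963d07c1ccc748d3fe4c96bb5') == b'this is some text'
try:
    objstorage.get('0000000000000000000000000000000000000000')
except ObjNotFoundError:
    pass  # unknown ids raise, like the real objstorage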
