
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
index b6f4bb7..0679692 100644
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -1,167 +1,156 @@
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
import json
from swh.model import hashutil
from .language import compute_language
from .indexer import ContentIndexer, DiskIndexer
# Options used to compute tags
__FLAGS = [
'--fields=+lnz', # +l: language
# +n: line number of tag definition
# +z: include the symbol's kind (function, variable, ...)
'--sort=no', # do not sort the output (keep input order)
'--links=no', # do not follow symlinks
'--output-format=json', # outputs in json
]
def run_ctags(path, lang=None, ctags_command='ctags'):
"""Run ctags on file path with optional language.
Args:
path: path to the file
lang: language for that path (optional)
Returns:
ctags' output
"""
optional = []
if lang:
optional = ['--language-force=%s' % lang]
cmd = [ctags_command] + __FLAGS + optional + [path]
output = subprocess.check_output(cmd, universal_newlines=True)
for symbol in output.split('\n'):
if not symbol:
continue
js_symbol = json.loads(symbol)
yield {
'name': js_symbol['name'],
'kind': js_symbol['kind'],
'line': js_symbol['line'],
'lang': js_symbol['language'],
}
class CtagsIndexer(ContentIndexer, DiskIndexer):
CONFIG_BASE_FILENAME = 'indexer/ctags'
ADDITIONAL_CONFIG = {
'workdir': ('str', '/tmp/swh/indexer.ctags'),
'tools': ('dict', {
'name': 'universal-ctags',
'version': '~git7859817b',
'configuration': {
'command_line': '''ctags --fields=+lnz --sort=no --links=no '''
'''--output-format=json <filepath>'''
},
}),
'languages': ('dict', {
'ada': 'Ada',
'adl': None,
'agda': None,
# ...
})
}
def prepare(self):
super().prepare()
self.working_directory = self.config['workdir']
self.language_map = self.config['languages']
self.tool = self.tools[0]
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_ctags_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
def compute_ctags(self, path, lang):
"""Compute ctags on file at path with language lang.
"""
return run_ctags(path, lang=lang)
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
A dict, representing a content_ctags, with keys:
- id (bytes): content's identifier (sha1)
- ctags ([dict]): ctags list of symbols
"""
lang = compute_language(data, log=self.log)['lang']
if not lang:
return None
ctags_lang = self.language_map.get(lang)
if not ctags_lang:
return None
ctags = {
'id': id,
}
filename = hashutil.hash_to_hex(id)
content_path = self.write_to_temp(
filename=filename,
data=data)
result = run_ctags(content_path, lang=ctags_lang)
ctags.update({
'ctags': list(result),
'indexer_configuration_id': self.tool['id'],
})
self.cleanup(content_path)
return ctags
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_ctags, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- ctags ([dict]): ctags list of symbols
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_ctags_add(
results, conflict_update=(policy_update == 'update-dups'))
-
-
-@click.command()
-@click.option('--path', help="Path to execute index on")
-def main(path):
- r = list(run_ctags(path))
- print(r)
-
-
-if __name__ == '__main__':
- main()
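
With the click entry point removed above, run_ctags() remains usable directly. A minimal sketch of invoking it, assuming universal-ctags with JSON output support is installed and on PATH; the file path below is a placeholder:

# Minimal usage sketch for run_ctags(); '/tmp/example.py' is a hypothetical path.
from swh.indexer.ctags import run_ctags

for symbol in run_ctags('/tmp/example.py', lang='Python'):
    # Each yielded dict carries the tag name, kind, line number and language.
    print(symbol['name'], symbol['kind'], symbol['line'], symbol['lang'])
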
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
index bac3810..37522b9 100644
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,184 +1,172 @@
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
class MixinFossologyLicenseIndexer:
"""Mixin fossology license indexer.
See :class:`ContentFossologyLicenseIndexer` and
:class:`FossologyLicenseRangeIndexer`
"""
ADDITIONAL_CONFIG = {
'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
'tools': ('dict', {
'name': 'nomos',
'version': '3.1.0rc2-31-ga2cbb8c',
'configuration': {
'command_line': 'nomossa <filepath>',
},
}),
'write_batch_size': ('int', 1000),
}
CONFIG_BASE_FILENAME = 'indexer/fossology_license'
def prepare(self):
super().prepare()
self.working_directory = self.config['workdir']
self.tool = self.tools[0]
def compute_license(self, path, log=None):
"""Determine license from file at path.
Args:
path: path of the file whose license to detect
Returns:
A dict with the following keys:
- licenses ([str]): licenses detected for the path
- path (bytes): content filepath
"""
try:
properties = subprocess.check_output(['nomossa', path],
universal_newlines=True)
if properties:
res = properties.rstrip().split(' contains license(s) ')
licenses = res[1].split(',')
return {
'licenses': licenses,
'path': path,
}
except subprocess.CalledProcessError:
if log:
from os import path as __path
log.exception('Problem during license detection for sha1 %s' %
__path.basename(path))
return {
'licenses': [],
'path': path,
}
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
A dict, representing a content_license, with keys:
- id (bytes): content's identifier (sha1)
- licenses ([str]): detected licenses
- path (bytes): path
"""
if isinstance(id, str):
id = hashutil.hash_to_hex(id)
content_path = self.write_to_temp(
filename=id,
data=data)
try:
properties = self.compute_license(path=content_path, log=self.log)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
})
finally:
self.cleanup(content_path)
return properties
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_license, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- licenses ([str]): detected licenses
- path (bytes): path
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_fossology_license_add(
results, conflict_update=(policy_update == 'update-dups'))
class ContentFossologyLicenseIndexer(
MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
"""Indexer in charge of:
- filtering out content already indexed
- reading content from objstorage per the content's id (sha1)
- computing {license} from that content
- store result in storage
"""
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_fossology_license_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
class FossologyLicenseRangeIndexer(
MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
"""FossologyLicense Range Indexer working on range of content identifiers.
It:
- filters out the non textual content
- (optionally) filters out content already indexed (cf. :meth:`indexed_contents_in_range`)
- reads content from objstorage per the content's id (sha1)
- computes {license} from that content
- stores result in storage
"""
def indexed_contents_in_range(self, start, end):
"""Retrieve indexed content id within range [start, end].
Args
**start** (bytes): Starting bound from range identifier
**end** (bytes): End range identifier
Yields:
Content identifier (bytes) present in the range [start, end]
"""
while start:
result = self.idx_storage.content_fossology_license_get_range(
start, end, self.tool['id'])
contents = result['ids']
for _id in contents:
yield _id
start = result['next']
-
-
-@click.command(help='Compute license for path using tool')
-@click.option('--tool', default='nomossa', help="Path to tool")
-@click.option('--path', required=1, help="Path to execute index on")
-def main(tool, path):
- indexer = ContentFossologyLicenseIndexer()
- print(indexer.compute_license(tool, path))
-
-
-if __name__ == '__main__':
- main()
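
For reference, a small sketch of the parsing that compute_license() applies to nomos output, assuming the usual "File X contains license(s) A,B" format; the sample line below is made up:

# Hypothetical nomos output line; only the split logic mirrors compute_license().
properties = 'File example.c contains license(s) GPL-2.0,MIT'
res = properties.rstrip().split(' contains license(s) ')
licenses = res[1].split(',')
print(licenses)  # ['GPL-2.0', 'MIT']
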
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
index 4342faa..7dd43af 100644
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -1,169 +1,156 @@
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import magic
from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer
def compute_mimetype_encoding(raw_content):
"""Determine mimetype and encoding from the raw content.
Args:
raw_content (bytes): content's raw data
Returns:
A dict with mimetype and encoding key and corresponding values
(as bytes).
"""
r = magic.detect_from_content(raw_content)
return {
'mimetype': r.mime_type.encode('utf-8'),
'encoding': r.encoding.encode('utf-8'),
}
class MixinMimetypeIndexer:
"""Mixin mimetype indexer.
See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer`
"""
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'file',
'version': '1:5.30-1+deb9u1',
'configuration': {
"type": "library",
"debian-package": "python3-magic"
},
}),
'write_batch_size': ('int', 1000),
}
CONFIG_BASE_FILENAME = 'indexer/mimetype'
def prepare(self):
super().prepare()
self.tool = self.tools[0]
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
A dict, representing a content_mimetype, with keys:
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
"""
try:
properties = compute_mimetype_encoding(data)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
})
except TypeError:
self.log.error('Error detecting mimetype for id %s' % (
hashutil.hash_to_hex(id), ))
return None
return properties
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_mimetype, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_mimetype_add(
results, conflict_update=(policy_update == 'update-dups'))
class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer):
"""Mimetype Indexer working on list of content identifiers.
It:
- (optionally) filters out content already indexed (cf. :meth:`filter`)
- reads content from objstorage per the content's id (sha1)
- computes {mimetype, encoding} from that content
- stores result in storage
FIXME:
- 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer
- 2. Do we keep it afterwards? ~> i think this can be used with the journal
"""
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_mimetype_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer):
"""Mimetype Range Indexer working on range of content identifiers.
It:
- (optionally) filters out content already indexed (cf. :meth:`indexed_contents_in_range`)
- reads content from objstorage per the content's id (sha1)
- computes {mimetype, encoding} from that content
- stores result in storage
"""
def indexed_contents_in_range(self, start, end):
"""Retrieve indexed content id within range [start, end].
Args
**start** (bytes): Starting bound from range identifier
**end** (bytes): End range identifier
Yields:
Content identifier (bytes) present in the range [start, end]
"""
while start:
result = self.idx_storage.content_mimetype_get_range(
start, end, self.tool['id'])
contents = result['ids']
for _id in contents:
yield _id
start = result['next']
-
-
-@click.command()
-@click.option('--path', help="Path to execute index on")
-def main(path):
- with open(path, 'rb') as f:
- raw_content = f.read()
-
- print(compute_mimetype_encoding(raw_content))
-
-
-if __name__ == '__main__':
- main()
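
A minimal sketch of what the removed mimetype CLI did, calling compute_mimetype_encoding() on a file read as bytes (assumes the libmagic bindings providing magic.detect_from_content are installed; the path and printed result are illustrative only):

from swh.indexer.mimetype import compute_mimetype_encoding

# '/tmp/example.txt' is a placeholder path; the function expects raw bytes.
with open('/tmp/example.txt', 'rb') as f:
    raw_content = f.read()

print(compute_mimetype_encoding(raw_content))
# e.g. {'mimetype': b'text/plain', 'encoding': b'us-ascii'}
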
