Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345504
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Subscribers
None
View Options
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
index b6f4bb7..0679692 100644
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -1,167 +1,156 @@
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
import json
from swh.model import hashutil
from .language import compute_language
from .indexer import ContentIndexer, DiskIndexer
# Command-line options passed to every ctags invocation.
__FLAGS = [
    '--fields=+lnz',  # +l: language
                      # +n: line number of tag definition
                      # +z: include the symbol's kind (function, variable, ...)
    '--sort=no',      # do NOT sort output; keep ctags' emission order
    '--links=no',     # do not follow symlinks
    '--output-format=json',  # outputs in json
]
def run_ctags(path, lang=None, ctags_command='ctags'):
    """Execute ctags on a file and yield one dict per reported symbol.

    Args:
        path: path to the file to analyze
        lang: optional language forced onto ctags (--language-force)
        ctags_command: name or path of the ctags executable

    Yields:
        dicts with keys 'name', 'kind', 'line' and 'lang' for every
        symbol ctags reports

    """
    cmd = [ctags_command]
    cmd.extend(__FLAGS)
    if lang:
        cmd.append('--language-force=%s' % lang)
    cmd.append(path)

    raw_output = subprocess.check_output(cmd, universal_newlines=True)

    # ctags emits one json document per line; skip blank lines.
    for line in raw_output.split('\n'):
        if not line:
            continue
        parsed = json.loads(line)
        yield {
            'name': parsed['name'],
            'kind': parsed['kind'],
            'line': parsed['line'],
            'lang': parsed['language'],
        }
class CtagsIndexer(ContentIndexer, DiskIndexer):
    """Content indexer computing ctags symbols from raw contents."""

    CONFIG_BASE_FILENAME = 'indexer/ctags'

    ADDITIONAL_CONFIG = {
        'workdir': ('str', '/tmp/swh/indexer.ctags'),
        'tools': ('dict', {
            'name': 'universal-ctags',
            'version': '~git7859817b',
            'configuration': {
                'command_line': '''ctags --fields=+lnz --sort=no --links=no '''
                                '''--output-format=json <filepath>'''
            },
        }),
        'languages': ('dict', {
            'ada': 'Ada',
            'adl': None,
            'agda': None,
            # ...
        })
    }

    def prepare(self):
        super().prepare()
        self.working_directory = self.config['workdir']
        # maps detected language names to ctags language names (None =
        # language not supported by ctags)
        self.language_map = self.config['languages']
        self.tool = self.tools[0]

    def filter(self, ids):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.idx_storage.content_ctags_missing((
            {
                'id': sha1,
                'indexer_configuration_id': self.tool['id'],
            } for sha1 in ids
        ))

    def compute_ctags(self, path, lang):
        """Compute ctags on file at path with language lang.

        """
        return run_ctags(path, lang=lang)

    def index(self, id, data):
        """Index sha1s' content and store result.

        Args:
            id (bytes): content's identifier
            data (bytes): raw content in bytes

        Returns:
            A dict, representing a content_ctags, with keys:
            - id (bytes): content's identifier (sha1)
            - ctags ([dict]): ctags list of symbols
            - indexer_configuration_id (int): tool used
            or None when no (mappable) language could be detected.

        """
        lang = compute_language(data, log=self.log)['lang']
        if not lang:
            return None

        ctags_lang = self.language_map.get(lang)
        if not ctags_lang:
            return None

        ctags = {
            'id': id,
        }

        filename = hashutil.hash_to_hex(id)
        content_path = self.write_to_temp(
            filename=filename,
            data=data)
        try:
            # run_ctags is a generator: consume it while the temporary
            # file still exists.
            result = run_ctags(content_path, lang=ctags_lang)
            ctags.update({
                'ctags': list(result),
                'indexer_configuration_id': self.tool['id'],
            })
        finally:
            # Fix: always remove the temporary file, even when ctags
            # fails (previously leaked on exception).
            self.cleanup(content_path)

        return ctags

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_ctags, dict with the
                following keys:
                - id (bytes): content's identifier (sha1)
                - ctags ([dict]): ctags list of symbols
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them

        """
        self.idx_storage.content_ctags_add(
            results, conflict_update=(policy_update == 'update-dups'))
-
-
-@click.command()
-@click.option('--path', help="Path to execute index on")
-def main(path):
- r = list(run_ctags(path))
- print(r)
-
-
-if __name__ == '__main__':
- main()
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
index bac3810..37522b9 100644
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,184 +1,172 @@
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
class MixinFossologyLicenseIndexer:
    """Mixin fossology license indexer.

    See :class:`ContentFossologyLicenseIndexer` and
    :class:`FossologyLicenseRangeIndexer`

    """
    ADDITIONAL_CONFIG = {
        'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
        'tools': ('dict', {
            'name': 'nomos',
            'version': '3.1.0rc2-31-ga2cbb8c',
            'configuration': {
                'command_line': 'nomossa <filepath>',
            },
        }),
        'write_batch_size': ('int', 1000),
    }

    CONFIG_BASE_FILENAME = 'indexer/fossology_license'

    def prepare(self):
        super().prepare()
        self.working_directory = self.config['workdir']
        self.tool = self.tools[0]

    def compute_license(self, path, log=None):
        """Determine license from file at path by running nomossa.

        Args:
            path: filepath to determine the license
            log: optional logger; detection failures are reported on it

        Returns:
            A dict with the following keys:
            - licenses ([str]): associated detected licenses to path
            - path (bytes): content filepath

        """
        try:
            properties = subprocess.check_output(['nomossa', path],
                                                 universal_newlines=True)
            if properties:
                res = properties.rstrip().split(' contains license(s) ')
                # Fix: guard against unexpected nomossa output — when the
                # separator is absent, res has a single element and the
                # previous res[1] access raised an uncaught IndexError.
                licenses = res[1].split(',') if len(res) > 1 else []
                return {
                    'licenses': licenses,
                    'path': path,
                }
        except subprocess.CalledProcessError:
            if log:
                # local import: os is not imported at module level
                import os
                log.exception('Problem during license detection for sha1 %s' %
                              os.path.basename(path))

        # Fallback: empty nomossa output or failed run.
        return {
            'licenses': [],
            'path': path,
        }

    def index(self, id, data):
        """Index sha1s' content and store result.

        Args:
            id (bytes): content's identifier
            data (bytes): raw content in bytes

        Returns:
            A dict, representing a content_license, with keys:
            - id: content's identifier (sha1)
            - licenses ([str]): detected licenses
            - path: content filepath
            - indexer_configuration_id (int): tool used

        """
        if isinstance(id, str):
            id = hashutil.hash_to_hex(id)
        content_path = self.write_to_temp(
            filename=id,
            data=data)
        try:
            properties = self.compute_license(path=content_path, log=self.log)
            properties.update({
                'id': id,
                'indexer_configuration_id': self.tool['id'],
            })
        finally:
            # always remove the temporary file, even on failure
            self.cleanup(content_path)
        return properties

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_license dicts (see
                :meth:`index` for the keys)
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them

        """
        self.idx_storage.content_fossology_license_add(
            results, conflict_update=(policy_update == 'update-dups'))
class ContentFossologyLicenseIndexer(
        MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
    """License indexer over explicit lists of content identifiers.

    It:
    - filters out content already indexed
    - reads content from objstorage per the content's id (sha1)
    - computes the license from that content
    - stores the result in storage

    """
    def filter(self, ids):
        """Yield only the sha1s in ``ids`` not yet indexed by this tool."""
        tool_id = self.tool['id']
        queries = (
            {'id': sha1, 'indexer_configuration_id': tool_id}
            for sha1 in ids
        )
        yield from self.idx_storage.content_fossology_license_missing(queries)
class FossologyLicenseRangeIndexer(
        MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
    """FossologyLicense Range Indexer working on range of content identifiers.

    It:
    - filters out the non textual content
    - (optionally) filters out content already indexed (cf :callable:`range`)
    - reads content from objstorage per the content's id (sha1)
    - computes {license} from that content
    - stores result in storage

    """
    def indexed_contents_in_range(self, start, end):
        """Retrieve indexed content id within range [start, end].

        Args
            **start** (bytes): Starting bound from range identifier
            **end** (bytes): End range identifier

        Yields:
            Content identifier (bytes) present in the range [start, end]

        """
        # Paginate through the already-indexed ids for this tool; the
        # storage returns a 'next' cursor, None when exhausted.
        while start:
            result = self.idx_storage.content_fossology_license_get_range(
                start, end, self.tool['id'])
            contents = result['ids']
            for _id in contents:
                yield _id
            start = result['next']
-
-
-@click.command(help='Compute license for path using tool')
-@click.option('--tool', default='nomossa', help="Path to tool")
-@click.option('--path', required=1, help="Path to execute index on")
-def main(tool, path):
- indexer = ContentFossologyLicenseIndexer()
- print(indexer.compute_license(tool, path))
-
-
-if __name__ == '__main__':
- main()
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
index 4342faa..7dd43af 100644
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -1,169 +1,156 @@
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import magic
from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer
def compute_mimetype_encoding(raw_content):
    """Detect the mimetype and text encoding of raw content.

    Args:
        raw_content (bytes): content's raw data

    Returns:
        A dict with 'mimetype' and 'encoding' keys, both values utf-8
        encoded bytes.

    """
    detection = magic.detect_from_content(raw_content)
    mimetype = detection.mime_type
    encoding = detection.encoding
    return {
        'mimetype': mimetype.encode('utf-8'),
        'encoding': encoding.encode('utf-8'),
    }
class MixinMimetypeIndexer:
    """Mixin mimetype indexer.

    See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer`

    """
    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'file',
            'version': '1:5.30-1+deb9u1',
            'configuration': {
                "type": "library",
                "debian-package": "python3-magic"
            },
        }),
        'write_batch_size': ('int', 1000),
    }

    CONFIG_BASE_FILENAME = 'indexer/mimetype'

    def prepare(self):
        super().prepare()
        self.tool = self.tools[0]

    def index(self, id, data):
        """Index sha1s' content and store result.

        Args:
            id (bytes): content's identifier
            data (bytes): raw content in bytes

        Returns:
            A dict, representing a content_mimetype, with keys:
            - id (bytes): content's identifier (sha1)
            - mimetype (bytes): mimetype in bytes
            - encoding (bytes): encoding in bytes
            or None when detection raised a TypeError.

        """
        try:
            properties = compute_mimetype_encoding(data)
        except TypeError:
            self.log.error('Detecting mimetype error for id %s' % (
                hashutil.hash_to_hex(id), ))
            return None
        properties.update({
            'id': id,
            'indexer_configuration_id': self.tool['id'],
        })
        return properties

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_mimetype dicts (see
                :meth:`index` for the keys)
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them

        """
        self.idx_storage.content_mimetype_add(
            results, conflict_update=(policy_update == 'update-dups'))
class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer):
    """Mimetype Indexer working on list of content identifiers.

    It:
    - (optionally) filters out content already indexed (cf. :callable:`filter`)
    - reads content from objstorage per the content's id (sha1)
    - computes {mimetype, encoding} from that content
    - stores result in storage

    FIXME:
    - 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer
    - 2. Do we keep it afterwards? ~> i think this can be used with the journal

    """
    def filter(self, ids):
        """Yield only the sha1s in ``ids`` not yet indexed by this tool."""
        tool_id = self.tool['id']
        queries = (
            {'id': sha1, 'indexer_configuration_id': tool_id}
            for sha1 in ids
        )
        yield from self.idx_storage.content_mimetype_missing(queries)
class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer):
    """Mimetype Range Indexer working on range of content identifiers.

    It:
    - (optionally) filters out content already indexed (cf :callable:`range`)
    - reads content from objstorage per the content's id (sha1)
    - computes {mimetype, encoding} from that content
    - stores result in storage

    """
    def indexed_contents_in_range(self, start, end):
        """Retrieve indexed content id within range [start, end].

        Args
            **start** (bytes): Starting bound from range identifier
            **end** (bytes): End range identifier

        Yields:
            Content identifier (bytes) present in the range [start, end]

        """
        # Walk the storage pages; 'next' is None once the range is done,
        # which terminates the loop.
        current = start
        while current:
            page = self.idx_storage.content_mimetype_get_range(
                current, end, self.tool['id'])
            yield from page['ids']
            current = page['next']
-
-
-@click.command()
-@click.option('--path', help="Path to execute index on")
-def main(path):
- with open(path, 'rb') as f:
- raw_content = f.read()
-
- print(compute_mimetype_encoding(raw_content))
-
-
-if __name__ == '__main__':
- main()
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:23 PM (6 d, 2 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3356174
Attached To
rDCIDX Metadata indexer
Event Timeline
Log In to Comment