Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py
index 90e662a..5b71c6d 100644
--- a/swh/indexer/__init__.py
+++ b/swh/indexer/__init__.py
@@ -1,25 +1,26 @@
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from .file_properties import ContentMimetypeIndexer
+from .mimetype import ContentMimetypeIndexer
from .language import ContentLanguageIndexer
# Indexer classes keyed by the short indexer name ('mimetype', 'language').
INDEXER_CLASSES = {
'mimetype': ContentMimetypeIndexer,
'language': ContentLanguageIndexer,
}
# Dotted path of the task class associated with each task name; the
# referenced classes are the ones declared in swh/indexer/tasks.py.
TASK_NAMES = {
'orchestrator': 'swh.indexer.tasks.SWHOrchestratorTask',
'mimetype': 'swh.indexer.tasks.SWHContentMimetypeTask',
'language': 'swh.indexer.tasks.SWHContentLanguageTask',
}
__all__ = [
- 'INDEXER_CLASSES', 'TASK_NAMES'
+ 'INDEXER_CLASSES', 'TASK_NAMES', 'ContentMimetypeIndexer',
+ 'ContentLanguageIndexer'
]
diff --git a/swh/indexer/file_properties.py b/swh/indexer/mimetype.py
similarity index 96%
rename from swh/indexer/file_properties.py
rename to swh/indexer/mimetype.py
index a570cbf..9a28ebd 100644
--- a/swh/indexer/file_properties.py
+++ b/swh/indexer/mimetype.py
@@ -1,107 +1,109 @@
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
from swh.core import hashutil
from .indexer import BaseIndexer, DiskIndexer
def compute_mimetype_encoding(path):
    """Determine mimetype and encoding from file at path.

    Runs ``file --mime-type --mime-encoding <path>`` and parses its
    ``<path>: <mimetype>; charset=<encoding>`` output.

    Args:
        path: filepath to determine the mime type

    Returns:
        A dict with mimetype and encoding key and corresponding values
        (both as bytes), or None when the command printed nothing.

    Raises:
        subprocess.CalledProcessError: when `file` exits with a non-zero
            status.
        IndexError: when the output does not have the expected shape.

    """
    cmd = ['file', '--mime-type', '--mime-encoding', path]
    output = subprocess.check_output(cmd)
    if not output:
        return None

    # Split on the LAST ': ' only: the path is echoed back in front of the
    # description and may itself contain ': ', which must not be mistaken
    # for the separator.
    description = output.rsplit(b': ', 1)[1].strip()
    fields = description.split(b'; ')
    mimetype = fields[0]
    # Second field looks like b'charset=<encoding>'.
    encoding = fields[1].split(b'=')[1]
    return {
        'mimetype': mimetype,
        'encoding': encoding,
    }
class ContentMimetypeIndexer(BaseIndexer, DiskIndexer):
"""Indexer in charge of:
- filtering out content already indexed
- reading content from objstorage per the content's id (sha1)
- computing {mimetype, encoding} from that content
- store result in storage
"""
ADDITIONAL_CONFIG = {
- 'workdir': ('str', '/tmp/swh/worker.file.properties'),
+ 'workdir': ('str', '/tmp/swh/indexer.mimetype'),
}
+ CONFIG_BASE_FILENAME = 'indexer/mimetype'
+
def __init__(self):
    """Run the parent initialisation, then remember the 'workdir'
    configuration entry as the working directory.

    """
    super().__init__()
    workdir = self.config['workdir']
    self.working_directory = workdir
def filter_contents(self, sha1s):
    """Yield, one by one, the sha1s that the storage reports as missing
    from the content mimetype index.

    Args:
        sha1s: content identifiers to check

    Yields:
        Each identifier returned by storage.content_mimetype_missing.

    """
    missing = self.storage.content_mimetype_missing(sha1s)
    for sha1 in missing:
        yield sha1
def index_content(self, sha1, content):
    """Compute the mimetype and encoding of one content.

    The raw content is handed to self.write_to_temp under a filename
    derived from the hexadecimal form of its sha1, and the returned path
    is passed to compute_mimetype_encoding. The result is returned to the
    caller; this method does not persist anything itself.

    Args:
        sha1 (bytes): content's identifier
        content (bytes): raw content in bytes

    Returns:
        A dict, representing a content_mimetype, with keys:
        - id (bytes): content's identifier (sha1)
        - mimetype (bytes): mimetype in bytes
        - encoding (bytes): encoding in bytes

    """
    filename = hashutil.hash_to_hex(sha1)
    content_path = self.write_to_temp(
        filename=filename,
        data=content)
    try:
        properties = compute_mimetype_encoding(content_path)
        properties.update({
            'id': sha1,
        })
    finally:
        # Always hand the temporary path back to cleanup, even when the
        # mimetype computation fails, so failed runs do not leave files
        # behind (presumably cleanup removes them -- defined elsewhere).
        self.cleanup(content_path)
    return properties
def persist_index_computations(self, results):
    """Hand the computed mimetype entries over to the storage.

    Args:
        results ([dict]): list of content_mimetype, dict with the
            following keys:
            - id (bytes): content's identifier (sha1)
            - mimetype (bytes): mimetype in bytes
            - encoding (bytes): encoding in bytes

    """
    storage = self.storage
    storage.content_mimetype_add(results)
@click.command()
# --path is mandatory: without it, None would be placed in the `file`
# command line and subprocess would fail with an unhelpful TypeError.
@click.option('--path', required=True, help="Path to execute index on")
def main(path):
    """Print the mimetype and encoding computed for the file at PATH."""
    print(compute_mimetype_encoding(path))


if __name__ == '__main__':
    main()
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
index 957a857..914e886 100644
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -1,41 +1,40 @@
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.task import Task
from .orchestrator import OrchestratorIndexer
-from .file_properties import ContentMimetypeIndexer
-from .language import ContentLanguageIndexer
+from . import ContentMimetypeIndexer, ContentLanguageIndexer
class SWHOrchestratorTask(Task):
    """Entry-point task: reads messages and broadcasts them back to the
    other tasks.

    """
    task_queue = 'swh_indexer_orchestrator'

    def run(self, *args, **kwargs):
        """Forward every argument to a fresh OrchestratorIndexer."""
        orchestrator = OrchestratorIndexer()
        orchestrator.run(*args, **kwargs)
class SWHContentMimetypeTask(Task):
    """Task computing the mimetype and encoding of the content behind a
    sha1.

    """
    task_queue = 'swh_indexer_content_mimetype'

    def run(self, *args, **kwargs):
        """Forward every argument to a fresh ContentMimetypeIndexer."""
        indexer = ContentMimetypeIndexer()
        indexer.run(*args, **kwargs)
class SWHContentLanguageTask(Task):
    """Task computing the language of the content behind a sha1.

    """
    task_queue = 'swh_indexer_content_language'

    def run(self, *args, **kwargs):
        """Forward every argument to a fresh ContentLanguageIndexer."""
        indexer = ContentLanguageIndexer()
        indexer.run(*args, **kwargs)

File Metadata

Mime Type
text/x-diff
Expires
Jul 4 2025, 9:29 AM (5 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3281502

Event Timeline