Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9339185
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
View Options
diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py
index 90e662a..5b71c6d 100644
--- a/swh/indexer/__init__.py
+++ b/swh/indexer/__init__.py
@@ -1,25 +1,26 @@
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from .file_properties import ContentMimetypeIndexer
+from .mimetype import ContentMimetypeIndexer
from .language import ContentLanguageIndexer
# Indexer classes keyed by their short indexer name.
INDEXER_CLASSES = {
'mimetype': ContentMimetypeIndexer,
'language': ContentLanguageIndexer,
}
# Dotted import paths (as strings) of the task classes, keyed by short name.
# The 'mimetype' and 'language' keys mirror those of INDEXER_CLASSES;
# 'orchestrator' has no entry there.
TASK_NAMES = {
'orchestrator': 'swh.indexer.tasks.SWHOrchestratorTask',
'mimetype': 'swh.indexer.tasks.SWHContentMimetypeTask',
'language': 'swh.indexer.tasks.SWHContentLanguageTask',
}
__all__ = [
- 'INDEXER_CLASSES', 'TASK_NAMES'
+ 'INDEXER_CLASSES', 'TASK_NAMES', 'ContentMimetypeIndexer',
+ 'ContentLanguageIndexer'
]
diff --git a/swh/indexer/file_properties.py b/swh/indexer/mimetype.py
similarity index 96%
rename from swh/indexer/file_properties.py
rename to swh/indexer/mimetype.py
index a570cbf..9a28ebd 100644
--- a/swh/indexer/file_properties.py
+++ b/swh/indexer/mimetype.py
@@ -1,107 +1,109 @@
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
from swh.core import hashutil
from .indexer import BaseIndexer, DiskIndexer
def compute_mimetype_encoding(path):
    """Determine mimetype and encoding from file at path.

    Runs ``file --mime-type --mime-encoding <path>`` and parses its output,
    which is expected to look like ``<path>: <mimetype>; charset=<encoding>``.

    Args:
        path: filepath to determine the mime type

    Returns:
        A dict with mimetype and encoding key and corresponding values
        (both bytes), or None when the command printed nothing.

    Raises:
        subprocess.CalledProcessError: when the ``file`` command exits with
            a non-zero status.

    """
    cmd = ['file', '--mime-type', '--mime-encoding', path]
    properties = subprocess.check_output(cmd)
    if not properties:
        return None

    # The path is echoed back before the description and may itself contain
    # ': ', so only the last separator is trusted; the trailing
    # '<mimetype>; charset=<encoding>' part is not expected to contain one.
    description = properties.rsplit(b': ', 1)[1].strip()
    res = description.split(b'; ')
    mimetype = res[0]
    encoding = res[1].split(b'=')[1]
    return {
        'mimetype': mimetype,
        'encoding': encoding,
    }
class ContentMimetypeIndexer(BaseIndexer, DiskIndexer):
"""Indexer in charge of:
- filtering out content already indexed
- reading content from objstorage per the content's id (sha1)
- computing {mimetype, encoding} from that content
- store result in storage
"""
ADDITIONAL_CONFIG = {
- 'workdir': ('str', '/tmp/swh/worker.file.properties'),
+ 'workdir': ('str', '/tmp/swh/indexer.mimetype'),
}
+ CONFIG_BASE_FILENAME = 'indexer/mimetype'
+
def __init__(self):
    """Run the parent initialisation, then record the configured workdir.

    The 'workdir' entry of ``self.config`` is exposed as
    ``self.working_directory``.

    """
    super().__init__()
    workdir = self.config['workdir']
    self.working_directory = workdir
def filter_contents(self, sha1s):
    """Yield only the sha1s the storage reports as missing a mimetype.

    Args:
        sha1s: content identifiers to check

    Yields:
        Whatever ``self.storage.content_mimetype_missing`` yields for them.

    """
    missing = self.storage.content_mimetype_missing(sha1s)
    yield from missing
def index_content(self, sha1, content):
    """Index sha1s' content and store result.

    The content is written to a temporary file (named after the hex form
    of the sha1) so that ``compute_mimetype_encoding`` can inspect it; the
    temporary file is removed afterwards, whether or not the computation
    succeeded.

    Args:
        sha1 (bytes): content's identifier
        content (bytes): raw content in bytes

    Returns:
        A dict, representing a content_mimetype, with keys:
          - id (bytes): content's identifier (sha1)
          - mimetype (bytes): mimetype in bytes
          - encoding (bytes): encoding in bytes

    """
    filename = hashutil.hash_to_hex(sha1)
    content_path = self.write_to_temp(
        filename=filename,
        data=content)

    try:
        properties = compute_mimetype_encoding(content_path)
    finally:
        # Clean up even if the computation raised, so failed runs do not
        # leave temporary files behind in the working directory.
        self.cleanup(content_path)

    properties.update({
        'id': sha1,
    })
    return properties
def persist_index_computations(self, results):
    """Hand the computed results over to the storage.

    Args:
        results ([dict]): list of content_mimetype, each a dict with the
          following keys:
          - id (bytes): content's identifier (sha1)
          - mimetype (bytes): mimetype in bytes
          - encoding (bytes): encoding in bytes

    """
    storage = self.storage
    storage.content_mimetype_add(results)
@click.command()
@click.option('--path', required=True, help="Path to execute index on")
def main(path):
    """Print the mimetype and encoding computed for the file at PATH."""
    # --path is required: without it click would pass None, which would end
    # up in the `file` command's argument list and fail with an opaque error.
    print(compute_mimetype_encoding(path))


if __name__ == '__main__':
    main()
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
index 957a857..914e886 100644
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -1,41 +1,40 @@
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.task import Task
from .orchestrator import OrchestratorIndexer
-from .file_properties import ContentMimetypeIndexer
-from .language import ContentLanguageIndexer
+from . import ContentMimetypeIndexer, ContentLanguageIndexer
class SWHOrchestratorTask(Task):
    """Main task: reads messages and broadcasts them back to the other
    tasks.

    """
    task_queue = 'swh_indexer_orchestrator'

    def run(self, *args, **kwargs):
        """Forward all arguments to a fresh OrchestratorIndexer's run."""
        indexer = OrchestratorIndexer()
        indexer.run(*args, **kwargs)
class SWHContentMimetypeTask(Task):
    """Task computing the mimetype and encoding from the sha1's content.

    """
    task_queue = 'swh_indexer_content_mimetype'

    def run(self, *args, **kwargs):
        """Forward all arguments to a fresh ContentMimetypeIndexer's run."""
        indexer = ContentMimetypeIndexer()
        indexer.run(*args, **kwargs)
class SWHContentLanguageTask(Task):
    """Task computing the language from the sha1's content.

    """
    task_queue = 'swh_indexer_content_language'

    def run(self, *args, **kwargs):
        """Forward all arguments to a fresh ContentLanguageIndexer's run."""
        indexer = ContentLanguageIndexer()
        indexer.run(*args, **kwargs)
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Jul 4 2025, 9:29 AM (5 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3281502
Attached To
rDCIDX Metadata indexer
Event Timeline
Log In to Comment