diff --git a/debian/control b/debian/control index a339313..51abd71 100644 --- a/debian/control +++ b/debian/control @@ -1,24 +1,25 @@ Source: swh-indexer Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.25~), python3-swh.storage (>= 0.0.64~), python3-swh.objstorage (>= 0.0.13~), python3-swh.scheduler (>= 0.0.9~), + python3-chardet (>= 2.3.0~), python3-click, exuberant-ctags, python3-pygments, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/78/ Package: python3-swh.indexer Architecture: all Depends: exuberant-ctags, ${misc:Depends}, ${python3:Depends} Description: Software Heritage Content Indexer diff --git a/requirements.txt b/requirements.txt index 30d3b92..5316e2b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. 
For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner pygments click +chardet swh.core >= 0.0.25 swh.storage >= 0.0.64 swh.objstorage >= 0.0.13 swh.scheduler >= 0.0.9 diff --git a/swh/indexer/language.py b/swh/indexer/language.py index 133d298..2a7855c 100644 --- a/swh/indexer/language.py +++ b/swh/indexer/language.py @@ -1,53 +1,48 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json -import os -import math -import sys - from pygments.lexers import guess_lexer from pygments.util import ClassNotFound +from chardet import detect def cleanup_classname(classname): """Determine the language from the pygments' lexer names. """ return classname.lower().replace(' ', '-') -def run_language(path, encoding=None): - """Determine mime-type from file at path. +def run_language(raw_content): + """Determine the raw content's language. 
Args: - path (str): filepath to determine the mime type - encoding (str): optional file's encoding + raw_content (bytes): raw content whose language is to be determined Returns: Dict with keys: - lang: None if nothing found or the possible language - decoding_failure: True if a decoding failure happened """ try: - with open(path, 'r', encoding=encoding) as f: - try: - raw_content = f.read() - lang = cleanup_classname( - guess_lexer(raw_content).name) - return { - 'lang': lang - } - except ClassNotFound as e: - return { - 'lang': None - } + encoding = detect(raw_content)['encoding'] + content = raw_content.decode(encoding) + lang = cleanup_classname( + guess_lexer(content).name) + + return { + 'lang': lang + } + except ClassNotFound as e: + return { + 'lang': None + } except LookupError as e: # Unknown encoding return { 'decoding_failure': True, 'lang': None }