diff --git a/PKG-INFO b/PKG-INFO index 43d1939..e0e8f6b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.6.1 +Version: 0.6.2 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 43d1939..e0e8f6b 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.6.1 +Version: 0.6.2 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): 
detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index da35213..88bcdbe 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,140 +1,141 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.md codemeta.json conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/Makefile.local docs/README.md docs/conf.py docs/dev-info.rst docs/index.rst docs/metadata-workflow.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/.gitignore docs/images/Makefile docs/images/tasks-metadata-indexers.uml sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql 
sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql sql/upgrades/124.sql sql/upgrades/125.sql sql/upgrades/126.sql sql/upgrades/127.sql sql/upgrades/128.sql sql/upgrades/129.sql sql/upgrades/130.sql sql/upgrades/131.sql sql/upgrades/132.sql sql/upgrades/133.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/py.typed swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-superuser-init.sql swh/indexer/sql/20-enums.sql swh/indexer/sql/30-schema.sql swh/indexer/sql/50-data.sql swh/indexer/sql/50-func.sql swh/indexer/sql/60-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/exc.py swh/indexer/storage/in_memory.py swh/indexer/storage/interface.py swh/indexer/storage/metrics.py swh/indexer/storage/model.py swh/indexer/storage/writer.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/serializers.py swh/indexer/storage/api/server.py 
swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_codemeta.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_indexer.py swh/indexer/tests/test_journal_client.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/test_tasks.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/conftest.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_init.py swh/indexer/tests/storage/test_metrics.py +swh/indexer/tests/storage/test_model.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py index d14d107..3eace16 100644 --- a/swh/indexer/storage/model.py +++ b/swh/indexer/storage/model.py @@ -1,135 +1,138 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Classes used internally by the in-memory idx-storage, and will be used for the interface of the idx-storage in the near future.""" from __future__ import annotations from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar import attr from typing_extensions import Final from swh.model.model import Sha1Git, dictify TSelf = TypeVar("TSelf") @attr.s class BaseRow: UNIQUE_KEY_FIELDS: Tuple = ("id", "indexer_configuration_id") id = attr.ib(type=Any) indexer_configuration_id = attr.ib(type=Optional[int], default=None, kw_only=True) tool = 
attr.ib(type=Optional[Dict], default=None, kw_only=True) def __attrs_post_init__(self): if self.indexer_configuration_id is None and self.tool is None: raise TypeError("Either indexer_configuration_id or tool must be not None.") if self.indexer_configuration_id is not None and self.tool is not None: raise TypeError( "indexer_configuration_id and tool are mutually exclusive; " "only one may be not None." ) def anonymize(self: TSelf) -> Optional[TSelf]: # Needed to implement swh.journal.writer.ValueProtocol return None def to_dict(self) -> Dict[str, Any]: """Wrapper of `attr.asdict` that can be overridden by subclasses that have special handling of some of the fields.""" d = dictify(attr.asdict(self, recurse=False)) if d["indexer_configuration_id"] is None: del d["indexer_configuration_id"] if d["tool"] is None: del d["tool"] return d @classmethod def from_dict(cls: Type[TSelf], d) -> TSelf: return cls(**d) # type: ignore def unique_key(self) -> Dict: - if self.indexer_configuration_id is None: - raise ValueError( - "Can only call unique_key() on objects without " - "indexer_configuration_id." 
- ) - return {key: getattr(self, key) for key in self.UNIQUE_KEY_FIELDS} + obj = self + + # tool["id"] and obj.indexer_configuration_id are the same value, but + # only one of them is set for any given object + if obj.indexer_configuration_id is None: + assert obj.tool # constructor ensures tool XOR indexer_configuration_id + obj = attr.evolve(obj, indexer_configuration_id=obj.tool["id"], tool=None) + + return {key: getattr(obj, key) for key in self.UNIQUE_KEY_FIELDS} @attr.s class ContentMimetypeRow(BaseRow): object_type: Final = "content_mimetype" id = attr.ib(type=Sha1Git) mimetype = attr.ib(type=str) encoding = attr.ib(type=str) @attr.s class ContentLanguageRow(BaseRow): object_type: Final = "content_language" id = attr.ib(type=Sha1Git) lang = attr.ib(type=str) @attr.s class ContentCtagsRow(BaseRow): object_type: Final = "content_ctags" UNIQUE_KEY_FIELDS = ( "id", "indexer_configuration_id", "name", "kind", "line", "lang", ) id = attr.ib(type=Sha1Git) name = attr.ib(type=str) kind = attr.ib(type=str) line = attr.ib(type=int) lang = attr.ib(type=str) @attr.s class ContentLicenseRow(BaseRow): object_type: Final = "content_fossology_license" UNIQUE_KEY_FIELDS = ("id", "indexer_configuration_id", "license") id = attr.ib(type=Sha1Git) license = attr.ib(type=str) @attr.s class ContentMetadataRow(BaseRow): object_type: Final = "content_metadata" id = attr.ib(type=Sha1Git) metadata = attr.ib(type=Dict[str, Any]) @attr.s class RevisionIntrinsicMetadataRow(BaseRow): object_type: Final = "revision_intrinsic_metadata" id = attr.ib(type=Sha1Git) metadata = attr.ib(type=Dict[str, Any]) mappings = attr.ib(type=List[str]) @attr.s class OriginIntrinsicMetadataRow(BaseRow): object_type: Final = "origin_intrinsic_metadata" id = attr.ib(type=str) metadata = attr.ib(type=Dict[str, Any]) from_revision = attr.ib(type=Sha1Git) mappings = attr.ib(type=List[str]) diff --git a/swh/indexer/storage/writer.py b/swh/indexer/storage/writer.py index adae76d..297b468 100644 --- 
a/swh/indexer/storage/writer.py +++ b/swh/indexer/storage/writer.py @@ -1,64 +1,66 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Callable, Dict, Iterable import attr try: from swh.journal.writer import get_journal_writer except ImportError: get_journal_writer = None # type: ignore # mypy limitation, see https://github.com/python/mypy/issues/1153 from .model import BaseRow class JournalWriter: """Journal writer storage collaborator. It's in charge of adding objects to the journal. """ def __init__(self, tool_getter: Callable[[int], Dict[str, Any]], journal_writer): """ Args: tool_getter: a callable that takes a tool_id and return a dict representing a tool object journal_writer: configuration passed to `swh.journal.writer.get_journal_writer` """ self._tool_getter = tool_getter if journal_writer: if get_journal_writer is None: raise EnvironmentError( "You need the swh.journal package to use the " "journal_writer feature" ) - self.journal = get_journal_writer(**journal_writer) + self.journal = get_journal_writer( + **journal_writer, value_sanitizer=lambda x: x + ) else: self.journal = None def write_additions(self, obj_type, entries: Iterable[BaseRow]) -> None: if not self.journal: return # usually, all the additions in a batch are from the same indexer, # so this cache allows doing a single query for all the entries. 
tool_cache = {} for entry in entries: assert entry.object_type == obj_type # type: ignore # get the tool used to generate this addition tool_id = entry.indexer_configuration_id assert tool_id if tool_id not in tool_cache: tool_cache[tool_id] = self._tool_getter(tool_id) entry = attr.evolve( entry, tool=tool_cache[tool_id], indexer_configuration_id=None ) # write to kafka self.journal.write_addition(obj_type, entry) diff --git a/swh/indexer/tests/storage/test_model.py b/swh/indexer/tests/storage/test_model.py new file mode 100644 index 0000000..d33e529 --- /dev/null +++ b/swh/indexer/tests/storage/test_model.py @@ -0,0 +1,26 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.indexer.storage.model import BaseRow, ContentLicenseRow + + +def test_unique_key(): + assert BaseRow(id=12, indexer_configuration_id=34).unique_key() == { + "id": 12, + "indexer_configuration_id": 34, + } + + assert BaseRow(id=12, tool={"id": 34, "name": "foo"}).unique_key() == { + "id": 12, + "indexer_configuration_id": 34, + } + + assert ContentLicenseRow( + id=12, indexer_configuration_id=34, license="BSD" + ).unique_key() == {"id": 12, "indexer_configuration_id": 34, "license": "BSD"} + + assert ContentLicenseRow( + id=12, tool={"id": 34, "name": "foo"}, license="BSD" + ).unique_key() == {"id": 12, "indexer_configuration_id": 34, "license": "BSD"}