diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 21a6fe9..3fb4b4d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,49 +1,43 @@ repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.4.0 - hooks: - - id: trailing-whitespace - - id: check-json - - id: check-yaml + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: trailing-whitespace + - id: check-json + - id: check-yaml -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 - hooks: - - id: flake8 + - repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 -- repo: https://github.com/codespell-project/codespell - rev: v1.16.0 - hooks: - - id: codespell - exclude: ^(swh/indexer/data/codemeta/crosswalk.csv)$ -- repo: local - hooks: - - id: mypy - name: mypy - entry: mypy - args: [swh] - pass_filenames: false - language: system - types: [python] + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + name: Check source code spelling + exclude: ^(swh/indexer/data/codemeta/crosswalk.csv)$ + stages: [commit] + - id: codespell + name: Check commit message spelling + stages: [commit-msg] -- repo: https://github.com/PyCQA/isort - rev: 5.5.2 - hooks: - - id: isort + - repo: local + hooks: + - id: mypy + name: mypy + entry: mypy + args: [swh] + pass_filenames: false + language: system + types: [python] -- repo: https://github.com/python/black - rev: 19.10b0 - hooks: - - id: black - -# unfortunately, we are far from being able to enable this... -# - repo: https://github.com/PyCQA/pydocstyle.git -# rev: 4.0.0 -# hooks: -# - id: pydocstyle -# name: pydocstyle -# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. 
-# entry: pydocstyle --convention=google -# language: python -# types: [python] + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort + - repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black diff --git a/PKG-INFO b/PKG-INFO index f9e8f71..3186c16 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,75 +1,75 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.8.2 +Version: 1.0.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute some index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata from the content_metadata table in storage, or runs the content indexer to translate files. diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst index 471ce8c..f913f49 100644 --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -1,208 +1,208 @@ Metadata workflow ================= Intrinsic metadata ------------------ Indexing :term:`intrinsic metadata` requires extracting information from the lowest levels of the :ref:`Merkle DAG ` (directories, files, and content blobs) and associating it with the highest ones (origins). In order to deduplicate the work between origins, we split this work between multiple indexers, which coordinate with each other and save their results at each step in the indexer storage. Indexer architecture -------------------- .. thumbnail:: images/tasks-metadata-indexers.svg Origin-Head Indexer ___________________ First, the Origin-Head indexer gets called externally, with an origin as argument (or multiple origins, which are handled sequentially).
For now, its tasks are scheduled manually via recurring Scheduler tasks; but in the near future, the :term:`journal` will be used to do that. It first looks up the last :term:`snapshot` and determines what the main branch of the origin is (the "Head branch") and what revision it points to (the "Head"). Intrinsic metadata for that origin will be extracted from that revision. It schedules a Revision Metadata Indexer task for that revision, with a hint that the revision is the Head of that particular origin. Revision and Content Metadata Indexers ______________________________________ These two indexers do the hard part of the work. The Revision Metadata Indexer fetches the root directory associated with a revision, then extracts the metadata from that directory. To do so, it lists files in that directory, and looks for known names, such -as `codemeta.json`, `package.json`, or `pom.xml`. If there are any, it +as :file:`codemeta.json`, :file:`package.json`, or :file:`pom.xml`. If there are any, it runs the Content Metadata Indexer on them, which in turn fetches their contents and runs them through extraction dictionaries/mappings. See below for details. Their results are saved in a database (the indexer storage), associated with the content and revision hashes. If it received a hint that this revision is the head of an origin, the Revision Metadata Indexer then schedules the Origin Metadata Indexer to run on that origin. Origin Metadata Indexer _______________________ The job of this indexer is very simple: it takes an origin identifier and a revision hash, and copies the metadata of the latter to a new table, to associate it with the former. The reason for this is to be able to perform searches on metadata, and efficiently find out which origins matched the pattern. -Running that search on the `revision_metadata` table would require either +Running that search on the ``revision_metadata`` table would require a reverse lookup from revisions to origins, which is costly. Translation from language-specific metadata to CodeMeta ------------------------------------------------------- Intrinsic metadata are extracted from files provided with a project's source code, and translated using `CodeMeta`_'s `crosswalk table`_. All input formats supported so far are straightforward dictionaries (eg. JSON) or can be accessed as such (eg. XML); and the first part of the translation is to map their keys to a term in the CodeMeta vocabulary. This is done by parsing the crosswalk table's `CSV file`_ and using it as a map between these two vocabularies; and this does not require any format-specific code in the indexers. The second part is to normalize values. As language-specific metadata files each have their way(s) of formatting these values, we need to turn them into the data type required by CodeMeta. This normalization makes up most of the code of :py:mod:`swh.indexer.metadata_dictionary`. .. _CodeMeta: https://codemeta.github.io/ .. _crosswalk table: https://codemeta.github.io/crosswalk/ .. _CSV file: https://github.com/codemeta/codemeta/blob/master/crosswalk.csv Supported intrinsic metadata ---------------------------- The following sources of intrinsic metadata are supported: * CodeMeta's `codemeta.json`_, * Maven's `pom.xml`_, * NPM's `package.json`_, * Python's `PKG-INFO`_, * Ruby's `.gemspec`_ .. _codemeta.json: https://codemeta.github.io/terms/ .. _pom.xml: https://maven.apache.org/pom.html .. _package.json: https://docs.npmjs.com/files/package.json ..
_PKG-INFO: https://www.python.org/dev/peps/pep-0314/ .. _.gemspec: https://guides.rubygems.org/specification-reference/ Supported CodeMeta terms ------------------------ The following terms may be found in the output of the metadata translation (other than the `codemeta` mapping, which is the identity function, and therefore supports all terms): .. program-output:: python3 -m swh.indexer.cli mapping list-terms --exclude-mapping codemeta :nostderr: Adding support for additional ecosystem-specific metadata --------------------------------------------------------- This section will guide you through adding code to the metadata indexer to detect and translate new metadata formats. First, you should start by picking one of the `CodeMeta crosswalks`_. -Then create a new file in `swh-indexer/swh/indexer/metadata_dictionary/`, that +Then create a new file in :file:`swh-indexer/swh/indexer/metadata_dictionary/`, that will contain your code, and create a new class that inherits from helper classes, with some documentation about your indexer: .. code-block:: python from .base import DictMapping, SingleFileMapping from swh.indexer.codemeta import CROSSWALK_TABLE class MyMapping(DictMapping, SingleFileMapping): """Dedicated class for ...""" name = 'my-mapping' filename = b'the-filename' mapping = CROSSWALK_TABLE['Name of the CodeMeta crosswalk'] .. _CodeMeta crosswalks: https://github.com/codemeta/codemeta/tree/master/crosswalks -Then, add a `string_fields` attribute, that is the list of all keys whose +Then, add a ``string_fields`` attribute, which is the list of all keys whose values are simple text values. For instance, to `translate Python PKG-INFO`_, it's: .. code-block:: python string_fields = ['name', 'version', 'description', 'summary', 'author', 'author-email'] These values will be automatically added to the above list of supported terms. .. _translate Python PKG-INFO: https://forge.softwareheritage.org/source/swh-indexer/browse/master/swh/indexer/metadata_dictionary/python.py -Last step to get your code working: add a `translate` method that will +Last step to get your code working: add a ``translate`` method that will take a single byte string as argument, turn it into a Python dictionary, whose keys are those of the input document, and pass it to -`_translate_dict`. +``_translate_dict``. For instance, if the input document is in JSON, it can be as simple as: .. code-block:: python def translate(self, raw_content): raw_content = raw_content.decode() # bytes to str content_dict = json.loads(raw_content) # str to dict return self._translate_dict(content_dict) # convert to CodeMeta -`_translate_dict` will do the heavy work of reading the crosswalk table for -each of `string_fields`, read the corresponding value in the `content_dict`, +``_translate_dict`` will do the heavy work of reading the crosswalk table for +each of ``string_fields``, reading the corresponding value in ``content_dict``, and building a CodeMeta dictionary with the corresponding names from the crosswalk table. One last thing to run your code: add it to the list in -`swh-indexer/swh/indexer/metadata_dictionary/__init__.py`, so the rest of the +:file:`swh-indexer/swh/indexer/metadata_dictionary/__init__.py`, so the rest of the code is aware of it. Now, you can run it: .. code-block:: shell python3 -m swh.indexer.metadata_dictionary MyMapping path/to/input/file and it will (hopefully) return a CodeMeta object. If it works, well done!
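Putting the pieces above together, a complete minimal mapping module could look like the following sketch (illustrative only: the class name, filename, crosswalk name, and field list are placeholders, not a mapping shipped with swh-indexer):

.. code-block:: python

    import json

    from swh.indexer.codemeta import CROSSWALK_TABLE

    from .base import DictMapping, SingleFileMapping


    class MyMapping(DictMapping, SingleFileMapping):
        """Translates the hypothetical my-tool.json format to CodeMeta."""

        name = 'my-mapping'
        filename = b'my-tool.json'
        mapping = CROSSWALK_TABLE['Name of the CodeMeta crosswalk']
        string_fields = ['name', 'version', 'description']

        def translate(self, raw_content):
            raw_content = raw_content.decode()         # bytes to str
            content_dict = json.loads(raw_content)     # str to dict
            return self._translate_dict(content_dict)  # convert to CodeMeta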
You can now improve your translation code further, by adding methods that will do more advanced conversion. For example, if there is a field named -`license` containing an SPDX identifier, you must convert it to an URI, +``license`` containing an SPDX identifier, you must convert it to a URI, like this: .. code-block:: python def normalize_license(self, s): if isinstance(s, str): return {"@id": "https://spdx.org/licenses/" + s} -This method will automatically get called by `_translate_dict` when it -finds a `license` field in `content_dict`. +This method will automatically get called by ``_translate_dict`` when it +finds a ``license`` field in ``content_dict``. diff --git a/requirements-test.txt b/requirements-test.txt index c343e8f..94659ce 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,9 +1,9 @@ confluent-kafka hypothesis >= 3.11.0 -pytest +pytest < 7.0.0 # v7.0.0 removed _pytest.tmpdir.TempdirFactory, which is used by some of the pytest plugins we use pytest-mock swh.scheduler[testing] >= 0.5.0 swh.storage[testing] >= 0.10.0 types-click types-pyyaml diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index f9e8f71..3186c16 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,75 +1,75 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.8.2 +Version: 1.0.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute some index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata from the content_metadata table in storage, or runs the content indexer to translate files.
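To make the indexation procedure described in the README above concrete, here is a rough sketch of the loop every indexer follows (illustrative only: the real logic, with batching and error handling, lives in swh/indexer/indexer.py, and the callables here stand in for the objstorage and indexer storage clients):

.. code-block:: python

    from typing import Any, Callable, Dict, Iterable, List

    def run_indexer(
        ids: Iterable[bytes],
        fetch: Callable[[bytes], bytes],
        index: Callable[[bytes, bytes], Dict[str, Any]],
        store: Callable[[List[Dict[str, Any]]], None],
    ) -> None:
        # 1. receive a batch of ids (content sha1s, revision ids, origin urls)
        results = []
        for id_ in ids:
            # 2. retrieve the associated data for that object type
            data = fetch(id_)
            # 3. compute an index for that object
            results.append(index(id_, data))
        # 4. store the results in the indexer storage
        store(results)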
diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 55e837c..525d815 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,143 +1,143 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.md codemeta.json conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/Makefile.local docs/README.md docs/cli.rst docs/conf.py docs/dev-info.rst docs/index.rst docs/metadata-workflow.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/.gitignore docs/images/Makefile docs/images/tasks-metadata-indexers.uml sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json -sql/upgrades/115.sql -sql/upgrades/116.sql -sql/upgrades/117.sql -sql/upgrades/118.sql -sql/upgrades/119.sql -sql/upgrades/120.sql -sql/upgrades/121.sql -sql/upgrades/122.sql -sql/upgrades/123.sql -sql/upgrades/124.sql -sql/upgrades/125.sql -sql/upgrades/126.sql -sql/upgrades/127.sql -sql/upgrades/128.sql -sql/upgrades/129.sql -sql/upgrades/130.sql -sql/upgrades/131.sql -sql/upgrades/132.sql -sql/upgrades/133.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/py.typed swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/cff.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-superuser-init.sql swh/indexer/sql/20-enums.sql swh/indexer/sql/30-schema.sql swh/indexer/sql/50-data.sql swh/indexer/sql/50-func.sql swh/indexer/sql/60-indexes.sql +swh/indexer/sql/upgrades/115.sql +swh/indexer/sql/upgrades/116.sql +swh/indexer/sql/upgrades/117.sql +swh/indexer/sql/upgrades/118.sql +swh/indexer/sql/upgrades/119.sql +swh/indexer/sql/upgrades/120.sql +swh/indexer/sql/upgrades/121.sql +swh/indexer/sql/upgrades/122.sql +swh/indexer/sql/upgrades/123.sql +swh/indexer/sql/upgrades/124.sql +swh/indexer/sql/upgrades/125.sql +swh/indexer/sql/upgrades/126.sql +swh/indexer/sql/upgrades/127.sql +swh/indexer/sql/upgrades/128.sql +swh/indexer/sql/upgrades/129.sql +swh/indexer/sql/upgrades/130.sql +swh/indexer/sql/upgrades/131.sql +swh/indexer/sql/upgrades/132.sql +swh/indexer/sql/upgrades/133.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py 
swh/indexer/storage/exc.py swh/indexer/storage/in_memory.py swh/indexer/storage/interface.py swh/indexer/storage/metrics.py swh/indexer/storage/model.py swh/indexer/storage/writer.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/serializers.py swh/indexer/storage/api/server.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_codemeta.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_indexer.py swh/indexer/tests/test_journal_client.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/test_tasks.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/conftest.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_init.py swh/indexer/tests/storage/test_metrics.py swh/indexer/tests/storage/test_model.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/entry_points.txt b/swh.indexer.egg-info/entry_points.txt index 011df5d..e0c9a87 100644 --- a/swh.indexer.egg-info/entry_points.txt +++ b/swh.indexer.egg-info/entry_points.txt @@ -1,4 +1,2 @@ - - [swh.cli.subcommands] - indexer=swh.indexer.cli - \ No newline at end of file +[swh.cli.subcommands] +indexer = swh.indexer.cli diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 33fc812..572c47d 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,22 +1,22 @@ python-magic>=0.4.13 click frozendict<2.1.2 pyld xmltodict typing-extensions swh.core[db,http]>=0.14.0 swh.model>=0.0.15 swh.objstorage>=0.2.2 swh.scheduler>=0.5.2 swh.storage>=0.22.0 swh.journal>=0.1.0 [testing] confluent-kafka hypothesis>=3.11.0 -pytest +pytest<7.0.0 pytest-mock swh.scheduler[testing]>=0.5.0 swh.storage[testing]>=0.10.0 types-click types-pyyaml diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py index 7ad5ba4..6f15383 100644 --- a/swh/indexer/__init__.py +++ b/swh/indexer/__init__.py @@ -1,4 +1,11 @@ -# Copyright (C) 2016-2018 The Software Heritage developers +# Copyright (C) 2016-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + + +# implemented as a function to help lazy loading +def get_datastore(*args, **kw): + from .storage import get_indexer_storage + + return get_indexer_storage(*args, **kw) diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql index bb0f672..33ca574 100644 --- a/swh/indexer/sql/30-schema.sql +++ b/swh/indexer/sql/30-schema.sql @@ -1,145 +1,132 @@ --- --- Software Heritage Indexers Data Model --- --- drop schema if exists swh cascade; --- create schema swh; --- set search_path to swh; - -create table dbversion -( - version int primary key, - release timestamptz, - description text -); - -insert into dbversion(version, release, description) - values(133, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git)
create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); create table indexer_configuration ( id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_name is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -- Properties (mimetype, encoding, etc...) create table content_mimetype ( id sha1 not null, mimetype text not null, encoding text not null, indexer_configuration_id bigint not null ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( id sha1 not null, lang languages not null, indexer_configuration_id bigint not null ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, indexer_configuration_id bigint not null ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; create table fossology_license( id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; create table content_fossology_license ( id sha1 not null, license_id smallserial not null, indexer_configuration_id bigint not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; -- The table content_metadata provides a translation to files -- identified as potentially containing metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file';
comment on column content_metadata.metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; -- The table revision_intrinsic_metadata provides a minimal set of intrinsic -- metadata detected with the detection tool (indexer_configuration_id) and -- aggregated from the content_metadata translation. create table revision_intrinsic_metadata( id sha1_git not null, metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; create table origin_intrinsic_metadata( id text not null, -- origin url metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, metadata_tsvector tsvector, mappings text array not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; comment on column origin_intrinsic_metadata.id is 'url of the origin'; comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. 
pkg-info, npm)'; diff --git a/sql/upgrades/115.sql b/swh/indexer/sql/upgrades/115.sql similarity index 100% rename from sql/upgrades/115.sql rename to swh/indexer/sql/upgrades/115.sql diff --git a/sql/upgrades/116.sql b/swh/indexer/sql/upgrades/116.sql similarity index 100% rename from sql/upgrades/116.sql rename to swh/indexer/sql/upgrades/116.sql diff --git a/sql/upgrades/117.sql b/swh/indexer/sql/upgrades/117.sql similarity index 100% rename from sql/upgrades/117.sql rename to swh/indexer/sql/upgrades/117.sql diff --git a/sql/upgrades/118.sql b/swh/indexer/sql/upgrades/118.sql similarity index 100% rename from sql/upgrades/118.sql rename to swh/indexer/sql/upgrades/118.sql diff --git a/sql/upgrades/119.sql b/swh/indexer/sql/upgrades/119.sql similarity index 100% rename from sql/upgrades/119.sql rename to swh/indexer/sql/upgrades/119.sql diff --git a/sql/upgrades/120.sql b/swh/indexer/sql/upgrades/120.sql similarity index 100% rename from sql/upgrades/120.sql rename to swh/indexer/sql/upgrades/120.sql diff --git a/sql/upgrades/121.sql b/swh/indexer/sql/upgrades/121.sql similarity index 100% rename from sql/upgrades/121.sql rename to swh/indexer/sql/upgrades/121.sql diff --git a/sql/upgrades/122.sql b/swh/indexer/sql/upgrades/122.sql similarity index 100% rename from sql/upgrades/122.sql rename to swh/indexer/sql/upgrades/122.sql diff --git a/sql/upgrades/123.sql b/swh/indexer/sql/upgrades/123.sql similarity index 100% rename from sql/upgrades/123.sql rename to swh/indexer/sql/upgrades/123.sql diff --git a/sql/upgrades/124.sql b/swh/indexer/sql/upgrades/124.sql similarity index 100% rename from sql/upgrades/124.sql rename to swh/indexer/sql/upgrades/124.sql diff --git a/sql/upgrades/125.sql b/swh/indexer/sql/upgrades/125.sql similarity index 100% rename from sql/upgrades/125.sql rename to swh/indexer/sql/upgrades/125.sql diff --git a/sql/upgrades/126.sql b/swh/indexer/sql/upgrades/126.sql similarity index 100% rename from sql/upgrades/126.sql rename to swh/indexer/sql/upgrades/126.sql diff --git a/sql/upgrades/127.sql b/swh/indexer/sql/upgrades/127.sql similarity index 100% rename from sql/upgrades/127.sql rename to swh/indexer/sql/upgrades/127.sql diff --git a/sql/upgrades/128.sql b/swh/indexer/sql/upgrades/128.sql similarity index 100% rename from sql/upgrades/128.sql rename to swh/indexer/sql/upgrades/128.sql diff --git a/sql/upgrades/129.sql b/swh/indexer/sql/upgrades/129.sql similarity index 100% rename from sql/upgrades/129.sql rename to swh/indexer/sql/upgrades/129.sql diff --git a/sql/upgrades/130.sql b/swh/indexer/sql/upgrades/130.sql similarity index 100% rename from sql/upgrades/130.sql rename to swh/indexer/sql/upgrades/130.sql diff --git a/sql/upgrades/131.sql b/swh/indexer/sql/upgrades/131.sql similarity index 100% rename from sql/upgrades/131.sql rename to swh/indexer/sql/upgrades/131.sql diff --git a/sql/upgrades/132.sql b/swh/indexer/sql/upgrades/132.sql similarity index 100% rename from sql/upgrades/132.sql rename to swh/indexer/sql/upgrades/132.sql diff --git a/sql/upgrades/133.sql b/swh/indexer/sql/upgrades/133.sql similarity index 100% rename from sql/upgrades/133.sql rename to swh/indexer/sql/upgrades/133.sql diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index df4d5ae..edb8704 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,723 +1,729 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public 
License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter from importlib import import_module import json from typing import Dict, Iterable, List, Optional, Tuple, Union import warnings import psycopg2 import psycopg2.pool from swh.core.db.common import db_transaction from swh.indexer.storage.interface import IndexerStorageInterface from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import SHA1_SIZE from swh.storage.exc import StorageDBError from swh.storage.utils import get_partition_bounds_bytes from . import converters from .db import Db from .exc import DuplicateId, IndexerStorageArgumentException from .interface import PagedResult, Sha1 from .metrics import process_metrics, send_metric, timed from .model import ( ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow, ) from .writer import JournalWriter INDEXER_CFG_KEY = "indexer_storage" MAPPING_NAMES = ["cff", "codemeta", "gemspec", "maven", "npm", "pkg-info"] SERVER_IMPLEMENTATIONS: Dict[str, str] = { - "local": ".IndexerStorage", + "postgresql": ".IndexerStorage", "remote": ".api.client.RemoteStorage", "memory": ".in_memory.IndexerStorage", + # deprecated + "local": ".IndexerStorage", } def get_indexer_storage(cls: str, **kwargs) -> IndexerStorageInterface: """Instantiate an indexer storage implementation of class `cls` with arguments `kwargs`. Args: cls: indexer storage class (postgresql, remote or memory) kwargs: dictionary of arguments passed to the indexer storage class constructor Returns: an instance of swh.indexer.storage Raises: ValueError if passed an unknown storage class. """ if "args" in kwargs: warnings.warn( 'Explicit "args" key is deprecated, use keys directly instead.', DeprecationWarning, ) kwargs = kwargs["args"] class_path = SERVER_IMPLEMENTATIONS.get(cls) if class_path is None: raise ValueError( f"Unknown indexer storage class `{cls}`. " f"Supported: {', '.join(SERVER_IMPLEMENTATIONS)}" ) (module_path, class_name) = class_path.rsplit(".", 1) module = import_module(module_path if module_path else ".", package=__package__) BackendClass = getattr(module, class_name) check_config = kwargs.pop("check_config", {}) idx_storage = BackendClass(**kwargs) if check_config: if not idx_storage.check_config(**check_config): raise EnvironmentError("Indexer storage check config failed") return idx_storage def check_id_duplicates(data): """ If any two row models in `data` have the same unique key, raises a `DuplicateId` error. Values associated with the key must be hashable. Args: data: List of row models to be inserted >>> check_id_duplicates([ ... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"), ... ContentLanguageRow(id=b'foo', indexer_configuration_id=32, lang="python"), ... ]) >>> check_id_duplicates([ ... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"), ... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"), ... ]) Traceback (most recent call last): ...
swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'indexer_configuration_id': 42}] """ # noqa counter = Counter(tuple(sorted(item.unique_key().items())) for item in data) duplicates = [id_ for (id_, count) in counter.items() if count >= 2] if duplicates: raise DuplicateId(list(map(dict, duplicates))) class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None): """ Args: db: either a libpq connection string, or a psycopg2 connection journal_writer: configuration passed to `swh.journal.writer.get_journal_writer` """ self.journal_writer = JournalWriter(self._tool_get_from_id, journal_writer) try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) def put_db(self, db): if db is not self._db: db.put_conn() + @db_transaction() + def get_current_version(self, *, db=None, cur=None): + return db.current_version + @timed @db_transaction() def check_config(self, *, check_write, db=None, cur=None): # Check permissions on one of the tables if check_write: check = "INSERT" else: check = "SELECT" cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,), ) return cur.fetchone()[0] @timed @db_transaction() def content_mimetype_missing( self, mimetypes: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_mimetype_missing_from_list(mimetypes, cur)] @timed @db_transaction() def get_partition( self, indexer_type: str, indexer_configuration_id: int, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, with_textual_data=False, db=None, cur=None, ) -> PagedResult[Sha1]: """Retrieve ids of content with `indexer_type` within partition partition_id, bound by limit. Args: **indexer_type**: Type of data content to index (mimetype, language, etc...) **indexer_configuration_id**: The tool used to index data **partition_id**: index of the partition to fetch **nb_partitions**: total number of partitions to split into **page_token**: opaque token used for pagination **limit**: Limit result (defaults to 1000) **with_textual_data** (bool): Deal with only textual content (True) or all content (all contents by default, False) Raises: IndexerStorageArgumentException for: - limit set to None - wrong indexer_type provided Returns: PagedResult of Sha1. If next_page_token is None, there is no more data to fetch """ if limit is None: raise IndexerStorageArgumentException("limit should not be None") if indexer_type not in db.content_indexer_names: err = f"Wrong type.
Should be one of [{','.join(db.content_indexer_names)}]" raise IndexerStorageArgumentException(err) start, end = get_partition_bounds_bytes(partition_id, nb_partitions, SHA1_SIZE) if page_token is not None: start = hash_to_bytes(page_token) if end is None: end = b"\xff" * SHA1_SIZE next_page_token: Optional[str] = None ids = [ row[0] for row in db.content_get_range( indexer_type, start, end, indexer_configuration_id, limit=limit + 1, with_textual_data=with_textual_data, cur=cur, ) ] if len(ids) >= limit: next_page_token = hash_to_hex(ids[-1]) ids = ids[:limit] assert len(ids) <= limit return PagedResult(results=ids, next_page_token=next_page_token) @timed @db_transaction() def content_mimetype_get_partition( self, indexer_configuration_id: int, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, db=None, cur=None, ) -> PagedResult[Sha1]: return self.get_partition( "mimetype", indexer_configuration_id, partition_id, nb_partitions, page_token=page_token, limit=limit, db=db, cur=cur, ) @timed @process_metrics @db_transaction() def content_mimetype_add( self, mimetypes: List[ContentMimetypeRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_mimetype", mimetypes) db.mktemp_content_mimetype(cur) db.copy_to( [m.to_dict() for m in mimetypes], "tmp_content_mimetype", ["id", "mimetype", "encoding", "indexer_configuration_id"], cur, ) count = db.content_mimetype_add_from_temp(cur) return {"content_mimetype:add": count} @timed @db_transaction() def content_mimetype_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentMimetypeRow]: return [ ContentMimetypeRow.from_dict( converters.db_to_mimetype(dict(zip(db.content_mimetype_cols, c))) ) for c in db.content_mimetype_get_from_list(ids, cur) ] @timed @db_transaction() def content_language_missing( self, languages: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_language_missing_from_list(languages, cur)] @timed @db_transaction() def content_language_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentLanguageRow]: return [ ContentLanguageRow.from_dict( converters.db_to_language(dict(zip(db.content_language_cols, c))) ) for c in db.content_language_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def content_language_add( self, languages: List[ContentLanguageRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(languages) languages.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_language", languages) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ( { "id": lang.id, "lang": lang.lang or "unknown", "indexer_configuration_id": lang.indexer_configuration_id, } for lang in languages ), "tmp_content_language", ["id", "lang", "indexer_configuration_id"], cur, ) count = db.content_language_add_from_temp(cur) return {"content_language:add": count} @timed @db_transaction() def content_ctags_missing( self, ctags: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_ctags_missing_from_list(ctags, cur)] @timed @db_transaction() def content_ctags_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentCtagsRow]: return [ ContentCtagsRow.from_dict( converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) ) for c in db.content_ctags_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() 
def content_ctags_add( self, ctags: List[ContentCtagsRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(ctags) ctags.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_ctags", ctags) db.mktemp_content_ctags(cur) db.copy_to( [ctag.to_dict() for ctag in ctags], tblname="tmp_content_ctags", columns=["id", "name", "kind", "line", "lang", "indexer_configuration_id"], cur=cur, ) count = db.content_ctags_add_from_temp(cur) return {"content_ctags:add": count} @timed @db_transaction() def content_ctags_search( self, expression: str, limit: int = 10, last_sha1: Optional[Sha1] = None, db=None, cur=None, ) -> List[ContentCtagsRow]: return [ ContentCtagsRow.from_dict( converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) ) for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur) ] @timed @db_transaction() def content_fossology_license_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentLicenseRow]: return [ ContentLicenseRow.from_dict( converters.db_to_fossology_license( dict(zip(db.content_fossology_license_cols, c)) ) ) for c in db.content_fossology_license_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def content_fossology_license_add( self, licenses: List[ContentLicenseRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(licenses) licenses.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_fossology_license", licenses) db.mktemp_content_fossology_license(cur) db.copy_to( [license.to_dict() for license in licenses], tblname="tmp_content_fossology_license", columns=["id", "license", "indexer_configuration_id"], cur=cur, ) count = db.content_fossology_license_add_from_temp(cur) return {"content_fossology_license:add": count} @timed @db_transaction() def content_fossology_license_get_partition( self, indexer_configuration_id: int, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, db=None, cur=None, ) -> PagedResult[Sha1]: return self.get_partition( "fossology_license", indexer_configuration_id, partition_id, nb_partitions, page_token=page_token, limit=limit, with_textual_data=True, db=db, cur=cur, ) @timed @db_transaction() def content_metadata_missing( self, metadata: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_metadata_missing_from_list(metadata, cur)] @timed @db_transaction() def content_metadata_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentMetadataRow]: return [ ContentMetadataRow.from_dict( converters.db_to_metadata(dict(zip(db.content_metadata_cols, c))) ) for c in db.content_metadata_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def content_metadata_add( self, metadata: List[ContentMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_metadata", metadata) db.mktemp_content_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], "tmp_content_metadata", ["id", "metadata", "indexer_configuration_id"], cur, ) count = db.content_metadata_add_from_temp(cur) return { "content_metadata:add": count, } @timed @db_transaction() def revision_intrinsic_metadata_missing( self, metadata: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [ obj[0] for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur) ] @timed @db_transaction() def revision_intrinsic_metadata_get( self, ids: Iterable[Sha1], 
db=None, cur=None ) -> List[RevisionIntrinsicMetadataRow]: return [ RevisionIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.revision_intrinsic_metadata_cols, c)) ) ) for c in db.revision_intrinsic_metadata_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def revision_intrinsic_metadata_add( self, metadata: List[RevisionIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) self.journal_writer.write_additions("revision_intrinsic_metadata", metadata) db.mktemp_revision_intrinsic_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], "tmp_revision_intrinsic_metadata", ["id", "metadata", "mappings", "indexer_configuration_id"], cur, ) count = db.revision_intrinsic_metadata_add_from_temp(cur) return { "revision_intrinsic_metadata:add": count, } @timed @db_transaction() def origin_intrinsic_metadata_get( self, urls: Iterable[str], db=None, cur=None ) -> List[OriginIntrinsicMetadataRow]: return [ OriginIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c)) ) ) for c in db.origin_intrinsic_metadata_get_from_list(urls, cur) ] @timed @process_metrics @db_transaction() def origin_intrinsic_metadata_add( self, metadata: List[OriginIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) self.journal_writer.write_additions("origin_intrinsic_metadata", metadata) db.mktemp_origin_intrinsic_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], "tmp_origin_intrinsic_metadata", ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"], cur, ) count = db.origin_intrinsic_metadata_add_from_temp(cur) return { "origin_intrinsic_metadata:add": count, } @timed @db_transaction() def origin_intrinsic_metadata_search_fulltext( self, conjunction: List[str], limit: int = 100, db=None, cur=None ) -> List[OriginIntrinsicMetadataRow]: return [ OriginIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c)) ) ) for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur ) ] @timed @db_transaction() def origin_intrinsic_metadata_search_by_producer( self, page_token: str = "", limit: int = 100, ids_only: bool = False, mappings: Optional[List[str]] = None, tool_ids: Optional[List[int]] = None, db=None, cur=None, ) -> PagedResult[Union[str, OriginIntrinsicMetadataRow]]: assert isinstance(page_token, str) # we go to limit+1 to check whether we should add next_page_token in # the response rows = db.origin_intrinsic_metadata_search_by_producer( page_token, limit + 1, ids_only, mappings, tool_ids, cur ) next_page_token = None if ids_only: results = [origin for (origin,) in rows] if len(results) > limit: results[limit:] = [] next_page_token = results[-1] else: results = [ OriginIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, row)) ) ) for row in rows ] if len(results) > limit: results[limit:] = [] next_page_token = results[-1].id return PagedResult(results=results, next_page_token=next_page_token,) @timed @db_transaction() def origin_intrinsic_metadata_stats(self, db=None, cur=None): mapping_names = [m for m in MAPPING_NAMES] select_parts = [] # Count rows for each mapping for mapping_name in mapping_names: select_parts.append( ( "sum(case when (mappings @> ARRAY['%s']) " " then 1 else 0 end)" ) % mapping_name ) # Total 
select_parts.append("sum(1)") # Rows whose metadata has at least one key that is not '@context' select_parts.append( "sum(case when ('{}'::jsonb @> (metadata - '@context')) " " then 0 else 1 end)" ) cur.execute( "select " + ", ".join(select_parts) + " from origin_intrinsic_metadata" ) results = dict(zip(mapping_names + ["total", "non_empty"], cur.fetchone())) return { "total": results.pop("total"), "non_empty": results.pop("non_empty"), "per_mapping": results, } @timed @db_transaction() def indexer_configuration_add(self, tools, db=None, cur=None): db.mktemp_indexer_configuration(cur) db.copy_to( tools, "tmp_indexer_configuration", ["tool_name", "tool_version", "tool_configuration"], cur, ) tools = db.indexer_configuration_add_from_temp(cur) results = [dict(zip(db.indexer_configuration_cols, line)) for line in tools] send_metric( "indexer_configuration:add", len(results), method_name="indexer_configuration_add", ) return results @timed @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): tool_conf = tool["tool_configuration"] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get( tool["tool_name"], tool["tool_version"], tool_conf ) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) @db_transaction() def _tool_get_from_id(self, id_, db, cur): tool = dict( zip( db.indexer_configuration_cols, db.indexer_configuration_get_from_id(id_, cur), ) ) return { "id": tool["id"], "name": tool["tool_name"], "version": tool["tool_version"], "configuration": tool["tool_configuration"], } diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py index 6f41d44..6526625 100644 --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,550 +1,551 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, Iterable, Iterator, List from swh.core.db import BaseDb from swh.core.db.db_utils import execute_values_generator, stored_procedure from swh.model import hashutil from .interface import Sha1 class Db(BaseDb): """Proxy to the SWH Indexer DB, with wrappers around stored procedures """ content_mimetype_hash_keys = ["id", "indexer_configuration_id"] + current_version = 133 def _missing_from_list( self, table: str, data: Iterable[Dict], hash_keys: List[str], cur=None ): """Read from table the data with hash_keys that are missing. Args: table: Table name (e.g content_mimetype, content_language, etc...) data: Dict of data to read from hash_keys: List of keys to read in the data dict. Yields: The data which is missing from the db. """ cur = self._cursor(cur) keys = ", ".join(hash_keys) equality = " AND ".join(("t.%s = c.%s" % (key, key)) for key in hash_keys) yield from execute_values_generator( cur, """ select %s from (values %%s) as t(%s) where not exists ( select 1 from %s c where %s ) """ % (keys, keys, table, equality), (tuple(m[k] for k in hash_keys) for m in data), ) def content_mimetype_missing_from_list( self, mimetypes: Iterable[Dict], cur=None ) -> Iterator[Sha1]: """List missing mimetypes. 
""" yield from self._missing_from_list( "content_mimetype", mimetypes, self.content_mimetype_hash_keys, cur=cur ) content_mimetype_cols = [ "id", "mimetype", "encoding", "tool_id", "tool_name", "tool_version", "tool_configuration", ] @stored_procedure("swh_mktemp_content_mimetype") def mktemp_content_mimetype(self, cur=None): pass def content_mimetype_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("select * from swh_content_mimetype_add()") return cur.fetchone()[0] def _convert_key(self, key, main_table="c"): """Convert keys according to specific use in the module. Args: key (str): Key expression to change according to the alias used in the query main_table (str): Alias to use for the main table. Default to c for content_{something}. Expected: Tables content_{something} being aliased as 'c' (something in {language, mimetype, ...}), table indexer_configuration being aliased as 'i'. """ if key == "id": return "%s.id" % main_table elif key == "tool_id": return "i.id as tool_id" elif key == "license": return ( """ ( select name from fossology_license where id = %s.license_id ) as licenses""" % main_table ) return key def _get_from_list(self, table, ids, cols, cur=None, id_col="id"): """Fetches entries from the `table` such that their `id` field (or whatever is given to `id_col`) is in `ids`. Returns the columns `cols`. The `cur` parameter is used to connect to the database. """ cur = self._cursor(cur) keys = map(self._convert_key, cols) query = """ select {keys} from (values %s) as t(id) inner join {table} c on c.{id_col}=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id; """.format( keys=", ".join(keys), id_col=id_col, table=table ) yield from execute_values_generator(cur, query, ((_id,) for _id in ids)) content_indexer_names = { "mimetype": "content_mimetype", "fossology_license": "content_fossology_license", } def content_get_range( self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, cur=None, ): """Retrieve contents with content_type, within range [start, end] bound by limit and associated to the given indexer configuration id. When asking to work on textual content, that filters on the mimetype table with any mimetype that is not binary. """ cur = self._cursor(cur) table = self.content_indexer_names[content_type] if with_textual_data: extra = """inner join content_mimetype cm on (t.id=cm.id and cm.mimetype like 'text/%%' and %(start)s <= cm.id and cm.id <= %(end)s) """ else: extra = "" query = f"""select t.id from {table} t {extra} where t.indexer_configuration_id=%(tool_id)s and %(start)s <= t.id and t.id <= %(end)s order by t.indexer_configuration_id, t.id limit %(limit)s""" cur.execute( query, { "start": start, "end": end, "tool_id": indexer_configuration_id, "limit": limit, }, ) yield from cur def content_mimetype_get_from_list(self, ids, cur=None): yield from self._get_from_list( "content_mimetype", ids, self.content_mimetype_cols, cur=cur ) content_language_hash_keys = ["id", "indexer_configuration_id"] def content_language_missing_from_list(self, languages, cur=None): """List missing languages. 
""" yield from self._missing_from_list( "content_language", languages, self.content_language_hash_keys, cur=cur ) content_language_cols = [ "id", "lang", "tool_id", "tool_name", "tool_version", "tool_configuration", ] @stored_procedure("swh_mktemp_content_language") def mktemp_content_language(self, cur=None): pass def content_language_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("select * from swh_content_language_add()") return cur.fetchone()[0] def content_language_get_from_list(self, ids, cur=None): yield from self._get_from_list( "content_language", ids, self.content_language_cols, cur=cur ) content_ctags_hash_keys = ["id", "indexer_configuration_id"] def content_ctags_missing_from_list(self, ctags, cur=None): """List missing ctags. """ yield from self._missing_from_list( "content_ctags", ctags, self.content_ctags_hash_keys, cur=cur ) content_ctags_cols = [ "id", "name", "kind", "line", "lang", "tool_id", "tool_name", "tool_version", "tool_configuration", ] @stored_procedure("swh_mktemp_content_ctags") def mktemp_content_ctags(self, cur=None): pass def content_ctags_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("select * from swh_content_ctags_add()") return cur.fetchone()[0] def content_ctags_get_from_list(self, ids, cur=None): cur = self._cursor(cur) keys = map(self._convert_key, self.content_ctags_cols) yield from execute_values_generator( cur, """ select %s from (values %%s) as t(id) inner join content_ctags c on c.id=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id order by line """ % ", ".join(keys), ((_id,) for _id in ids), ) def content_ctags_search(self, expression, last_sha1, limit, cur=None): cur = self._cursor(cur) if not last_sha1: query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s)""" % ( ",".join(self.content_ctags_cols) ) cur.execute(query, (expression, limit)) else: if last_sha1 and isinstance(last_sha1, bytes): last_sha1 = "\\x%s" % hashutil.hash_to_hex(last_sha1) elif last_sha1: last_sha1 = "\\x%s" % last_sha1 query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( ",".join(self.content_ctags_cols) ) cur.execute(query, (expression, limit, last_sha1)) yield from cur content_fossology_license_cols = [ "id", "tool_id", "tool_name", "tool_version", "tool_configuration", "license", ] @stored_procedure("swh_mktemp_content_fossology_license") def mktemp_content_fossology_license(self, cur=None): pass def content_fossology_license_add_from_temp(self, cur=None): """Add new licenses per content. """ cur = self._cursor(cur) cur.execute("select * from swh_content_fossology_license_add()") return cur.fetchone()[0] def content_fossology_license_get_from_list(self, ids, cur=None): """Retrieve licenses per id. """ cur = self._cursor(cur) keys = map(self._convert_key, self.content_fossology_license_cols) yield from execute_values_generator( cur, """ select %s from (values %%s) as t(id) inner join content_fossology_license c on t.id=c.id inner join indexer_configuration i on i.id=c.indexer_configuration_id """ % ", ".join(keys), ((_id,) for _id in ids), ) content_metadata_hash_keys = ["id", "indexer_configuration_id"] def content_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. 
""" yield from self._missing_from_list( "content_metadata", metadata, self.content_metadata_hash_keys, cur=cur ) content_metadata_cols = [ "id", "metadata", "tool_id", "tool_name", "tool_version", "tool_configuration", ] @stored_procedure("swh_mktemp_content_metadata") def mktemp_content_metadata(self, cur=None): pass def content_metadata_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("select * from swh_content_metadata_add()") return cur.fetchone()[0] def content_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( "content_metadata", ids, self.content_metadata_cols, cur=cur ) revision_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"] def revision_intrinsic_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. """ yield from self._missing_from_list( "revision_intrinsic_metadata", metadata, self.revision_intrinsic_metadata_hash_keys, cur=cur, ) revision_intrinsic_metadata_cols = [ "id", "metadata", "mappings", "tool_id", "tool_name", "tool_version", "tool_configuration", ] @stored_procedure("swh_mktemp_revision_intrinsic_metadata") def mktemp_revision_intrinsic_metadata(self, cur=None): pass def revision_intrinsic_metadata_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("select * from swh_revision_intrinsic_metadata_add()") return cur.fetchone()[0] def revision_intrinsic_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( "revision_intrinsic_metadata", ids, self.revision_intrinsic_metadata_cols, cur=cur, ) origin_intrinsic_metadata_cols = [ "id", "metadata", "from_revision", "mappings", "tool_id", "tool_name", "tool_version", "tool_configuration", ] origin_intrinsic_metadata_regconfig = "pg_catalog.simple" """The dictionary used to normalize 'metadata' and queries. 'pg_catalog.simple' provides no stopword, so it should be suitable for proper names and non-English content. 
    def origin_intrinsic_metadata_search_by_producer(
        self, last, limit, ids_only, mappings, tool_ids, cur
    ):
        if ids_only:
            keys = "oim.id"
        else:
            keys = ", ".join(
                (
                    self._convert_key(col, "oim")
                    for col in self.origin_intrinsic_metadata_cols
                )
            )
        query_parts = [
            "SELECT %s" % keys,
            "FROM origin_intrinsic_metadata AS oim",
            "INNER JOIN indexer_configuration AS i",
            "ON oim.indexer_configuration_id=i.id",
        ]
        args = []

        where = []
        if last:
            where.append("oim.id > %s")
            args.append(last)
        if mappings is not None:
            where.append("oim.mappings && %s")
            args.append(list(mappings))
        if tool_ids is not None:
            where.append("oim.indexer_configuration_id = ANY(%s)")
            args.append(list(tool_ids))
        if where:
            query_parts.append("WHERE")
            query_parts.append(" AND ".join(where))

        if limit:
            query_parts.append("LIMIT %s")
            args.append(limit)

        cur.execute(" ".join(query_parts), args)
        yield from cur

    indexer_configuration_cols = [
        "id", "tool_name", "tool_version", "tool_configuration",
    ]

    @stored_procedure("swh_mktemp_indexer_configuration")
    def mktemp_indexer_configuration(self, cur=None):
        pass

    def indexer_configuration_add_from_temp(self, cur=None):
        cur = self._cursor(cur)
        cur.execute(
            "SELECT %s from swh_indexer_configuration_add()"
            % (",".join(self.indexer_configuration_cols),)
        )
        yield from cur

    def indexer_configuration_get(
        self, tool_name, tool_version, tool_configuration, cur=None
    ):
        cur = self._cursor(cur)
        cur.execute(
            """select %s
               from indexer_configuration
               where tool_name=%%s and
                     tool_version=%%s and
                     tool_configuration=%%s"""
            % (",".join(self.indexer_configuration_cols)),
            (tool_name, tool_version, tool_configuration),
        )
        return cur.fetchone()

    def indexer_configuration_get_from_id(self, id_, cur=None):
        cur = self._cursor(cur)
        cur.execute(
            """select %s
               from indexer_configuration
               where id=%%s"""
            % (",".join(self.indexer_configuration_cols)),
            (id_,),
        )
        return cur.fetchone()
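    # Note on pagination: origin_intrinsic_metadata_search_by_producer
    # (above) uses keyset pagination rather than OFFSET. Callers pass the
    # last id of the previous page as `last`, which becomes the
    # "oim.id > %s" filter, so each page resumes right after the previous
    # one and the ORDER of ids stays stable across pages.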
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
index 5ae6aeb..d42c3e3 100644
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -1,127 +1,134 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import timedelta
+from functools import partial
import os
-from os import path
from typing import List, Tuple
from unittest.mock import patch

import pytest
+from pytest_postgresql import factories
import yaml

-from swh.core.db.pytest_plugin import postgresql_fact
-import swh.indexer
+from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact
from swh.indexer.storage import get_indexer_storage
+from swh.indexer.storage.db import Db as IndexerDb
from swh.objstorage.factory import get_objstorage
from swh.storage import get_storage

from .utils import fill_obj_storage, fill_storage

TASK_NAMES: List[Tuple[str, str]] = [
    # (scheduler-task-type, task-class-test-name)
    ("index-revision-metadata", "revision_intrinsic_metadata"),
    ("index-origin-metadata", "origin_intrinsic_metadata"),
]

-SQL_FILES = path.join(path.dirname(swh.indexer.__file__), "sql", "*.sql")
-
-
-idx_storage_postgresql = postgresql_fact(
-    "postgresql_proc", dbname="indexer_storage", dump_files=SQL_FILES,
-)
+idx_postgresql_proc = factories.postgresql_proc(
+    dbname="indexer_storage",
+    load=[
+        partial(
+            initialize_database_for_module,
+            modname="indexer",
+            version=IndexerDb.current_version,
+        )
+    ],
+)

+idx_storage_postgresql = postgresql_fact("idx_postgresql_proc")
+

@pytest.fixture
def indexer_scheduler(swh_scheduler):
    # Insert the expected task types within the scheduler
    for task_name, task_class_name in TASK_NAMES:
        swh_scheduler.create_task_type(
            {
                "type": task_name,
                "description": f"The {task_class_name} indexer testing task",
                "backend_name": f"swh.indexer.tests.tasks.{task_class_name}",
                "default_interval": timedelta(days=1),
                "min_interval": timedelta(hours=6),
                "max_interval": timedelta(days=12),
                "num_retries": 3,
            }
        )
    return swh_scheduler

@pytest.fixture
def idx_storage_backend_config(idx_storage_postgresql):
    """Basic pg storage configuration with no journal collaborator for the
    indexer storage (to avoid pulling optional dependency on clients of this
    fixture)

    """
    return {
        "cls": "local",
        "db": idx_storage_postgresql.dsn,
    }

@pytest.fixture
def swh_indexer_config(
    swh_storage_backend_config, idx_storage_backend_config, swh_scheduler_config
):
    return {
        "storage": swh_storage_backend_config,
        "objstorage": {"cls": "memory"},
        "indexer_storage": idx_storage_backend_config,
        "scheduler": {"cls": "local", **swh_scheduler_config},
        "tools": {
            "name": "file",
            "version": "1:5.30-1+deb9u1",
            "configuration": {"type": "library", "debian-package": "python3-magic"},
        },
        "compute_checksums": ["blake2b512"],  # for rehash indexer
    }

@pytest.fixture
def idx_storage(swh_indexer_config):
    """An instance of in-memory indexer storage that gets injected into all
    indexers classes.

    """
    idx_storage_config = swh_indexer_config["indexer_storage"]
    return get_indexer_storage(**idx_storage_config)

@pytest.fixture
def storage(swh_indexer_config):
    """An instance of in-memory storage that gets injected into all indexers
    classes.

    """
    storage = get_storage(**swh_indexer_config["storage"])
    fill_storage(storage)
    return storage

@pytest.fixture
def obj_storage(swh_indexer_config):
    """An instance of in-memory objstorage that gets injected into all
    indexers classes.

    """
    objstorage = get_objstorage(**swh_indexer_config["objstorage"])
    fill_obj_storage(objstorage)
    with patch.dict(
        "swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage}
    ):
        yield objstorage
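# The fixture hunk above replaces dump_files-based schema loading with
# pytest-postgresql's `load` mechanism: the callables run once against the
# template database, so each test database is cloned from it instead of
# replaying the SQL files per test. A minimal sketch of the same pattern for
# another module (names hypothetical):
#
#   from functools import partial
#   from pytest_postgresql import factories
#   from swh.core.db.pytest_plugin import (
#       initialize_database_for_module, postgresql_fact,
#   )
#
#   example_proc = factories.postgresql_proc(
#       dbname="example",
#       load=[partial(initialize_database_for_module, modname="example", version=1)],
#   )
#   example_postgresql = postgresql_fact("example_proc")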
""" objstorage = get_objstorage(**swh_indexer_config["objstorage"]) fill_obj_storage(objstorage) with patch.dict( "swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage} ): yield objstorage @pytest.fixture def swh_config(swh_indexer_config, monkeypatch, tmp_path): conffile = os.path.join(str(tmp_path), "indexer.yml") with open(conffile, "w") as f: f.write(yaml.dump(swh_indexer_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) return conffile diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py index 6939f7b..1bf13b2 100644 --- a/swh/indexer/tests/storage/__init__.py +++ b/swh/indexer/tests/storage/__init__.py @@ -1,10 +1,10 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os import path -import swh.storage +import swh.indexer SQL_DIR = path.join(path.dirname(swh.indexer.__file__), "sql") diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py index 133404b..c301e92 100644 --- a/swh/indexer/tests/storage/conftest.py +++ b/swh/indexer/tests/storage/conftest.py @@ -1,80 +1,78 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os.path import join import pytest from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.model import ContentLicenseRow, ContentMimetypeRow +from swh.indexer.tests.conftest import idx_storage_postgresql from swh.model.hashutil import hash_to_bytes -from swh.storage.pytest_plugin import postgresql_fact from . 
diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py
index 6939f7b..1bf13b2 100644
--- a/swh/indexer/tests/storage/__init__.py
+++ b/swh/indexer/tests/storage/__init__.py
@@ -1,10 +1,10 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from os import path

-import swh.storage
+import swh.indexer

SQL_DIR = path.join(path.dirname(swh.indexer.__file__), "sql")

diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py
index 133404b..c301e92 100644
--- a/swh/indexer/tests/storage/conftest.py
+++ b/swh/indexer/tests/storage/conftest.py
@@ -1,80 +1,78 @@
# Copyright (C) 2015-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from os.path import join

import pytest

from swh.indexer.storage import get_indexer_storage
from swh.indexer.storage.model import ContentLicenseRow, ContentMimetypeRow
+from swh.indexer.tests.conftest import idx_storage_postgresql
from swh.model.hashutil import hash_to_bytes
-from swh.storage.pytest_plugin import postgresql_fact

from . import SQL_DIR
from .generate_data_test import FOSSOLOGY_LICENSES, MIMETYPE_OBJECTS, TOOLS

DUMP_FILES = join(SQL_DIR, "*.sql")

class DataObj(dict):
    def __getattr__(self, key):
        return self.__getitem__(key)

    def __setattr__(self, key, value):
        return self.__setitem__(key, value)

@pytest.fixture
def swh_indexer_storage_with_data(swh_indexer_storage):
    data = DataObj()
    tools = {
        tool["tool_name"]: {
            "id": tool["id"],
            "name": tool["tool_name"],
            "version": tool["tool_version"],
            "configuration": tool["tool_configuration"],
        }
        for tool in swh_indexer_storage.indexer_configuration_add(TOOLS)
    }
    data.tools = tools
    data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
    data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
    data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
    data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
    data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
    data.origin_url_1 = "file:///dev/0/zero"  # 44434341
    data.origin_url_2 = "file:///dev/1/one"  # 44434342
    data.origin_url_3 = "file:///dev/2/two"  # 54974445
    data.mimetypes = [
        ContentMimetypeRow(indexer_configuration_id=tools["file"]["id"], **mimetype_obj)
        for mimetype_obj in MIMETYPE_OBJECTS
    ]
    swh_indexer_storage.content_mimetype_add(data.mimetypes)
    data.fossology_licenses = [
        ContentLicenseRow(
            id=fossology_obj["id"],
            indexer_configuration_id=tools["nomos"]["id"],
            license=license,
        )
        for fossology_obj in FOSSOLOGY_LICENSES
        for license in fossology_obj["licenses"]
    ]

    swh_indexer_storage._test_data = data

    return (swh_indexer_storage, data)

-swh_indexer_storage_postgresql = postgresql_fact(
-    "postgresql_proc", dump_files=DUMP_FILES
-)
+swh_indexer_storage_postgresql = idx_storage_postgresql

@pytest.fixture
def swh_indexer_storage(swh_indexer_storage_postgresql):
    return get_indexer_storage(
        "local",
        db=swh_indexer_storage_postgresql.dsn,
        journal_writer={"cls": "memory"},
    )

diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index d3bddc5..afc1f31 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,1318 +1,1330 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import unittest

from hypothesis import HealthCheck, given, settings, strategies

from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow
from swh.indexer.tests.utils import DIRECTORY2, REVISION
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Directory, DirectoryEntry, Revision

from .utils import (
    BASE_TEST_CONFIG,
    YARN_PARSER_METADATA,
    fill_obj_storage,
    fill_storage,
    json_document_strategy,
    xml_document_strategy,
)

TRANSLATOR_TOOL = {
    "name": "swh-metadata-translator",
    "version": "0.0.2",
    "configuration": {"type": "local", "context": "NpmMapping"},
}

class ContentMetadataTestIndexer(ContentMetadataIndexer):
    """Specific metadata indexer whose configuration is enough to satisfy
    the indexing tests.
    """

    def parse_config_file(self, *args, **kwargs):
        assert False, "should not be called; the rev indexer configures it."
"""Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the rev indexer configures it." REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS["NpmMapping"]() self.codemeta_mapping = MAPPINGS["CodemetaMapping"]() self.maven_mapping = MAPPINGS["MavenMapping"]() self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]() self.gemspec_mapping = MAPPINGS["GemspecMapping"]() self.cff_mapping = MAPPINGS["CffMapping"]() def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_cff(self): """ testing CITATION.cff translation """ # given content = """# YAML 1.2 --- abstract: "Command line program to convert from Citation File \ Format to various other formats such as BibTeX, EndNote, RIS, \ schema.org, CodeMeta, and .zenodo.json." authors: - affiliation: "Netherlands eScience Center" family-names: Klaver given-names: Tom - affiliation: "Humboldt-Universität zu Berlin" family-names: Druskat given-names: Stephan orcid: https://orcid.org/0000-0003-4925-7248 cff-version: "1.0.3" date-released: 2019-11-12 doi: 10.5281/zenodo.1162057 keywords: - "citation" - "bibliography" - "cff" - "CITATION.cff" license: Apache-2.0 message: "If you use this software, please cite it using these metadata." 
    def test_compute_metadata_cff(self):
        """
        testing CITATION.cff translation
        """
        # given
        content = """# YAML 1.2
---
abstract: "Command line program to convert from Citation File \
Format to various other formats such as BibTeX, EndNote, RIS, \
schema.org, CodeMeta, and .zenodo.json."
authors:
  -
    affiliation: "Netherlands eScience Center"
    family-names: Klaver
    given-names: Tom
  -
    affiliation: "Humboldt-Universität zu Berlin"
    family-names: Druskat
    given-names: Stephan
    orcid: https://orcid.org/0000-0003-4925-7248
cff-version: "1.0.3"
date-released: 2019-11-12
doi: 10.5281/zenodo.1162057
keywords:
  - "citation"
  - "bibliography"
  - "cff"
  - "CITATION.cff"
license: Apache-2.0
message: "If you use this software, please cite it using these metadata."
repository-code: "https://github.com/citation-file-format/cff-converter-python"
title: cffconvert
version: "1.4.0-alpha0"
""".encode("utf-8")

        expected = {
            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
            "type": "SoftwareSourceCode",
            "author": [
                {
                    "type": "Person",
                    "affiliation": {
                        "type": "Organization",
                        "name": "Netherlands eScience Center",
                    },
                    "familyName": "Klaver",
                    "givenName": "Tom",
                },
                {
                    "id": "https://orcid.org/0000-0003-4925-7248",
                    "type": "Person",
                    "affiliation": {
                        "type": "Organization",
                        "name": "Humboldt-Universität zu Berlin",
                    },
                    "familyName": "Druskat",
                    "givenName": "Stephan",
                },
            ],
            "codeRepository": (
                "https://github.com/citation-file-format/cff-converter-python"
            ),
            "datePublished": "2019-11-12",
            "description": """Command line program to convert from \
Citation File Format to various other formats such as BibTeX, EndNote, \
RIS, schema.org, CodeMeta, and .zenodo.json.""",
            "identifier": "https://doi.org/10.5281/zenodo.1162057",
            "keywords": ["citation", "bibliography", "cff", "CITATION.cff"],
            "license": "https://spdx.org/licenses/Apache-2.0",
            "version": "1.4.0-alpha0",
        }

        # when
        result = self.cff_mapping.translate(content)
        # then
        self.assertEqual(expected, result)

    def test_compute_metadata_npm(self):
        """
        testing only computation of metadata with hard_mapping_npm
        """
        # given
        content = b"""
            {
                "name": "test_metadata",
                "version": "0.0.2",
                "description": "Simple package.json test for indexer",
                "repository": {
                    "type": "git",
                    "url": "https://github.com/moranegg/metadata_test"
                },
                "author": {
                    "email": "moranegg@example.com",
                    "name": "Morane G"
                }
            }
        """
        declared_metadata = {
            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
            "type": "SoftwareSourceCode",
            "name": "test_metadata",
            "version": "0.0.2",
            "description": "Simple package.json test for indexer",
            "codeRepository": "git+https://github.com/moranegg/metadata_test",
            "author": [
                {"type": "Person", "name": "Morane G", "email": "moranegg@example.com"}
            ],
        }

        # when
        result = self.npm_mapping.translate(content)
        # then
        self.assertEqual(declared_metadata, result)
"Person", "name": "Isaac Z. Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "keywords": [ "install", "modules", "package manager", "package.json", ], "url": "https://docs.npmjs.com/", }, ), ] for result in results: del result.tool["id"] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", }, ) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", }, ) def test_detect_metadata_package_json(self): # given df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": b"package.json", "target": b"aab", "length": 712, "status": "visible", "type": "file", "perms": 
    def test_detect_metadata_package_json(self):
        # given
        df = [
            {
                "sha1_git": b"abc",
                "name": b"index.js",
                "target": b"abc",
                "length": 897,
                "status": "visible",
                "type": "file",
                "perms": 33188,
                "dir_id": b"dir_a",
                "sha1": b"bcd",
            },
            {
                "sha1_git": b"aab",
                "name": b"package.json",
                "target": b"aab",
                "length": 712,
                "status": "visible",
                "type": "file",
                "perms": 33188,
                "dir_id": b"dir_a",
                "sha1": b"cde",
            },
        ]

        # when
        results = detect_metadata(df)

        expected_results = {"NpmMapping": [b"cde"]}
        # then
        self.assertEqual(expected_results, results)

    def test_detect_metadata_codemeta_json_uppercase(self):
        # given
        df = [
            {
                "sha1_git": b"abc",
                "name": b"index.html",
                "target": b"abc",
                "length": 897,
                "status": "visible",
                "type": "file",
                "perms": 33188,
                "dir_id": b"dir_a",
                "sha1": b"bcd",
            },
            {
                "sha1_git": b"aab",
                "name": b"CODEMETA.json",
                "target": b"aab",
                "length": 712,
                "status": "visible",
                "type": "file",
                "perms": 33188,
                "dir_id": b"dir_a",
                "sha1": b"bcd",
            },
        ]

        # when
        results = detect_metadata(df)

        expected_results = {"CodemetaMapping": [b"bcd"]}
        # then
        self.assertEqual(expected_results, results)
"https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation", }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": ["metadata", "software"], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_codemeta_alternate_context(self): raw_content = b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", "codeRepository": ( "http://repo1.maven.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error parsing XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error detecting XML encoding from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def 
    def test_compute_metadata_maven_invalid_encoding(self):
-        expected_warning = (
-            "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
-            "Error unidecoding XML from foo"
-        )
+        expected_warning = [
+            # libexpat1 <= 2.2.10-2+deb11u1
+            [
+                (
+                    "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
+                    "Error unidecoding XML from foo"
+                )
+            ],
+            # libexpat1 >= 2.2.10-2+deb11u2
+            [
+                (
+                    "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
+                    "Error parsing XML from foo"
+                )
+            ],
+        ]

        raw_content = b"""
        """
        with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
            result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
-        self.assertEqual(cm.output, [expected_warning])
+        self.assertIn(cm.output, expected_warning)
        self.assertEqual(result, None)
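    # Note on the hunk above: expected_warning is now a list of acceptable
    # outcomes (one per libexpat behavior), each itself a full cm.output
    # list, so assertIn(cm.output, expected_warning) passes when the
    # captured warnings match either variant exactly.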
"com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_multiple(self): """Tests when there are multiple code repos and licenses.""" raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": [ "https://www.apache.org/licenses/LICENSE-2.0.txt", "https://opensource.org/licenses/MIT", ], "codeRepository": [ "http://repo1.maven.org/maven2/com/mycompany/app/my-app", "http://example.org/maven2/com/mycompany/app/my-app", ], }, ) def test_compute_metadata_pkginfo(self): raw_content = b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual( result["description"], [ "Software Heritage core utilities", # note the comma here "swh-core\n" "========\n" "\n" "core library for swh's modules:\n" "- config parser\n" "- hash computations\n" "- serialization\n" "- logging mechanism\n" "", ], result, ) del result["description"] self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "url": "https://forge.softwareheritage.org/diffusion/DCORE/", "name": "swh.core", "author": [ { "type": "Person", "name": "Software Heritage developers", "email": "swh-devel@inria.fr", } ], "version": "0.0.49", }, ) def test_compute_metadata_pkginfo_utf8(self): raw_content = b"""\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "snowpyt", "description": "foo\nHydrology N°83", }, ) def test_compute_metadata_pkginfo_keywords(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo Keywords: foo bar baz """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": 
"SoftwareSourceCode", "name": "foo", "keywords": ["foo", "bar", "baz"], }, ) def test_compute_metadata_pkginfo_license(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo License: MIT """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "license": "MIT", }, ) def test_gemspec_base(self): raw_content = b""" Gem::Specification.new do |s| s.name = 'example' s.version = '0.1.0' s.licenses = ['MIT'] s.summary = "This is an example!" s.description = "Much longer explanation of the example!" s.authors = ["Ruby Coder"] s.email = 'rubycoder@example.com' s.files = ["lib/example.rb"] s.homepage = 'https://rubygems.org/gems/example' s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("description"), ["This is an example!", "Much longer explanation of the example!"], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder"}], "name": "example", "license": "https://spdx.org/licenses/MIT", "codeRepository": "https://rubygems.org/gems/example", "email": "rubycoder@example.com", "version": "0.1.0", }, ) def test_gemspec_two_author_fields(self): raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1"] s.author = "Ruby Coder2" end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("author"), [ {"type": "Person", "name": "Ruby Coder1"}, {"type": "Person", "name": "Ruby Coder2"}, ], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_gemspec_invalid_author(self): raw_content = b""" Gem::Specification.new do |s| s.author = ["Ruby Coder"] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder1"}], }, ) def test_gemspec_alternative_header(self): raw_content = b""" require './lib/version' Gem::Specification.new { |s| s.name = 'rb-system-with-aliases' s.summary = 'execute system commands with aliases' } """ result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "rb-system-with-aliases", "description": "execute system commands with aliases", }, ) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(NpmMapping.mapping))) def test_npm_adversarial(self, doc): raw = json.dumps(doc).encode() self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=CODEMETA_TERMS)) def 
    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(keys=CODEMETA_TERMS))
    def test_codemeta_adversarial(self, doc):
        raw = json.dumps(doc).encode()
        self.codemeta_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(
        xml_document_strategy(
            keys=list(MavenMapping.mapping),
            root="project",
            xmlns="http://maven.apache.org/POM/4.0.0",
        )
    )
    def test_maven_adversarial(self, doc):
        self.maven_mapping.translate(doc)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(
        strategies.dictionaries(
            # keys
            strategies.one_of(
                strategies.text(), *map(strategies.just, GemspecMapping.mapping)
            ),
            # values
            strategies.recursive(
                strategies.characters(),
                lambda children: strategies.lists(children, min_size=1),
            ),
        )
    )
    def test_gemspec_adversarial(self, doc):
        parts = [b"Gem::Specification.new do |s|\n"]
        for (k, v) in doc.items():
            parts.append(" s.{} = {}\n".format(k, repr(v)).encode())
        parts.append(b"end\n")
        self.gemspec_mapping.translate(b"".join(parts))

    def test_revision_metadata_indexer(self):
        metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
        )
        assert tool is not None
        rev = REVISION
        assert rev.directory == DIRECTORY2.id

        metadata_indexer.idx_storage.content_metadata_add(
            [
                ContentMetadataRow(
                    id=DIRECTORY2.entries[0].target,
                    indexer_configuration_id=tool["id"],
                    metadata=YARN_PARSER_METADATA,
                )
            ]
        )

        metadata_indexer.run([rev.id])

        results = list(
            metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
        )

        expected_results = [
            RevisionIntrinsicMetadataRow(
                id=rev.id,
                tool=TRANSLATOR_TOOL,
                metadata=YARN_PARSER_METADATA,
                mappings=["npm"],
            )
        ]

        for result in results:
            del result.tool["id"]

        # then
        self.assertEqual(results, expected_results)

    def test_revision_metadata_indexer_single_root_dir(self):
        metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        # Add a parent directory, that is the only directory at the root
        # of the revision
        rev = REVISION
        assert rev.directory == DIRECTORY2.id

        directory = Directory(
            entries=(
                DirectoryEntry(
                    name=b"foobar-1.0.0",
                    type="dir",
                    target=rev.directory,
                    perms=16384,
                ),
            ),
        )
        assert directory.id is not None
        metadata_indexer.storage.directory_add([directory])

        new_rev_dict = {**rev.to_dict(), "directory": directory.id}
        new_rev_dict.pop("id")
        new_rev = Revision.from_dict(new_rev_dict)
        metadata_indexer.storage.revision_add([new_rev])

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
        )
        assert tool is not None

        metadata_indexer.idx_storage.content_metadata_add(
            [
                ContentMetadataRow(
                    id=DIRECTORY2.entries[0].target,
                    indexer_configuration_id=tool["id"],
                    metadata=YARN_PARSER_METADATA,
                )
            ]
        )

        metadata_indexer.run([new_rev.id])

        results = list(
            metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
        )

        expected_results = [
            RevisionIntrinsicMetadataRow(
                id=new_rev.id,
                tool=TRANSLATOR_TOOL,
                metadata=YARN_PARSER_METADATA,
                mappings=["npm"],
            )
        ]

        for result in results:
            del result.tool["id"]

        # then
        self.assertEqual(results, expected_results)