diff --git a/.gitignore b/.gitignore index 1c279bb..f7a062e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,17 @@ *.pyc *.sw? *~ /.coverage /.coverage.* .eggs/ __pycache__ *.egg-info/ build/ dist/ version.txt /sql/createdb-stamp /sql/filldb-stamp .tox/ .hypothesis/ .mypy_cache/ +.vscode/ \ No newline at end of file diff --git a/CONTRIBUTORS b/CONTRIBUTORS index a1a7b45..d040f10 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,2 +1,3 @@ +Kumar Shivendu Siddharth Ravikumar Thibault Allançon diff --git a/PKG-INFO b/PKG-INFO index a1441e2..57272cb 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,71 +1,75 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.7.0 +Version: 0.8.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ -Description: swh-indexer - ============ - - Tools to compute multiple indexes on SWH's raw contents: - - content: - - mimetype - - ctags - - language - - fossology-license - - metadata - - revision: - - metadata - - An indexer is in charge of: - - looking up objects - - extracting information from those objects - - store those information in the swh-indexer db - - There are multiple indexers working on different object types: - - content indexer: works with content sha1 hashes - - revision indexer: works with revision sha1 hashes - - origin indexer: works with origin identifiers - - Indexation procedure: - - receive batch of ids - - retrieve the associated data depending on object type - - compute for that object some index - - store the result to swh's storage - - Current content indexers: - - - mimetype (queue swh_indexer_content_mimetype): detect the encoding - and mimetype - - - language (queue swh_indexer_content_language): detect the - programming language - - - ctags (queue swh_indexer_content_ctags): compute tags information - - - fossology-license (queue swh_indexer_fossology_license): compute the - license - - - metadata: translate file into translated_metadata dict - - Current revision indexers: - - - metadata: detects files containing metadata and retrieves translated_metadata - in content_metadata table in storage or run content indexer to translate - files. 
- Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing +License-File: LICENSE +License-File: AUTHORS + +swh-indexer +============ + +Tools to compute multiple indexes on SWH's raw contents: +- content: + - mimetype + - ctags + - language + - fossology-license + - metadata +- revision: + - metadata + +An indexer is in charge of: +- looking up objects +- extracting information from those objects +- store those information in the swh-indexer db + +There are multiple indexers working on different object types: + - content indexer: works with content sha1 hashes + - revision indexer: works with revision sha1 hashes + - origin indexer: works with origin identifiers + +Indexation procedure: +- receive batch of ids +- retrieve the associated data depending on object type +- compute for that object some index +- store the result to swh's storage + +Current content indexers: + +- mimetype (queue swh_indexer_content_mimetype): detect the encoding + and mimetype + +- language (queue swh_indexer_content_language): detect the + programming language + +- ctags (queue swh_indexer_content_ctags): compute tags information + +- fossology-license (queue swh_indexer_fossology_license): compute the + license + +- metadata: translate file into translated_metadata dict + +Current revision indexers: + +- metadata: detects files containing metadata and retrieves translated_metadata + in content_metadata table in storage or run content indexer to translate + files. 
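Editorial aside: the indexation procedure listed above corresponds to the loop implemented by the indexer base classes that appear later in this diff (filter / index / persist_index_computations). A minimal sketch, with placeholder names for the storage handles and the per-object computation:

```python
# Illustrative sketch of the indexation procedure described above; `objstorage`,
# `idx_storage` and `compute_index` stand in for the real components wired up
# by the ContentIndexer/RevisionIndexer/OriginIndexer base classes.
def index_batch(ids, objstorage, idx_storage, compute_index):
    results = []
    for id_ in ids:                                # receive a batch of ids
        data = objstorage.get(id_)                 # retrieve the associated data
        results.append(compute_index(id_, data))   # compute some index for it
    idx_storage.add(results)                       # store the result in swh's storage
    return results
```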
+ + diff --git a/requirements-swh.txt b/requirements-swh.txt index 4f72e9b..fd8a344 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ -swh.core[db,http] >= 0.9.1 +swh.core[db,http] >= 0.14.0 swh.model >= 0.0.15 swh.objstorage >= 0.2.2 swh.scheduler >= 0.5.2 swh.storage >= 0.22.0 swh.journal >= 0.1.0 diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index a1441e2..57272cb 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,75 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.7.0 +Version: 0.8.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ -Description: swh-indexer - ============ - - Tools to compute multiple indexes on SWH's raw contents: - - content: - - mimetype - - ctags - - language - - fossology-license - - metadata - - revision: - - metadata - - An indexer is in charge of: - - looking up objects - - extracting information from those objects - - store those information in the swh-indexer db - - There are multiple indexers working on different object types: - - content indexer: works with content sha1 hashes - - revision indexer: works with revision sha1 hashes - - origin indexer: works with origin identifiers - - Indexation procedure: - - receive batch of ids - - retrieve the associated data depending on object type - - compute for that object some index - - store the result to swh's storage - - Current content indexers: - - - mimetype (queue swh_indexer_content_mimetype): detect the encoding - and mimetype - - - language (queue swh_indexer_content_language): detect the - programming language - - - ctags (queue swh_indexer_content_ctags): compute tags information - - - fossology-license (queue swh_indexer_fossology_license): compute the - license - - - metadata: translate file into translated_metadata dict - - Current revision indexers: - - - metadata: detects files containing metadata and retrieves translated_metadata - in content_metadata table in storage or run content indexer to translate - files. 
- Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing +License-File: LICENSE +License-File: AUTHORS + +swh-indexer +============ + +Tools to compute multiple indexes on SWH's raw contents: +- content: + - mimetype + - ctags + - language + - fossology-license + - metadata +- revision: + - metadata + +An indexer is in charge of: +- looking up objects +- extracting information from those objects +- store those information in the swh-indexer db + +There are multiple indexers working on different object types: + - content indexer: works with content sha1 hashes + - revision indexer: works with revision sha1 hashes + - origin indexer: works with origin identifiers + +Indexation procedure: +- receive batch of ids +- retrieve the associated data depending on object type +- compute for that object some index +- store the result to swh's storage + +Current content indexers: + +- mimetype (queue swh_indexer_content_mimetype): detect the encoding + and mimetype + +- language (queue swh_indexer_content_language): detect the + programming language + +- ctags (queue swh_indexer_content_ctags): compute tags information + +- fossology-license (queue swh_indexer_fossology_license): compute the + license + +- metadata: translate file into translated_metadata dict + +Current revision indexers: + +- metadata: detects files containing metadata and retrieves translated_metadata + in content_metadata table in storage or run content indexer to translate + files. 
+ + diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 9f4e312..55e837c 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,142 +1,143 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.md codemeta.json conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/Makefile.local docs/README.md docs/cli.rst docs/conf.py docs/dev-info.rst docs/index.rst docs/metadata-workflow.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/.gitignore docs/images/Makefile docs/images/tasks-metadata-indexers.uml sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql sql/upgrades/124.sql sql/upgrades/125.sql sql/upgrades/126.sql sql/upgrades/127.sql sql/upgrades/128.sql sql/upgrades/129.sql sql/upgrades/130.sql sql/upgrades/131.sql sql/upgrades/132.sql sql/upgrades/133.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/py.typed swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py +swh/indexer/metadata_dictionary/cff.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-superuser-init.sql swh/indexer/sql/20-enums.sql swh/indexer/sql/30-schema.sql swh/indexer/sql/50-data.sql swh/indexer/sql/50-func.sql swh/indexer/sql/60-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/exc.py swh/indexer/storage/in_memory.py swh/indexer/storage/interface.py swh/indexer/storage/metrics.py swh/indexer/storage/model.py swh/indexer/storage/writer.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/serializers.py swh/indexer/storage/api/server.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py swh/indexer/tests/test_cli.py swh/indexer/tests/test_codemeta.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_indexer.py swh/indexer/tests/test_journal_client.py swh/indexer/tests/test_metadata.py 
swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/test_tasks.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/conftest.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_init.py swh/indexer/tests/storage/test_metrics.py swh/indexer/tests/storage/test_model.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 2f9c7ae..48a5c82 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,19 +1,19 @@ click python-magic>=0.4.13 pyld xmltodict typing-extensions -swh.core[db,http]>=0.9.1 +swh.core[db,http]>=0.14.0 swh.model>=0.0.15 swh.objstorage>=0.2.2 swh.scheduler>=0.5.2 swh.storage>=0.22.0 swh.journal>=0.1.0 [testing] confluent-kafka pytest pytest-mock hypothesis>=3.11.0 swh.scheduler[testing]>=0.5.0 swh.storage[testing]>=0.10.0 diff --git a/swh/indexer/data/codemeta/CITATION b/swh/indexer/data/codemeta/CITATION index 92290ce..87ea0e9 100644 --- a/swh/indexer/data/codemeta/CITATION +++ b/swh/indexer/data/codemeta/CITATION @@ -1,2 +1,2 @@ Matthew B. Jones, Carl Boettiger, Abby Cabunoc Mayes, Arfon Smith, Peter Slaughter, Kyle Niemeyer, Yolanda Gil, Martin Fenner, Krzysztof Nowak, Mark Hahnel, Luke Coy, Alice Allen, Mercè Crosas, Ashley Sands, Neil Chue Hong, Patricia Cruse, Daniel S. Katz, Carole Goble. 2017. CodeMeta: an exchange schema for software metadata. Version 2.0. KNB Data Repository. 
doi:10.5063/schema/codemeta-2.0 -swh:1:dir:21824b406ede4d40ac37c693ca9aebd02c85229e;origin=https://github.com/codemeta/codemeta +swh:1:dir:f39a0ef0005ad0dee50dcd546231ed568cf8705d;origin=https://github.com/codemeta/codemeta diff --git a/swh/indexer/data/codemeta/codemeta.jsonld b/swh/indexer/data/codemeta/codemeta.jsonld index 5e19122..5fc4022 100644 --- a/swh/indexer/data/codemeta/codemeta.jsonld +++ b/swh/indexer/data/codemeta/codemeta.jsonld @@ -1,80 +1,79 @@ { "@context": { "type": "@type", "id": "@id", "schema":"http://schema.org/", "codemeta": "https://codemeta.github.io/terms/", "Organization": {"@id": "schema:Organization"}, "Person": {"@id": "schema:Person"}, "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"}, "SoftwareApplication": {"@id": "schema:SoftwareApplication"}, "Text": {"@id": "schema:Text"}, "URL": {"@id": "schema:URL"}, "address": { "@id": "schema:address"}, "affiliation": { "@id": "schema:affiliation"}, "applicationCategory": { "@id": "schema:applicationCategory", "@type": "@id"}, "applicationSubCategory": { "@id": "schema:applicationSubCategory", "@type": "@id"}, "citation": { "@id": "schema:citation"}, "codeRepository": { "@id": "schema:codeRepository", "@type": "@id"}, "contributor": { "@id": "schema:contributor"}, "copyrightHolder": { "@id": "schema:copyrightHolder"}, "copyrightYear": { "@id": "schema:copyrightYear"}, - "creator": { "@id": "schema:creator"}, "dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date" }, "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date" }, "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date" }, "description": { "@id": "schema:description"}, "downloadUrl": { "@id": "schema:downloadUrl", "@type": "@id"}, "email": { "@id": "schema:email"}, "editor": { "@id": "schema:editor"}, "encoding": { "@id": "schema:encoding"}, "familyName": { "@id": "schema:familyName"}, "fileFormat": { "@id": "schema:fileFormat", "@type": "@id"}, "fileSize": { "@id": "schema:fileSize"}, "funder": { "@id": "schema:funder"}, "givenName": { "@id": "schema:givenName"}, "hasPart": { "@id": "schema:hasPart" }, "identifier": { "@id": "schema:identifier", "@type": "@id"}, "installUrl": { "@id": "schema:installUrl", "@type": "@id"}, "isAccessibleForFree": { "@id": "schema:isAccessibleForFree"}, "isPartOf": { "@id": "schema:isPartOf"}, "keywords": { "@id": "schema:keywords"}, "license": { "@id": "schema:license", "@type": "@id"}, "memoryRequirements": { "@id": "schema:memoryRequirements", "@type": "@id"}, "name": { "@id": "schema:name"}, "operatingSystem": { "@id": "schema:operatingSystem"}, "permissions": { "@id": "schema:permissions"}, "position": { "@id": "schema:position"}, "processorRequirements": { "@id": "schema:processorRequirements"}, "producer": { "@id": "schema:producer"}, "programmingLanguage": { "@id": "schema:programmingLanguage"}, "provider": { "@id": "schema:provider"}, "publisher": { "@id": "schema:publisher"}, "relatedLink": { "@id": "schema:relatedLink", "@type": "@id"}, "releaseNotes": { "@id": "schema:releaseNotes", "@type": "@id"}, "runtimePlatform": { "@id": "schema:runtimePlatform"}, "sameAs": { "@id": "schema:sameAs", "@type": "@id"}, "softwareHelp": { "@id": "schema:softwareHelp"}, "softwareRequirements": { "@id": "schema:softwareRequirements", "@type": "@id"}, "softwareVersion": { "@id": "schema:softwareVersion"}, "sponsor": { "@id": "schema:sponsor"}, "storageRequirements": { "@id": "schema:storageRequirements", "@type": "@id"}, "supportingData": { "@id": "schema:supportingData"}, 
"targetProduct": { "@id": "schema:targetProduct"}, "url": { "@id": "schema:url", "@type": "@id"}, "version": { "@id": "schema:version"}, "author": { "@id": "schema:author", "@container": "@list" }, "softwareSuggestions": { "@id": "codemeta:softwareSuggestions", "@type": "@id"}, "contIntegration": { "@id": "codemeta:contIntegration", "@type": "@id"}, "buildInstructions": { "@id": "codemeta:buildInstructions", "@type": "@id"}, "developmentStatus": { "@id": "codemeta:developmentStatus", "@type": "@id"}, "embargoDate": { "@id":"codemeta:embargoDate", "@type": "schema:Date" }, "funding": { "@id": "codemeta:funding" }, "readme": { "@id":"codemeta:readme", "@type": "@id" }, "issueTracker": { "@id":"codemeta:issueTracker", "@type": "@id" }, "referencePublication": { "@id": "codemeta:referencePublication", "@type": "@id"}, "maintainer": { "@id": "codemeta:maintainer" } } } diff --git a/swh/indexer/data/codemeta/crosswalk.csv b/swh/indexer/data/codemeta/crosswalk.csv index d5fb64d..fb0a688 100644 --- a/swh/indexer/data/codemeta/crosswalk.csv +++ b/swh/indexer/data/codemeta/crosswalk.csv @@ -1,77 +1,76 @@ Parent Type,Property,Type,Description,codemeta-V1,DataCite,OntoSoft,Zenodo,GitHub,Figshare,Software Ontology,Software Discovery Index,Dublin Core,R Package Description,Debian Package,Python Distutils (PyPI),Python PKG-INFO,Trove Software Map,Perl Module Description (CPAN::Meta),NodeJS,Java (Maven),Octave,Ruby Gem,ASCL,DOAP,Wikidata,Citation File Format Core (CFF-Core) 1.0.2 -schema:SoftwareSourceCode,codeRepository,URL,"Link to the repository where the un-compiled, human readable code and related code is located (SVN, github, CodePlex).",codeRepository,,,relatedLink,html_url,relatedLink,,,,URL,HomePage,,,,resources.repository,repository,repositories,,homepage,site_list,repository,source code repository,repository-code -schema:SoftwareSourceCode,programmingLanguage,ComputerLanguage or Text,The computer programming language.,programmingLanguage,Format,hasProgrammingLanguage,,languages_url,,programming language,,,,,classifiers['Programming Language'],,Programming Language,,,,,,,programming-language,programming language, -schema:SoftwareSourceCode,runtimePlatform,Text,"Runtime platform or script interpreter dependencies (Example - Java v1, Python2.3, .Net Framework 3.0). Supersedes runtime.",,,,,,,,,,,,,,,,,,,platform,,platform,, +schema:SoftwareSourceCode,codeRepository,URL,"Link to the repository where the un-compiled, human readable code and related code is located (SVN, github, CodePlex).",codeRepository,,,relatedLink,html_url,relatedLink,,,,URL,HomePage,,,,resources.repository,repository,scm / repositories,,homepage,site_list,repository,P1324,repository-code +schema:SoftwareSourceCode,programmingLanguage,ComputerLanguage or Text,The computer programming language.,programmingLanguage,Format,hasProgrammingLanguage,,languages_url,,programming language,,,,,classifiers['Programming Language'],,Programming Language,,,,,,,programming-language,P277, +schema:SoftwareSourceCode,runtimePlatform,Text,"Runtime platform or script interpreter dependencies (Example - Java v1, Python2.3, .Net Framework 3.0). Supersedes runtime.",,,,,,,,,,,,,,,,engines,,,platform,,platform,P400, schema:SoftwareSourceCode,targetProduct,SoftwareApplication,"Target Operating System / Product to which the code applies. If applies to several versions, just the product name can be used.",,,,,,,,,,,,,,,,,,,,,,, schema:SoftwareApplication,applicationCategory,Text or URL,"Type of software application, e.g. 
'Game, Multimedia'.",,,hasSoftwareCategory,communities,,categories,,,,,,classifiers['Topic'],,Topic,Categories,,,Categories,,,,, schema:SoftwareApplication,applicationSubCategory,Text or URL,"Subcategory of the application, e.g. 'Arcade Game'.",,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,downloadUrl,URL,"If the file can be downloaded, URL to download the binary.",downloadLink,,,,archive_url,,,,,,,download_url,Download-URL,,,,,,,,download-page,,repository-artifact -schema:SoftwareApplication,fileSize,Text,"Size of the application / package (e.g. 18MB). In the absence of a unit (MB, KB etc.), KB will be assumed.",,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,downloadUrl,URL,"If the file can be downloaded, URL to download the binary.",downloadLink,,,,archive_url,,,,,,,download_url,Download-URL,,,,,,,,download-page,P4945,repository-artifact +schema:SoftwareApplication,fileSize,Text,"Size of the application / package (e.g. 18MB). In the absence of a unit (MB, KB etc.), KB will be assumed.",,size,,,,,,,,,,,,,,,,,,,,P3575, schema:SoftwareApplication,installUrl,URL,"URL at which the app may be installed, if different from the URL of the item.",,,,,,,,,,,,,,,,,,,,,download-mirror,, schema:SoftwareApplication,memoryRequirements,Text or URL,Minimum memory requirements.,,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,operatingSystem,Text,"Operating systems supported (Windows 7, OSX 10.6, Android 1.6).",operatingSystems,,SupportsOperatingSystem,,,,,,,,,classifiers['Operating System'],,Operating System,OSNAMES,os,,,,,os,operating system, +schema:SoftwareApplication,operatingSystem,Text,"Operating systems supported (Windows 7, OSX 10.6, Android 1.6).",operatingSystems,,SupportsOperatingSystem,,,,,,,,,classifiers['Operating System'],,Operating System,OSNAMES,os,,,,,os,P306, schema:SoftwareApplication,permissions,Text,"Permission(s) required to run the app (for example, a mobile app may require full internet access or may run only on wifi).",,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,processorRequirements,Text,Processor architecture required to run the application (e.g. IA64).,,,,,,,,,,,,,,,,cpu / engines,,,,,,, +schema:SoftwareApplication,processorRequirements,Text,Processor architecture required to run the application (e.g. 
IA64).,,,,,,,,,,,,,,,,cpu,,,,,,, schema:SoftwareApplication,releaseNotes,Text or URL,Description of what changed in this version.,,,,,,,,,,,,,,,,,,,,,,, schema:SoftwareApplication,softwareHelp,CreativeWork,Software application help.,,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,softwareRequirements,SoftwareSourceCode,Required software dependencies,depends,,hasDependency->Software,,,,,"""Platform, environment, and dependencies""",,"Depends, SystemRequirements",,install_requires,Requires,Database Environment,prereqs,dependencies / bundledDependencies / bundleDependencies / peerDependencies,prerequisites,"Depends, SystemRequirements","requirements, add_runtime_dependency",,,depends on software, -schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,software version, +schema:SoftwareApplication,softwareRequirements,SoftwareSourceCode,Required software dependencies,depends,,hasDependency->Software,,,,,"""Platform, environment, and dependencies""",,"Depends, SystemRequirements",,install_requires,Requires,Database Environment,prereqs,dependencies / bundledDependencies / bundleDependencies / peerDependencies,prerequisites,"Depends, SystemRequirements","requirements, add_runtime_dependency",,,P1547, +schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,P348, schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,,, schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author/authors,,developer,,authors -schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,,,,,,,,,[ctb] in Author,,,,,,contributor,,,,,developer,, +schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. 
That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,creator,[aut] in Author,,author,Author,,,author,,,author/authors,,developer,P50,authors +schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,,P2860, +schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,contributor,,,,,,,,[ctb] in Author,,,,,,contributors,,,,,developer,P767, schema:CreativeWork,copyrightHolder,Organization or Person,The party holding the legal copyright to the CreativeWork.,agents [role=copyrightHolder],,,,,,,,,,,,,,,,,,,,,, schema:CreativeWork,copyrightYear,Number,The year during which the claimed copyright for the CreativeWork was first asserted.,,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,creator,Organization or Person,The creator/author of this CreativeWork. This is the same as the Author property for CreativeWork.,agent,,,,,,,,creator,[cre] in Author,,,,,,author,,,,,,, -schema:CreativeWork,dateCreated,Date or DateTime,The date on which the CreativeWork was created or the item was added to a DataFeed.,dateCreated,date,,,created_at,,,,created,,Date,,,,,,,,,,,, -schema:CreativeWork,dateModified,Date or DateTime,The date on which the CreativeWork was most recently modified or when the item's entry was modified within a DataFeed.,dateModified,date,,,updated_at,,,,,,,,,last-updated,,,,,,,,, -schema:CreativeWork,datePublished,Date,Date of first broadcast/publication.,datePublished,publicationYear,,date_published,,date_retrieved,,,date,Date,,,,,,,,Date,,,,publication date,date-released -schema:CreativeWork,editor,Person,Specifies the Person who edited the CreativeWork.,,,,,,,,,,,,,,,,,,,,,,editor, +schema:CreativeWork,dateCreated,Date or DateTime,The date on which the CreativeWork was created or the item was added to a DataFeed.,dateCreated,date,,,created_at,,,,created,,Date,,,,,,,,,,,P571, +schema:CreativeWork,dateModified,Date or DateTime,The date on which the CreativeWork was most recently modified or when the item's entry was modified within a DataFeed.,dateModified,date,,,updated_at,,,,,,,,,last-updated,,,,,,,,P5017, +schema:CreativeWork,datePublished,Date,Date of first broadcast/publication.,datePublished,publicationYear,,date_published,,date_retrieved,,,date,Date,,,,,,,,Date,,,,P577,date-released +schema:CreativeWork,editor,Person,Specifies the Person who edited the CreativeWork.,,,,,,,,,,,,,,,,,,,,,,P98, schema:CreativeWork,encoding,MediaObject,A media object that encodes this CreativeWork. This property is a synonym for associatedMedia. Supersedes encodings.,,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,fileFormat,Text or URL,"Media type, typically MIME format (see IANA site) of the content e.g. application/zip of a SoftwareApplication binary. In cases where a CreativeWork has several media type representations, 'encoding' can be used to indicate each MediaObject alongside particular fileFormat information. Unregistered or niche file formats can be indicated instead via the most appropriate URL, e.g. defining Web page or a Wikipedia entry.",,Format,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,funder,Organization or Person,A person or organization that supports (sponsors) something through some kind of financial contribution.,fundingReference.funderName,,,contributors.Funder,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,keywords,Text,Keywords or tags used to describe this content. 
Multiple entries in a keywords list are typically delimited by commas.,controlledTerms,subject,hasDomainKeywords,keywords,,tags,,,,,,keywords,Keywords,,keywords,keywords,,,,,category,,keywords -schema:CreativeWork,license,CreativeWork or URL,"A license document that applies to this content, typically indicated by URL.",licenseId,rights,License,license,license,License,software license,Software license,license,License,,license,License,license,license,license,license,License,license/licenses,,license,license,license/license-url -schema:CreativeWork,producer,Organization or Person,"The person or organization who produced the work (e.g. music album, movie, tv/radio series etc.).",,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,fileFormat,Text or URL,"Media type, typically MIME format (see IANA site) of the content e.g. application/zip of a SoftwareApplication binary. In cases where a CreativeWork has several media type representations, 'encoding' can be used to indicate each MediaObject alongside particular fileFormat information. Unregistered or niche file formats can be indicated instead via the most appropriate URL, e.g. defining Web page or a Wikipedia entry.",,Format,,,,,,,,,,,,,,,,,,,,P2701, +schema:CreativeWork,funder,Organization or Person,A person or organization that supports (sponsors) something through some kind of financial contribution.,fundingReference.funderName,funderName,,contributors.Funder,,,,,,,,,,,,,,,,,,P859, +schema:CreativeWork,keywords,Text,Keywords or tags used to describe this content. Multiple entries in a keywords list are typically delimited by commas.,controlledTerms,subject,hasDomainKeywords,keywords,,tags,,,,,,keywords,Keywords,,keywords,keywords,,,,,category,P921,keywords +schema:CreativeWork,license,CreativeWork or URL,"A license document that applies to this content, typically indicated by URL.",licenseId,rights,License,license,license,License,software license,Software license,license,License,,license,License,license,license,license,license,License,license/licenses,,license,P275,license/license-url +schema:CreativeWork,producer,Organization or Person,"The person or organization who produced the work (e.g. music album, movie, tv/radio series etc.).",,,,,,,,,,,,,,,,,,,,,,P162, schema:CreativeWork,provider,Organization or Person,"The service provider, service operator, or service performer; the goods producer. Another party (a seller) may offer those services or goods on behalf of the provider. A provider may also serve as the seller. Supersedes carrier.",,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,publisher,Organization or Person,The publisher of the creative work.,publisher,publisher,os:hasPublisher,,,,software publisher organization,,publisher,,,,,,,,,,,,vendor,, +schema:CreativeWork,publisher,Organization or Person,The publisher of the creative work.,publisher,publisher,os:hasPublisher,,,,software publisher organization,,publisher,,,,,,,,,,,,vendor,P123, schema:CreativeWork,sponsor,Organization or Person,"A person or organization that supports a thing through a pledge, promise, or financial contribution. e.g. 
a sponsor of a Medical Study or a corporate sponsor of an event.",,,,,,,,,,,,,,,,,,,,,,, schema:CreativeWork,version,Number or Text,The version of the CreativeWork embodied by a specified resource.,version,version,hasSoftwareVersion,,,,Version,Software version,dcterms:hasVersion,,numeric_version,Version,Version,version,,version,version,version,version,,,,version schema:CreativeWork,isAccessibleForFree,Boolean,A flag to signal that the publication is accessible for free.,,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,isPartOf,CreativeWork,Indicates a CreativeWork that this CreativeWork is (in some sense) part of. Reverse property hasPart,,,,,,,,,,,,,,,,,,,,,,,references -schema:CreativeWork,hasPart,CreativeWork,Indicates a CreativeWork that is (in some sense) a part of this CreativeWork. Reverse property isPartOf,,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,isPartOf,CreativeWork,Indicates a CreativeWork that this CreativeWork is (in some sense) part of. Reverse property hasPart,,,,,,,,,,,,,,,,,,,,,,P361,references +schema:CreativeWork,hasPart,CreativeWork,Indicates a CreativeWork that is (in some sense) a part of this CreativeWork. Reverse property isPartOf,,,,,,,,,,,,,,,,,,,,,,P527, schema:CreativeWork,position,Integer or Text,"The position of an item in a series or sequence of items. (While schema.org considers this a property of CreativeWork, it is also the way to indicate ordering in any list (e.g. the Authors list). By default arrays are unordered in JSON-LD",,,,,,,,,,,,,,,,,,,,,,, schema:Thing,description,Text,A description of the item.,description,description,hasShortDescription,description/notes,description,Description,software,,description,Description,Description,"description, long_description",Summary / Description,description,"abstract, description",description,description,Description,"summary, description",abstract,,,abstract schema:Thing,identifier,PropertyValue or URL,"The identifier property represents any kind of identifier for any kind of Thing, such as ISBNs, GTIN codes, UUIDs etc. Schema.org provides dedicated properties for representing many of these, either as textual strings or as URL (URI) links. See background notes for more details.",identifier,identifier,hasUniqueId,id,id,,,Persistent Identifier,identifier,Package,Package,,,,,name,groupId,,,ascl_id,,,doi -schema:Thing,name,Text,"The name of the item (software, Organization)",name,,hasName,title,full_name,Title,SoftwareTitle,Software title,title,Title,,name,Name,Title,name,name,name,name,name,title,,,title -schema:Thing,sameAs,URL,"URL of a reference Web page that unambiguously indicates the item's identity. E.g. the URL of the item's Wikipedia page, Wikidata entry, or official website.",,,,,,,,,,,,,,,,,,,,,,, -schema:Thing,url,URL,URL of the item.,URL,,,,,,,,,URL,,url,Home-Page,,,homepage,,URL,,,homepage,official website,url -schema:Thing,relatedLink,URL,"A link related to this object, e.g. related web pages",,RelateIdentifier,,,,,,,,,,,,,,,,,,,,, -schema:Person,givenName,Text,"Given name. In the U.S., the first name of a Person. This can be used along with familyName instead of the name property",,givenName,,,,,,,,givenName,,,,,,,,,,,,,person.given-names -schema:Person,familyName,Text,"Family name. In the U.S., the last name of an Person. 
This can be used along with givenName instead of the name property.",,familyName,,,,,,,,familyName,,,,,,,,,,,,,person.name-particle + person.family-names + person.name-suffix -schema:Person,email,Text,Email address,email,,,,,,,,,email,,author_email,Author-email,,email-address,author.email,,,email,email,,,person.email/entity.email +schema:Thing,name,Text,"The name of the item (software, Organization)",name,title ,hasName,title,full_name,Title,SoftwareTitle,Software title,title,Title,,name,Name,Title,name,name,name,name,name,title,,,title +schema:Thing,sameAs,URL,"URL of a reference Web page that unambiguously indicates the item's identity. E.g. the URL of the item's Wikipedia page, Wikidata entry, or official website.",,,,,,,,,,,,,,,,,,,,,,P2888, +schema:Thing,url,URL,URL of the item.,URL,,,,,,,,,URL,,url,Home-Page,,,homepage,,URL,,,homepage,P856,url +schema:Thing,relatedLink,URL,"A link related to this object, e.g. related web pages",,RelatedIdentifier,,,,,,,,,,,,,,,,,,,,, +schema:Person,givenName,Text,"Given name. In the U.S., the first name of a Person. This can be used along with familyName instead of the name property",,givenName,,,,,,,,givenName,,,,,,,,,,,,P735,person.given-names +schema:Person,familyName,Text,"Family name. In the U.S., the last name of an Person. This can be used along with givenName instead of the name property.",,familyName,,,,,,,,familyName,,,,,,,,,,,,P734,person.name-particle + person.family-names + person.name-suffix +schema:Person,email,Text,Email address,email,,,,,,,,,email,,author_email,Author-email,,email-address,author.email,,,email,email,,P968,person.email/entity.email schema:Person,affiliation,Text,"An organization that this person is affiliated with. For example, a school/university",affiliation,affiliation,,affiliation,,,,,,,,,,,,,,,,,,,person.affiliation schema:Person,identifier,URL,"URL identifer, ideally an ORCID ID for individuals, a FundRef ID for funders",identifier,nameIdentifier,,ORCID,,ORCID,,,,,,,,,,,,,,,,,person.orcid / entity.orcid -schema:Person,name,Text,"The name of an Organization, or if separate given and family names cannot be resolved for a Person",,,,name,,name,,,,,,,,,author:contact-name,author.name,,,,,,,entity.name +schema:Person,name,Text,"The name of an Organization, or if separate given and family names cannot be resolved for a Person",,creatorName ,,name,,name,,,,,,,,,author:contact-name,author.name,,,,,,,entity.name schema:Person,address,PostalAddress or Text,Physical address of the item.,,,,,,,,,,,,,,,,,,,,,,,person.address + person.city + person.region + person.post-code + person.country / entity.address + entity.city + entity.region + entity.post-code + entity.country schema,type,Object Type (from context or URI),"The object type (e.g. ""Person"", ""Organization"", ""ScientificArticle"", ""SoftwareApplication"", etc).",,,,,,,,,,,,,,,,,,,,,,,reference.type schema,id,URL,Primary identifier for an object. Must be a resolvable URL or a string used to refer to this node elsewhere in the same document,,,,,,,,,,,,,,,,,,,,,,, codemeta:SoftwareSourceCode,softwareSuggestions,SoftwareSourceCode,"Optional dependencies , e.g. 
for optional features, code development, etc",suggests,,,,,,,,,Suggests,,,,,,devDependencies / optionalDependencies,,BuildDepends,add_development_dependency,,,, codemeta:SoftwareSourceCode,maintainer,Person,Individual responsible for maintaining the software (usually includes an email contact address),uploadedBy,,,,,,,,,Maintainer,,maintainer / maintainer_email,,,,,,,,,maintainer,, codemeta:SoftwareSourceCode,contIntegration,URL,link to continuous integration service,contIntegration,,,,,,,,,,,,,,,,ciManagement,,,,,, codemeta:SoftwareSourceCode,buildInstructions,URL,link to installation instructions/documentation,buildInstructions,,,,,,,,,,,,,,,,,,,,,, codemeta:SoftwareSourceCode,developmentStatus,Text,"Description of development status, e.g. Active, inactive, supsended. See repostatus.org",developmentStatus,,activeDevelopment,,,,,,,,,classifiers['Development Status'],,Development Status,release_status,,,,,,,, codemeta:SoftwareSourceCode,embargoDate,Date,"Software may be embargoed from public access until a specified date (e.g. pending publication, 1 year from publication)",embargoDate,,,,,embargo_date,,,,,,,,,,,,,,,,, -codemeta:SoftwareSourceCode,funding,Text,Funding source (e.g. specific grant),funding,,fundingReference.awardTitle or fundingReference.awardNumber,,,,,,,,,,,,,,,,,,,, -codemeta:SoftwareSourceCode,issueTracker,URL,link to software bug reporting or issue tracking system,issueTracker,,,,issues_url,,,,,BugReports,,,,,resources.bugtracker,bugs,issuesManagement,Problems,,,bug-database,bug tracking system,repository +codemeta:SoftwareSourceCode,funding,Text,Funding source (e.g. specific grant),funding,awardNumber ,fundingReference.awardTitle or fundingReference.awardNumber,,,,,,,,,,,,,,,,,,,, +codemeta:SoftwareSourceCode,issueTracker,URL,link to software bug reporting or issue tracking system,issueTracker,,,,issues_url,,,,,BugReports,,,,,resources.bugtracker,bugs,issueManagement,Problems,,,bug-database,bug tracking system,repository codemeta:SoftwareSourceCode,referencePublication,ScholarlyArticle,An academic publication related to the software.,relatedPublications,,,,,,,,,,,,,,,,,,,,blog,,references codemeta:SoftwareSourceCode,readme,URL,link to software Readme file,readme,,,,,,,,,,,,,,,,,,,,,, ,,,,relatedIdentifer,,,,,,,,,,,,,,,,,,,,,, ,,,,relatedIdentiferType,,,,,,,,,,,,,,,,,,,,,, ,,,,relationshipType,,,,,,,,,,,,,,,,,,,,,, ,,,,title,,,,,,,,,,,,,,,,,,,,,, ,,,,namespace,,,,,,,,,,,,,,,,,,,,,, ,,,,role,,,,,,,,,,,,,,,,,,,,,, ,,,,roleCode,,,,,,,,,,,,,,,,,,,,,, ,,,,softwarePaperCitationIdenifiers,,,,,,,,,,,,,,,,,,,,,, diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index e81466c..8b207a6 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,378 +1,379 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from typing import ( Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar, ) from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.origin_head import OriginHeadIndexer from 
swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow, ) from swh.model import hashutil from swh.model.model import Revision, Sha1Git REVISION_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 T1 = TypeVar("T1") T2 = TypeVar("T2") def call_with_batches( f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int, ) -> Iterator[T2]: """Calls a function with batches of args, and concatenates the results. """ groups = grouper(args, batch_size) for group in groups: yield from f(list(group)) class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing( ({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) ) def index( self, id: Sha1, data: Optional[bytes] = None, log_suffix="unknown revision", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. Args: - id (bytes): content's identifier - data (bytes): raw content in bytes + id: content's identifier + data: raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the metadata keys will be returned as None """ assert isinstance(id, bytes) assert data is not None + metadata = None try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) metadata = MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) if metadata is None: return [] return [ ContentMetadataRow( id=id, indexer_configuration_id=self.tool["id"], metadata=metadata, ) ] def persist_index_computations( self, results: List[ContentMetadataRow] ) -> Dict[str, int]: """Persist the results in storage. Args: results: list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - metadata (jsonb): detected metadata """ return self.idx_storage.content_metadata_add(results) DEFAULT_CONFIG: Dict[str, Any] = { "tools": { "name": "swh-metadata-detector", "version": "0.0.2", "configuration": {}, }, } class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_intrinsic_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = merge_configs(DEFAULT_CONFIG, self.config) def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. 
""" yield from self.idx_storage.revision_intrinsic_metadata_missing( ( {"id": sha1_git, "indexer_configuration_id": self.tool["id"],} for sha1_git in sha1_gits ) ) def index( self, id: Sha1Git, data: Optional[Revision], **kwargs ) -> List[RevisionIntrinsicMetadataRow]: """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: id: sha1_git of the revision data: revision model object from storage Returns: dict: dictionary representing a revision_intrinsic_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ rev = data assert isinstance(rev, Revision) try: root_dir = rev.directory dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) if [entry["type"] for entry in dir_ls] == ["dir"]: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_ls[0]["target"] dir_ls = list(self.storage.directory_ls(subdir, recursive=False)) files = [entry for entry in dir_ls if entry["type"] == "file"] detected_files = detect_metadata(files) (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id), ) except Exception as e: self.log.exception("Problem when indexing rev: %r", e) return [ RevisionIntrinsicMetadataRow( id=rev.id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[RevisionIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage. Args: results: list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ # TODO: add functions in storage to keep data in # revision_intrinsic_metadata return self.idx_storage.revision_intrinsic_metadata_add(results) def translate_revision_intrinsic_metadata( self, detected_files: Dict[str, List[Any]], log_suffix: str ) -> Tuple[List[Any], Any]: """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files: dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ used_mappings = [MAPPINGS[context].name for context in detected_files] metadata = [] tool = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {}, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} config["tools"] = [tool] for context in detected_files.keys(): cfg = deepcopy(config) cfg["tools"][0]["configuration"]["context"] = context c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context] ) for c in metadata_generator: # extracting metadata sha1 = c.id sha1s_in_storage.append(sha1) local_metadata = c.metadata # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files[context] if item not in sha1s_in_storage ] if 
sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result.metadata metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer( OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) def index_list( self, origin_urls: List[str], **kwargs ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]: head_rev_ids = [] origins_with_head = [] origins = list( call_with_batches( self.storage.origin_get, origin_urls, ORIGIN_GET_BATCH_SIZE, ) ) for origin in origins: if origin is None: continue head_results = self.origin_head_indexer.index(origin.url) if head_results: (head_result,) = head_results origins_with_head.append(origin) head_rev_ids.append(head_result["revision_id"]) head_revs = list( call_with_batches( self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE ) ) assert len(head_revs) == len(head_rev_ids) results = [] for (origin, rev) in zip(origins_with_head, head_revs): if not rev: self.log.warning("Missing head revision of origin %r", origin.url) continue for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev): # There is at most one rev_metadata orig_metadata = OriginIntrinsicMetadataRow( from_revision=rev_metadata.id, id=origin.url, metadata=rev_metadata.metadata, mappings=rev_metadata.mappings, indexer_configuration_id=rev_metadata.indexer_configuration_id, ) results.append((orig_metadata, rev_metadata)) return results def persist_index_computations( self, results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]], ) -> Dict[str, int]: # Deduplicate revisions rev_metadata: List[RevisionIntrinsicMetadataRow] = [] orig_metadata: List[OriginIntrinsicMetadataRow] = [] summary: Dict = {} for (orig_item, rev_item) in results: assert rev_item.metadata == orig_item.metadata if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets if rev_item not in rev_metadata: rev_metadata.append(rev_item) if orig_item not in orig_metadata: orig_metadata.append(orig_item) if rev_metadata: summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata) summary.update(summary_rev) if orig_metadata: summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata) summary.update(summary_ori) return summary diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py index a0668df..81df82a 100644 --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -1,39 +1,40 @@ import collections import click -from . import codemeta, maven, npm, python, ruby +from . 
import cff, codemeta, maven, npm, python, ruby MAPPINGS = { "CodemetaMapping": codemeta.CodemetaMapping, "MavenMapping": maven.MavenMapping, "NpmMapping": npm.NpmMapping, "PythonPkginfoMapping": python.PythonPkginfoMapping, "GemspecMapping": ruby.GemspecMapping, + "CffMapping": cff.CffMapping, } def list_terms(): """Returns a dictionary with all supported CodeMeta terms as keys, and the mappings that support each of them as values.""" d = collections.defaultdict(set) for mapping in MAPPINGS.values(): for term in mapping.supported_terms(): d[term].add(mapping) return d @click.command() @click.argument("mapping_name") @click.argument("file_name") def main(mapping_name, file_name): from pprint import pprint with open(file_name, "rb") as fd: file_content = fd.read() res = MAPPINGS[mapping_name]().translate(file_content) pprint(res) if __name__ == "__main__": main() diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py index 0ddd975..0a2becf 100644 --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -1,177 +1,177 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging from typing import List from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values class BaseMapping: """Base class for mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ def __init__(self, log_suffix=""): self.log_suffix = log_suffix self.log = logging.getLogger( "%s.%s" % (self.__class__.__module__, self.__class__.__name__) ) @property def name(self): """A name of this mapping, used as an identifier in the indexer storage.""" raise NotImplementedError(f"{self.__class__.__name__}.name") @classmethod def detect_metadata_files(cls, files): """ Detects files potentially containing metadata Args: file_entries (list): list of files Returns: list: list of sha1 (possibly empty) """ raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") def translate(self, file_content): raise NotImplementedError(f"{self.__class__.__name__}.translate") def normalize_translation(self, metadata): return compact(metadata) class SingleFileMapping(BaseMapping): """Base class for all mappings that use a single file as input.""" @property def filename(self): """The .json file to extract metadata from.""" raise NotImplementedError(f"{self.__class__.__name__}.filename") @classmethod def detect_metadata_files(cls, file_entries): for entry in file_entries: - if entry["name"] == cls.filename: + if entry["name"].lower() == cls.filename.lower(): return [entry["sha1"]] return [] class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly a key-value store (eg. 
a shallow JSON dict).""" string_fields = [] # type: List[str] """List of fields that are simple strings, and don't need any normalization.""" @property def mapping(self): """A translation dict to map dict keys into a canonical name.""" raise NotImplementedError(f"{self.__class__.__name__}.mapping") @staticmethod def _normalize_method_name(name): return name.replace("-", "_") @classmethod def supported_terms(cls): return { term for (key, term) in cls.mapping.items() if key in cls.string_fields or hasattr(cls, "translate_" + cls._normalize_method_name(key)) or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) } def _translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object and translating with the appropriate mapping Args: content_dict (dict): content dict to translate Returns: dict: translated metadata in json-friendly form needed for the indexer """ translated_metadata = {"@type": SCHEMA_URI + "SoftwareSourceCode"} for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key translation_method = getattr( self, "translate_" + self._normalize_method_name(k), None ) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: # if there is no method, but the key is known from the # crosswalk table codemeta_key = self.mapping[k] # if there is a normalization method, use it on the value normalization_method = getattr( self, "normalize_" + self._normalize_method_name(k), None ) if normalization_method: v = normalization_method(v) elif k in self.string_fields and isinstance(v, str): pass elif k in self.string_fields and isinstance(v, list): v = [x for x in v if isinstance(x, str)] else: continue # set the translation metadata with the normalized value if codemeta_key in translated_metadata: translated_metadata[codemeta_key] = merge_values( translated_metadata[codemeta_key], v ) else: translated_metadata[codemeta_key] = v if normalize: return self.normalize_translation(translated_metadata) else: return translated_metadata class JsonMapping(DictMapping, SingleFileMapping): """Base class for all mappings that use a JSON file as input.""" def translate(self, raw_content): """ Translates content by parsing content from a bytestring containing json data and translating with the appropriate mapping Args: raw_content (bytes): raw content to translate Returns: dict: translated metadata in json-friendly form needed for the indexer """ try: raw_content = raw_content.decode() except UnicodeDecodeError: self.log.warning("Error unidecoding from %s", self.log_suffix) return try: content_dict = json.loads(raw_content) except json.JSONDecodeError: self.log.warning("Error unjsoning from %s", self.log_suffix) return if isinstance(content_dict, dict): return self._translate_dict(content_dict) diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py new file mode 100644 index 0000000..43f944d --- /dev/null +++ b/swh/indexer/metadata_dictionary/cff.py @@ -0,0 +1,65 @@ +import yaml + +from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CROSSWALK_TABLE, SCHEMA_URI + +from .base import DictMapping, SingleFileMapping + +yaml.SafeLoader.yaml_implicit_resolvers = { + k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"] + for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items() +} + + +class CffMapping(DictMapping, SingleFileMapping): + """Dedicated class for Citation (CITATION.cff) mapping and translation""" + + name = "cff" + filename 
= b"CITATION.cff" + mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"] + string_fields = ["keywords", "license", "abstract", "version", "doi"] + + def translate(self, raw_content): + raw_content = raw_content.decode() + content_dict = yaml.load(raw_content, Loader=yaml.SafeLoader) + metadata = self._translate_dict(content_dict) + + metadata["@context"] = CODEMETA_CONTEXT_URL + + return metadata + + def normalize_authors(self, d): + result = [] + for author in d: + author_data = {"@type": SCHEMA_URI + "Person"} + if "orcid" in author: + author_data["@id"] = author["orcid"] + if "affiliation" in author: + author_data[SCHEMA_URI + "affiliation"] = { + "@type": SCHEMA_URI + "Organization", + SCHEMA_URI + "name": author["affiliation"], + } + if "family-names" in author: + author_data[SCHEMA_URI + "familyName"] = author["family-names"] + if "given-names" in author: + author_data[SCHEMA_URI + "givenName"] = author["given-names"] + + result.append(author_data) + + result = {"@list": result} + return result + + def normalize_doi(self, s): + if isinstance(s, str): + return {"@id": "https://doi.org/" + s} + + def normalize_license(self, s): + if isinstance(s, str): + return {"@id": "https://spdx.org/licenses/" + s} + + def normalize_repository_code(self, s): + if isinstance(s, str): + return {"@id": s} + + def normalize_date_released(self, s): + if isinstance(s, str): + return {"@value": s, "@type": SCHEMA_URI + "Date"} diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py index d725f5f..f5f6d9b 100644 --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -1,158 +1,161 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import xml.parsers.expat import xmltodict from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from .base import DictMapping, SingleFileMapping class MavenMapping(DictMapping, SingleFileMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ name = "maven" filename = b"pom.xml" mapping = CROSSWALK_TABLE["Java (Maven)"] string_fields = ["name", "version", "description", "email"] def translate(self, content): try: d = xmltodict.parse(content).get("project") or {} except xml.parsers.expat.ExpatError: self.log.warning("Error parsing XML from %s", self.log_suffix) return None except UnicodeDecodeError: self.log.warning("Error unidecoding XML from %s", self.log_suffix) return None except (LookupError, ValueError): # unknown encoding or multi-byte encoding self.log.warning("Error detecting XML encoding from %s", self.log_suffix) return None + if not isinstance(d, dict): + self.log.warning("Skipping ill-formed XML content: %s", content) + return None metadata = self._translate_dict(d, normalize=False) metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d) metadata[SCHEMA_URI + "license"] = self.parse_licenses(d) return self.normalize_translation(metadata) _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} def parse_repositories(self, d): """https://maven.apache.org/pom.html#Repositories >>> import xmltodict >>> from pprint import pprint >>> d = xmltodict.parse(''' ... ... ... codehausSnapshots ... Codehaus Snapshots ... 
http://snapshots.maven.codehaus.org/maven2 ... default ... ... ... ''') >>> MavenMapping().parse_repositories(d) """ repositories = d.get("repositories") if not repositories: results = [self.parse_repository(d, self._default_repository)] elif isinstance(repositories, dict): repositories = repositories.get("repository") or [] if not isinstance(repositories, list): repositories = [repositories] results = [self.parse_repository(d, repo) for repo in repositories] else: results = [] return [res for res in results if res] or None def parse_repository(self, d, repo): if not isinstance(repo, dict): return if repo.get("layout", "default") != "default": return # TODO ? url = repo.get("url") group_id = d.get("groupId") artifact_id = d.get("artifactId") if ( isinstance(url, str) and isinstance(group_id, str) and isinstance(artifact_id, str) ): repo = os.path.join(url, *group_id.split("."), artifact_id) return {"@id": repo} def normalize_groupId(self, id_): """https://maven.apache.org/pom.html#Maven_Coordinates >>> MavenMapping().normalize_groupId('org.example') {'@id': 'org.example'} """ if isinstance(id_, str): return {"@id": id_} def parse_licenses(self, d): """https://maven.apache.org/pom.html#Licenses >>> import xmltodict >>> import json >>> d = xmltodict.parse(''' ... ... ... Apache License, Version 2.0 ... https://www.apache.org/licenses/LICENSE-2.0.txt ... ... ... ''') >>> print(json.dumps(d, indent=4)) { "licenses": { "license": { "name": "Apache License, Version 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" } } } >>> MavenMapping().parse_licenses(d) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] or, if there are more than one license: >>> import xmltodict >>> from pprint import pprint >>> d = xmltodict.parse(''' ... ... ... Apache License, Version 2.0 ... https://www.apache.org/licenses/LICENSE-2.0.txt ... ... ... MIT License ... https://opensource.org/licenses/MIT ... ... ... ''') >>> pprint(MavenMapping().parse_licenses(d)) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, {'@id': 'https://opensource.org/licenses/MIT'}] """ licenses = d.get("licenses") if not isinstance(licenses, dict): return licenses = licenses.get("license") if isinstance(licenses, dict): licenses = [licenses] elif not isinstance(licenses, list): return return [ {"@id": license["url"]} for license in licenses if isinstance(license, dict) and isinstance(license.get("url"), str) ] or None diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 8d902ce..df4d5ae 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,723 +1,723 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter from importlib import import_module import json from typing import Dict, Iterable, List, Optional, Tuple, Union import warnings import psycopg2 import psycopg2.pool from swh.core.db.common import db_transaction from swh.indexer.storage.interface import IndexerStorageInterface from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import SHA1_SIZE from swh.storage.exc import StorageDBError from swh.storage.utils import get_partition_bounds_bytes from . 
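# --- Small sketch (illustrative values) of the path composition performed by
# parse_repository() above: on a POSIX system the repository URL, the
# dot-separated groupId and the artifactId are joined into the
# codeRepository @id.
import os

url = "https://repo.maven.apache.org/maven2/"  # _default_repository["url"]
group_id = "org.example"
artifact_id = "demo"
repo_id = os.path.join(url, *group_id.split("."), artifact_id)
assert repo_id == "https://repo.maven.apache.org/maven2/org/example/demo"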
import converters from .db import Db from .exc import DuplicateId, IndexerStorageArgumentException from .interface import PagedResult, Sha1 from .metrics import process_metrics, send_metric, timed from .model import ( ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow, ) from .writer import JournalWriter INDEXER_CFG_KEY = "indexer_storage" -MAPPING_NAMES = ["codemeta", "gemspec", "maven", "npm", "pkg-info"] +MAPPING_NAMES = ["cff", "codemeta", "gemspec", "maven", "npm", "pkg-info"] SERVER_IMPLEMENTATIONS: Dict[str, str] = { "local": ".IndexerStorage", "remote": ".api.client.RemoteStorage", "memory": ".in_memory.IndexerStorage", } def get_indexer_storage(cls: str, **kwargs) -> IndexerStorageInterface: """Instantiate an indexer storage implementation of class `cls` with arguments `kwargs`. Args: cls: indexer storage class (local, remote or memory) kwargs: dictionary of arguments passed to the indexer storage class constructor Returns: an instance of swh.indexer.storage Raises: ValueError if passed an unknown storage class. """ if "args" in kwargs: warnings.warn( 'Explicit "args" key is deprecated, use keys directly instead.', DeprecationWarning, ) kwargs = kwargs["args"] class_path = SERVER_IMPLEMENTATIONS.get(cls) if class_path is None: raise ValueError( f"Unknown indexer storage class `{cls}`. " f"Supported: {', '.join(SERVER_IMPLEMENTATIONS)}" ) (module_path, class_name) = class_path.rsplit(".", 1) module = import_module(module_path if module_path else ".", package=__package__) BackendClass = getattr(module, class_name) check_config = kwargs.pop("check_config", {}) idx_storage = BackendClass(**kwargs) if check_config: if not idx_storage.check_config(**check_config): raise EnvironmentError("Indexer storage check config failed") return idx_storage def check_id_duplicates(data): """ If any two row models in `data` have the same unique key, raises a `ValueError`. Values associated to the key must be hashable. Args: data (List[dict]): List of dictionaries to be inserted >>> check_id_duplicates([ ... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"), ... ContentLanguageRow(id=b'foo', indexer_configuration_id=32, lang="python"), ... ]) >>> check_id_duplicates([ ... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"), ... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"), ... ]) Traceback (most recent call last): ... 
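# --- Hypothetical configuration sketch for get_indexer_storage() above:
# "memory" resolves to .in_memory.IndexerStorage and needs no extra arguments,
# while a "local" (postgresql-backed) instance takes the libpq connection
# string consumed by IndexerStorage.__init__ further down. The connection
# string shown in the comment is only an example value.
from swh.indexer.storage import get_indexer_storage

idx_storage = get_indexer_storage(cls="memory")
# idx_storage = get_indexer_storage(cls="local", db="dbname=indexer-dev")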
swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'indexer_configuration_id': 42}] """ # noqa counter = Counter(tuple(sorted(item.unique_key().items())) for item in data) duplicates = [id_ for (id_, count) in counter.items() if count >= 2] if duplicates: raise DuplicateId(list(map(dict, duplicates))) class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None): """ Args: db: either a libpq connection string, or a psycopg2 connection journal_writer: configuration passed to `swh.journal.writer.get_journal_writer` """ self.journal_writer = JournalWriter(self._tool_get_from_id, journal_writer) try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) def put_db(self, db): if db is not self._db: db.put_conn() @timed @db_transaction() def check_config(self, *, check_write, db=None, cur=None): # Check permissions on one of the tables if check_write: check = "INSERT" else: check = "SELECT" cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,), ) return cur.fetchone()[0] @timed @db_transaction() def content_mimetype_missing( self, mimetypes: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_mimetype_missing_from_list(mimetypes, cur)] @timed @db_transaction() def get_partition( self, indexer_type: str, indexer_configuration_id: int, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, with_textual_data=False, db=None, cur=None, ) -> PagedResult[Sha1]: """Retrieve ids of content with `indexer_type` within within partition partition_id bound by limit. Args: **indexer_type**: Type of data content to index (mimetype, language, etc...) **indexer_configuration_id**: The tool used to index data **partition_id**: index of the partition to fetch **nb_partitions**: total number of partitions to split into **page_token**: opaque token used for pagination **limit**: Limit result (default to 1000) **with_textual_data** (bool): Deal with only textual content (True) or all content (all contents by defaults, False) Raises: IndexerStorageArgumentException for; - limit to None - wrong indexer_type provided Returns: PagedResult of Sha1. If next_page_token is None, there is no more data to fetch """ if limit is None: raise IndexerStorageArgumentException("limit should not be None") if indexer_type not in db.content_indexer_names: err = f"Wrong type. 
Should be one of [{','.join(db.content_indexer_names)}]" raise IndexerStorageArgumentException(err) start, end = get_partition_bounds_bytes(partition_id, nb_partitions, SHA1_SIZE) if page_token is not None: start = hash_to_bytes(page_token) if end is None: end = b"\xff" * SHA1_SIZE next_page_token: Optional[str] = None ids = [ row[0] for row in db.content_get_range( indexer_type, start, end, indexer_configuration_id, limit=limit + 1, with_textual_data=with_textual_data, cur=cur, ) ] if len(ids) >= limit: next_page_token = hash_to_hex(ids[-1]) ids = ids[:limit] assert len(ids) <= limit return PagedResult(results=ids, next_page_token=next_page_token) @timed @db_transaction() def content_mimetype_get_partition( self, indexer_configuration_id: int, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, db=None, cur=None, ) -> PagedResult[Sha1]: return self.get_partition( "mimetype", indexer_configuration_id, partition_id, nb_partitions, page_token=page_token, limit=limit, db=db, cur=cur, ) @timed @process_metrics @db_transaction() def content_mimetype_add( self, mimetypes: List[ContentMimetypeRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_mimetype", mimetypes) db.mktemp_content_mimetype(cur) db.copy_to( [m.to_dict() for m in mimetypes], "tmp_content_mimetype", ["id", "mimetype", "encoding", "indexer_configuration_id"], cur, ) count = db.content_mimetype_add_from_temp(cur) return {"content_mimetype:add": count} @timed @db_transaction() def content_mimetype_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentMimetypeRow]: return [ ContentMimetypeRow.from_dict( converters.db_to_mimetype(dict(zip(db.content_mimetype_cols, c))) ) for c in db.content_mimetype_get_from_list(ids, cur) ] @timed @db_transaction() def content_language_missing( self, languages: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_language_missing_from_list(languages, cur)] @timed @db_transaction() def content_language_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentLanguageRow]: return [ ContentLanguageRow.from_dict( converters.db_to_language(dict(zip(db.content_language_cols, c))) ) for c in db.content_language_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def content_language_add( self, languages: List[ContentLanguageRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(languages) languages.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_language", languages) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ( { "id": lang.id, "lang": lang.lang or "unknown", "indexer_configuration_id": lang.indexer_configuration_id, } for lang in languages ), "tmp_content_language", ["id", "lang", "indexer_configuration_id"], cur, ) count = db.content_language_add_from_temp(cur) return {"content_language:add": count} @timed @db_transaction() def content_ctags_missing( self, ctags: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_ctags_missing_from_list(ctags, cur)] @timed @db_transaction() def content_ctags_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentCtagsRow]: return [ ContentCtagsRow.from_dict( converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) ) for c in db.content_ctags_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() 
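# --- Client-side sketch (not from this patch) of consuming the partition
# endpoints defined above: page_token is opaque (the hex sha1 of the last id
# of the previous page) and iteration stops once next_page_token is None.
def iter_partition_ids(idx_storage, tool_id, partition_id, nb_partitions):
    page_token = None
    while True:
        page = idx_storage.content_mimetype_get_partition(
            tool_id,
            partition_id,
            nb_partitions,
            page_token=page_token,
            limit=1000,
        )
        yield from page.results
        page_token = page.next_page_token
        if page_token is None:
            break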
def content_ctags_add( self, ctags: List[ContentCtagsRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(ctags) ctags.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_ctags", ctags) db.mktemp_content_ctags(cur) db.copy_to( [ctag.to_dict() for ctag in ctags], tblname="tmp_content_ctags", columns=["id", "name", "kind", "line", "lang", "indexer_configuration_id"], cur=cur, ) count = db.content_ctags_add_from_temp(cur) return {"content_ctags:add": count} @timed @db_transaction() def content_ctags_search( self, expression: str, limit: int = 10, last_sha1: Optional[Sha1] = None, db=None, cur=None, ) -> List[ContentCtagsRow]: return [ ContentCtagsRow.from_dict( converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) ) for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur) ] @timed @db_transaction() def content_fossology_license_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentLicenseRow]: return [ ContentLicenseRow.from_dict( converters.db_to_fossology_license( dict(zip(db.content_fossology_license_cols, c)) ) ) for c in db.content_fossology_license_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def content_fossology_license_add( self, licenses: List[ContentLicenseRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(licenses) licenses.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_fossology_license", licenses) db.mktemp_content_fossology_license(cur) db.copy_to( [license.to_dict() for license in licenses], tblname="tmp_content_fossology_license", columns=["id", "license", "indexer_configuration_id"], cur=cur, ) count = db.content_fossology_license_add_from_temp(cur) return {"content_fossology_license:add": count} @timed @db_transaction() def content_fossology_license_get_partition( self, indexer_configuration_id: int, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, db=None, cur=None, ) -> PagedResult[Sha1]: return self.get_partition( "fossology_license", indexer_configuration_id, partition_id, nb_partitions, page_token=page_token, limit=limit, with_textual_data=True, db=db, cur=cur, ) @timed @db_transaction() def content_metadata_missing( self, metadata: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_metadata_missing_from_list(metadata, cur)] @timed @db_transaction() def content_metadata_get( self, ids: Iterable[Sha1], db=None, cur=None ) -> List[ContentMetadataRow]: return [ ContentMetadataRow.from_dict( converters.db_to_metadata(dict(zip(db.content_metadata_cols, c))) ) for c in db.content_metadata_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def content_metadata_add( self, metadata: List[ContentMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) self.journal_writer.write_additions("content_metadata", metadata) db.mktemp_content_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], "tmp_content_metadata", ["id", "metadata", "indexer_configuration_id"], cur, ) count = db.content_metadata_add_from_temp(cur) return { "content_metadata:add": count, } @timed @db_transaction() def revision_intrinsic_metadata_missing( self, metadata: Iterable[Dict], db=None, cur=None ) -> List[Tuple[Sha1, int]]: return [ obj[0] for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur) ] @timed @db_transaction() def revision_intrinsic_metadata_get( self, ids: Iterable[Sha1], 
db=None, cur=None ) -> List[RevisionIntrinsicMetadataRow]: return [ RevisionIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.revision_intrinsic_metadata_cols, c)) ) ) for c in db.revision_intrinsic_metadata_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() def revision_intrinsic_metadata_add( self, metadata: List[RevisionIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) self.journal_writer.write_additions("revision_intrinsic_metadata", metadata) db.mktemp_revision_intrinsic_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], "tmp_revision_intrinsic_metadata", ["id", "metadata", "mappings", "indexer_configuration_id"], cur, ) count = db.revision_intrinsic_metadata_add_from_temp(cur) return { "revision_intrinsic_metadata:add": count, } @timed @db_transaction() def origin_intrinsic_metadata_get( self, urls: Iterable[str], db=None, cur=None ) -> List[OriginIntrinsicMetadataRow]: return [ OriginIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c)) ) ) for c in db.origin_intrinsic_metadata_get_from_list(urls, cur) ] @timed @process_metrics @db_transaction() def origin_intrinsic_metadata_add( self, metadata: List[OriginIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) self.journal_writer.write_additions("origin_intrinsic_metadata", metadata) db.mktemp_origin_intrinsic_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], "tmp_origin_intrinsic_metadata", ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"], cur, ) count = db.origin_intrinsic_metadata_add_from_temp(cur) return { "origin_intrinsic_metadata:add": count, } @timed @db_transaction() def origin_intrinsic_metadata_search_fulltext( self, conjunction: List[str], limit: int = 100, db=None, cur=None ) -> List[OriginIntrinsicMetadataRow]: return [ OriginIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c)) ) ) for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur ) ] @timed @db_transaction() def origin_intrinsic_metadata_search_by_producer( self, page_token: str = "", limit: int = 100, ids_only: bool = False, mappings: Optional[List[str]] = None, tool_ids: Optional[List[int]] = None, db=None, cur=None, ) -> PagedResult[Union[str, OriginIntrinsicMetadataRow]]: assert isinstance(page_token, str) # we go to limit+1 to check whether we should add next_page_token in # the response rows = db.origin_intrinsic_metadata_search_by_producer( page_token, limit + 1, ids_only, mappings, tool_ids, cur ) next_page_token = None if ids_only: results = [origin for (origin,) in rows] if len(results) > limit: results[limit:] = [] next_page_token = results[-1] else: results = [ OriginIntrinsicMetadataRow.from_dict( converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, row)) ) ) for row in rows ] if len(results) > limit: results[limit:] = [] next_page_token = results[-1].id return PagedResult(results=results, next_page_token=next_page_token,) @timed @db_transaction() def origin_intrinsic_metadata_stats(self, db=None, cur=None): mapping_names = [m for m in MAPPING_NAMES] select_parts = [] # Count rows for each mapping for mapping_name in mapping_names: select_parts.append( ( "sum(case when (mappings @> ARRAY['%s']) " " then 1 else 0 end)" ) % mapping_name ) # Total 
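# --- For reference, a sketch of the statement origin_intrinsic_metadata_stats
# assembles (with the updated MAPPING_NAMES, including the new "cff" entry):
# one per-mapping term counting rows whose `mappings` array contains that
# name, "sum(1)" for the total, and a last term counting rows whose metadata
# holds more than the bare '@context' key:
#
#   select
#     sum(case when (mappings @> ARRAY['cff']) then 1 else 0 end),
#     ...,
#     sum(case when (mappings @> ARRAY['pkg-info']) then 1 else 0 end),
#     sum(1),
#     sum(case when ('{}'::jsonb @> (metadata - '@context')) then 0 else 1 end)
#   from origin_intrinsic_metadata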
select_parts.append("sum(1)") # Rows whose metadata has at least one key that is not '@context' select_parts.append( "sum(case when ('{}'::jsonb @> (metadata - '@context')) " " then 0 else 1 end)" ) cur.execute( "select " + ", ".join(select_parts) + " from origin_intrinsic_metadata" ) results = dict(zip(mapping_names + ["total", "non_empty"], cur.fetchone())) return { "total": results.pop("total"), "non_empty": results.pop("non_empty"), "per_mapping": results, } @timed @db_transaction() def indexer_configuration_add(self, tools, db=None, cur=None): db.mktemp_indexer_configuration(cur) db.copy_to( tools, "tmp_indexer_configuration", ["tool_name", "tool_version", "tool_configuration"], cur, ) tools = db.indexer_configuration_add_from_temp(cur) results = [dict(zip(db.indexer_configuration_cols, line)) for line in tools] send_metric( "indexer_configuration:add", len(results), method_name="indexer_configuration_add", ) return results @timed @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): tool_conf = tool["tool_configuration"] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get( tool["tool_name"], tool["tool_version"], tool_conf ) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) @db_transaction() def _tool_get_from_id(self, id_, db, cur): tool = dict( zip( db.indexer_configuration_cols, db.indexer_configuration_get_from_id(id_, cur), ) ) return { "id": tool["id"], "name": tool["tool_name"], "version": tool["tool_version"], "configuration": tool["tool_configuration"], } diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py index 270812d..5ae6aeb 100644 --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -1,127 +1,127 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta import os from os import path from typing import List, Tuple from unittest.mock import patch import pytest import yaml from swh.core.db.pytest_plugin import postgresql_fact import swh.indexer from swh.indexer.storage import get_indexer_storage from swh.objstorage.factory import get_objstorage from swh.storage import get_storage from .utils import fill_obj_storage, fill_storage TASK_NAMES: List[Tuple[str, str]] = [ # (scheduler-task-type, task-class-test-name) ("index-revision-metadata", "revision_intrinsic_metadata"), ("index-origin-metadata", "origin_intrinsic_metadata"), ] SQL_FILES = path.join(path.dirname(swh.indexer.__file__), "sql", "*.sql") idx_storage_postgresql = postgresql_fact( - "postgresql_proc", db_name="indexer_storage", dump_files=SQL_FILES, + "postgresql_proc", dbname="indexer_storage", dump_files=SQL_FILES, ) @pytest.fixture def indexer_scheduler(swh_scheduler): # Insert the expected task types within the scheduler for task_name, task_class_name in TASK_NAMES: swh_scheduler.create_task_type( { "type": task_name, "description": f"The {task_class_name} indexer testing task", "backend_name": f"swh.indexer.tests.tasks.{task_class_name}", "default_interval": timedelta(days=1), "min_interval": timedelta(hours=6), "max_interval": timedelta(days=12), "num_retries": 3, } ) return swh_scheduler @pytest.fixture def idx_storage_backend_config(idx_storage_postgresql): """Basic pg storage configuration with no journal collaborator for the indexer 
storage (to avoid pulling optional dependency on clients of this fixture) """ return { "cls": "local", "db": idx_storage_postgresql.dsn, } @pytest.fixture def swh_indexer_config( swh_storage_backend_config, idx_storage_backend_config, swh_scheduler_config ): return { "storage": swh_storage_backend_config, "objstorage": {"cls": "memory"}, "indexer_storage": idx_storage_backend_config, "scheduler": {"cls": "local", **swh_scheduler_config}, "tools": { "name": "file", "version": "1:5.30-1+deb9u1", "configuration": {"type": "library", "debian-package": "python3-magic"}, }, "compute_checksums": ["blake2b512"], # for rehash indexer } @pytest.fixture def idx_storage(swh_indexer_config): """An instance of in-memory indexer storage that gets injected into all indexers classes. """ idx_storage_config = swh_indexer_config["indexer_storage"] return get_indexer_storage(**idx_storage_config) @pytest.fixture def storage(swh_indexer_config): """An instance of in-memory storage that gets injected into all indexers classes. """ storage = get_storage(**swh_indexer_config["storage"]) fill_storage(storage) return storage @pytest.fixture def obj_storage(swh_indexer_config): """An instance of in-memory objstorage that gets injected into all indexers classes. """ objstorage = get_objstorage(**swh_indexer_config["objstorage"]) fill_obj_storage(objstorage) with patch.dict( "swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage} ): yield objstorage @pytest.fixture def swh_config(swh_indexer_config, monkeypatch, tmp_path): conffile = os.path.join(str(tmp_path), "indexer.yml") with open(conffile, "w") as f: f.write(yaml.dump(swh_indexer_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) return conffile diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index c17c3e8..3067dc2 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,1722 +1,1723 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import math import threading from typing import Any, Dict, List, Tuple, Type import attr import pytest from swh.indexer.storage.exc import DuplicateId, IndexerStorageArgumentException from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult from swh.indexer.storage.model import ( BaseRow, ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow, ) from swh.model.hashutil import hash_to_bytes def prepare_mimetypes_from_licenses( fossology_licenses: List[ContentLicenseRow], ) -> List[ContentMimetypeRow]: """Fossology license needs some consistent data in db to run. 
""" mimetypes = [] for c in fossology_licenses: mimetypes.append( ContentMimetypeRow( id=c.id, mimetype="text/plain", # for filtering on textual data to work encoding="utf-8", indexer_configuration_id=c.indexer_configuration_id, ) ) return mimetypes def endpoint_name(etype: str, ename: str) -> str: """Compute the storage's endpoint's name >>> endpoint_name('content_mimetype', 'add') 'content_mimetype_add' >>> endpoint_name('content_fosso_license', 'delete') 'content_fosso_license_delete' """ return f"{etype}_{ename}" def endpoint(storage, etype: str, ename: str): return getattr(storage, endpoint_name(etype, ename)) def expected_summary(count: int, etype: str, ename: str = "add") -> Dict[str, int]: """Compute the expected summary The key is determine according to etype and ename >>> expected_summary(10, 'content_mimetype', 'add') {'content_mimetype:add': 10} >>> expected_summary(9, 'origin_intrinsic_metadata', 'delete') {'origin_intrinsic_metadata:del': 9} """ pattern = ename[0:3] key = endpoint_name(etype, ename).replace(f"_{ename}", f":{pattern}") return {key: count} def test_check_config(swh_indexer_storage) -> None: assert swh_indexer_storage.check_config(check_write=True) assert swh_indexer_storage.check_config(check_write=False) class StorageETypeTester: """Base class for testing a series of common behaviour between a bunch of endpoint types supported by an IndexerStorage. This is supposed to be inherited with the following class attributes: - endpoint_type - tool_name - example_data See below for example usage. """ endpoint_type: str tool_name: str example_data: List[Dict] row_class: Type[BaseRow] def test_missing( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool_id = data.tools[self.tool_name]["id"] # given 2 (hopefully) unknown objects query = [ {"id": data.sha1_1, "indexer_configuration_id": tool_id,}, {"id": data.sha1_2, "indexer_configuration_id": tool_id,}, ] # we expect these are both returned by the xxx_missing endpoint actual_missing = endpoint(storage, etype, "missing")(query) assert list(actual_missing) == [ data.sha1_1, data.sha1_2, ] # now, when we add one of them summary = endpoint(storage, etype, "add")( [ self.row_class.from_dict( { "id": data.sha1_2, **self.example_data[0], "indexer_configuration_id": tool_id, } ) ] ) assert summary == expected_summary(1, etype) # we expect only the other one returned actual_missing = endpoint(storage, etype, "missing")(query) assert list(actual_missing) == [data.sha1_1] def test_add__update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] data_v1 = { "id": data.sha1_2, **self.example_data[0], "indexer_configuration_id": tool["id"], } # given summary = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v1)]) assert summary == expected_summary(1, etype) # not added # when actual_data = list(endpoint(storage, etype, "get")([data.sha1_2])) expected_data_v1 = [ self.row_class.from_dict( {"id": data.sha1_2, **self.example_data[0], "tool": tool} ) ] # then assert actual_data == expected_data_v1 # given data_v2 = data_v1.copy() data_v2.update(self.example_data[1]) endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)]) assert summary == expected_summary(1, etype) # modified so counted actual_data = list(endpoint(storage, etype, 
"get")([data.sha1_2])) expected_data_v2 = [ self.row_class.from_dict( {"id": data.sha1_2, **self.example_data[1], "tool": tool,} ) ] # data did change as the v2 was used to overwrite v1 assert actual_data == expected_data_v2 def test_add_deadlock( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] hashes = [ hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}".format(i)) for i in range(1000) ] data_v1 = [ self.row_class.from_dict( { "id": hash_, **self.example_data[0], "indexer_configuration_id": tool["id"], } ) for hash_ in hashes ] data_v2 = [ self.row_class.from_dict( { "id": hash_, **self.example_data[1], "indexer_configuration_id": tool["id"], } ) for hash_ in hashes ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given endpoint(storage, etype, "add")(data_v1) # when actual_data = sorted( endpoint(storage, etype, "get")(hashes), key=lambda x: x.id, ) expected_data_v1 = [ self.row_class.from_dict( {"id": hash_, **self.example_data[0], "tool": tool} ) for hash_ in hashes ] # then assert actual_data == expected_data_v1 # given def f1() -> None: endpoint(storage, etype, "add")(data_v2a) def f2() -> None: endpoint(storage, etype, "add")(data_v2b) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = sorted( endpoint(storage, etype, "get")(hashes), key=lambda x: x.id, ) expected_data_v2 = [ self.row_class.from_dict( {"id": hash_, **self.example_data[1], "tool": tool} ) for hash_ in hashes ] assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2) for (item, expected_item_v1, expected_item_v2) in zip( actual_data, expected_data_v1, expected_data_v2 ): assert item in (expected_item_v1, expected_item_v2) def test_add__duplicate_twice( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] data_rev1 = self.row_class.from_dict( { "id": data.revision_id_2, **self.example_data[0], "indexer_configuration_id": tool["id"], } ) data_rev2 = self.row_class.from_dict( { "id": data.revision_id_2, **self.example_data[1], "indexer_configuration_id": tool["id"], } ) # when summary = endpoint(storage, etype, "add")([data_rev1]) assert summary == expected_summary(1, etype) with pytest.raises(DuplicateId): endpoint(storage, etype, "add")([data_rev2, data_rev2]) # then actual_data = list( endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1]) ) expected_data = [ self.row_class.from_dict( {"id": data.revision_id_2, **self.example_data[0], "tool": tool} ) ] assert actual_data == expected_data def test_add( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] # conftest fills it with mimetypes storage.journal_writer.journal.objects = [] # type: ignore query = [data.sha1_2, data.sha1_1] data1 = self.row_class.from_dict( { "id": data.sha1_2, **self.example_data[0], "indexer_configuration_id": tool["id"], } ) # when summary = endpoint(storage, etype, "add")([data1]) assert summary == expected_summary(1, etype) # then actual_data = list(endpoint(storage, etype, "get")(query)) # then 
expected_data = [ self.row_class.from_dict( {"id": data.sha1_2, **self.example_data[0], "tool": tool} ) ] assert actual_data == expected_data journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_data = [ obj for (obj_type, obj) in journal_objects if obj_type == self.endpoint_type ] assert list(sorted(actual_journal_data)) == list(sorted(expected_data)) class TestIndexerStorageContentMimetypes(StorageETypeTester): """Test Indexer Storage content_mimetype related methods """ endpoint_type = "content_mimetype" tool_name = "file" example_data = [ {"mimetype": "text/plain", "encoding": "utf-8",}, {"mimetype": "text/html", "encoding": "us-ascii",}, ] row_class = ContentMimetypeRow def test_generate_content_mimetype_get_partition_failure( self, swh_indexer_storage: IndexerStorageInterface ) -> None: """get_partition call with wrong limit input should fail""" storage = swh_indexer_storage indexer_configuration_id = 42 with pytest.raises( IndexerStorageArgumentException, match="limit should not be None" ): storage.content_mimetype_get_partition( indexer_configuration_id, 0, 3, limit=None # type: ignore ) def test_generate_content_mimetype_get_partition_no_limit( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition should return result""" storage, data = swh_indexer_storage_with_data mimetypes = data.mimetypes expected_ids = set([c.id for c in mimetypes]) indexer_configuration_id = mimetypes[0].indexer_configuration_id assert len(mimetypes) == 16 nb_partitions = 16 actual_ids = [] for partition_id in range(nb_partitions): actual_result = storage.content_mimetype_get_partition( indexer_configuration_id, partition_id, nb_partitions ) assert actual_result.next_page_token is None actual_ids.extend(actual_result.results) assert len(actual_ids) == len(expected_ids) for actual_id in actual_ids: assert actual_id in expected_ids def test_generate_content_mimetype_get_partition_full( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition for a single partition should return available ids """ storage, data = swh_indexer_storage_with_data mimetypes = data.mimetypes expected_ids = set([c.id for c in mimetypes]) indexer_configuration_id = mimetypes[0].indexer_configuration_id actual_result = storage.content_mimetype_get_partition( indexer_configuration_id, 0, 1 ) assert actual_result.next_page_token is None actual_ids = actual_result.results assert len(actual_ids) == len(expected_ids) for actual_id in actual_ids: assert actual_id in expected_ids def test_generate_content_mimetype_get_partition_empty( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition when at least one of the partitions is empty""" storage, data = swh_indexer_storage_with_data mimetypes = data.mimetypes expected_ids = set([c.id for c in mimetypes]) indexer_configuration_id = mimetypes[0].indexer_configuration_id # nb_partitions = smallest power of 2 such that at least one of # the partitions is empty nb_mimetypes = len(mimetypes) nb_partitions = 1 << math.floor(math.log2(nb_mimetypes) + 1) seen_ids = [] for partition_id in range(nb_partitions): actual_result = storage.content_mimetype_get_partition( indexer_configuration_id, partition_id, nb_partitions, limit=nb_mimetypes + 1, ) for actual_id in actual_result.results: seen_ids.append(actual_id) # Limit is higher than the max number of results assert actual_result.next_page_token is None assert set(seen_ids) == expected_ids 
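# --- Worked example of the nb_partitions computation used in the test above,
# for the 16 mimetypes seeded by the fixture: math.log2(16) == 4.0, so
# 1 << math.floor(4.0 + 1) == 32 partitions for 16 ids, which guarantees at
# least one empty partition while every id still lands in exactly one of them.
import math

assert 1 << math.floor(math.log2(16) + 1) == 32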
def test_generate_content_mimetype_get_partition_with_pagination( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition should return ids provided with pagination """ storage, data = swh_indexer_storage_with_data mimetypes = data.mimetypes expected_ids = set([c.id for c in mimetypes]) indexer_configuration_id = mimetypes[0].indexer_configuration_id nb_partitions = 4 actual_ids = [] for partition_id in range(nb_partitions): next_page_token = None while True: actual_result = storage.content_mimetype_get_partition( indexer_configuration_id, partition_id, nb_partitions, limit=2, page_token=next_page_token, ) actual_ids.extend(actual_result.results) next_page_token = actual_result.next_page_token if next_page_token is None: break assert len(set(actual_ids)) == len(set(expected_ids)) for actual_id in actual_ids: assert actual_id in expected_ids class TestIndexerStorageContentLanguage(StorageETypeTester): """Test Indexer Storage content_language related methods """ endpoint_type = "content_language" tool_name = "pygments" example_data = [ {"lang": "haskell",}, {"lang": "common-lisp",}, ] row_class = ContentLanguageRow class TestIndexerStorageContentCTags(StorageETypeTester): """Test Indexer Storage content_ctags related methods """ endpoint_type = "content_ctags" tool_name = "universal-ctags" example_data = [ {"name": "done", "kind": "variable", "line": 119, "lang": "OCaml",}, {"name": "done", "kind": "variable", "line": 100, "lang": "Python",}, {"name": "main", "kind": "function", "line": 119, "lang": "Python",}, ] row_class = ContentCtagsRow # the following tests are disabled because CTAGS behaves differently @pytest.mark.skip def test_add__update_in_place_duplicate(self): pass @pytest.mark.skip def test_add_deadlock(self): pass def test_content_ctags_search( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # 1. given tool = data.tools["universal-ctags"] tool_id = tool["id"] ctags1 = [ ContentCtagsRow( id=data.sha1_1, indexer_configuration_id=tool_id, **kwargs, # type: ignore ) for kwargs in [ {"name": "hello", "kind": "function", "line": 133, "lang": "Python",}, {"name": "counter", "kind": "variable", "line": 119, "lang": "Python",}, {"name": "hello", "kind": "variable", "line": 210, "lang": "Python",}, ] ] ctags1_with_tool = [ attr.evolve(ctag, indexer_configuration_id=None, tool=tool) for ctag in ctags1 ] ctags2 = [ ContentCtagsRow( id=data.sha1_2, indexer_configuration_id=tool_id, **kwargs, # type: ignore ) for kwargs in [ {"name": "hello", "kind": "variable", "line": 100, "lang": "C",}, {"name": "result", "kind": "variable", "line": 120, "lang": "C",}, ] ] ctags2_with_tool = [ attr.evolve(ctag, indexer_configuration_id=None, tool=tool) for ctag in ctags2 ] storage.content_ctags_add(ctags1 + ctags2) # 1. when actual_ctags = list(storage.content_ctags_search("hello", limit=1)) # 1. then assert actual_ctags == [ctags1_with_tool[0]] # 2. when actual_ctags = list( storage.content_ctags_search("hello", limit=1, last_sha1=data.sha1_1) ) # 2. then assert actual_ctags == [ctags2_with_tool[0]] # 3. when actual_ctags = list(storage.content_ctags_search("hello")) # 3. then assert actual_ctags == [ ctags1_with_tool[0], ctags1_with_tool[2], ctags2_with_tool[0], ] # 4. when actual_ctags = list(storage.content_ctags_search("counter")) # then assert actual_ctags == [ctags1_with_tool[1]] # 5. 
when actual_ctags = list(storage.content_ctags_search("result", limit=1)) # then assert actual_ctags == [ctags2_with_tool[1]] def test_content_ctags_search_no_result( self, swh_indexer_storage: IndexerStorageInterface ) -> None: storage = swh_indexer_storage actual_ctags = list(storage.content_ctags_search("counter")) assert not actual_ctags def test_content_ctags_add__add_new_ctags_added( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool = data.tools["universal-ctags"] tool_id = tool["id"] ctag1 = ContentCtagsRow( id=data.sha1_2, indexer_configuration_id=tool_id, name="done", kind="variable", line=100, lang="Scheme", ) ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool) # given storage.content_ctags_add([ctag1]) storage.content_ctags_add([ctag1]) # conflict does nothing # when actual_ctags = list(storage.content_ctags_get([data.sha1_2])) # then assert actual_ctags == [ctag1_with_tool] # given ctag2 = ContentCtagsRow( id=data.sha1_2, indexer_configuration_id=tool_id, name="defn", kind="function", line=120, lang="Scheme", ) ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool) storage.content_ctags_add([ctag2]) actual_ctags = list(storage.content_ctags_get([data.sha1_2])) assert actual_ctags == [ctag1_with_tool, ctag2_with_tool] def test_content_ctags_add__update_in_place( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool = data.tools["universal-ctags"] tool_id = tool["id"] ctag1 = ContentCtagsRow( id=data.sha1_2, indexer_configuration_id=tool_id, name="done", kind="variable", line=100, lang="Scheme", ) ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool) # given storage.content_ctags_add([ctag1]) # when actual_ctags = list(storage.content_ctags_get([data.sha1_2])) # then assert actual_ctags == [ctag1_with_tool] # given ctag2 = ContentCtagsRow( id=data.sha1_2, indexer_configuration_id=tool_id, name="defn", kind="function", line=120, lang="Scheme", ) ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool) storage.content_ctags_add([ctag1, ctag2]) actual_ctags = list(storage.content_ctags_get([data.sha1_2])) assert actual_ctags == [ctag1_with_tool, ctag2_with_tool] def test_add_empty( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: (storage, data) = swh_indexer_storage_with_data etype = self.endpoint_type summary = endpoint(storage, etype, "add")([]) assert summary == {"content_ctags:add": 0} actual_ctags = list(endpoint(storage, etype, "get")([data.sha1_2])) assert actual_ctags == [] def test_get_unknown( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: (storage, data) = swh_indexer_storage_with_data etype = self.endpoint_type actual_ctags = list(endpoint(storage, etype, "get")([data.sha1_2])) assert actual_ctags == [] class TestIndexerStorageContentMetadata(StorageETypeTester): """Test Indexer Storage content_metadata related methods """ tool_name = "swh-metadata-detector" endpoint_type = "content_metadata" example_data = [ { "metadata": { "other": {}, "codeRepository": { "type": "git", "url": "https://github.com/moranegg/metadata_test", }, "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, }, {"metadata": {"other": {}, "name": "test_metadata", "version": "0.0.1"},}, ] row_class = 
ContentMetadataRow class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester): """Test Indexer Storage revision_intrinsic_metadata related methods """ tool_name = "swh-metadata-detector" endpoint_type = "revision_intrinsic_metadata" example_data = [ { "metadata": { "other": {}, "codeRepository": { "type": "git", "url": "https://github.com/moranegg/metadata_test", }, "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, "mappings": ["mapping1"], }, { "metadata": {"other": {}, "name": "test_metadata", "version": "0.0.1"}, "mappings": ["mapping2"], }, ] row_class = RevisionIntrinsicMetadataRow class TestIndexerStorageContentFossologyLicense(StorageETypeTester): endpoint_type = "content_fossology_license" tool_name = "nomos" example_data = [ {"license": "Apache-2.0"}, {"license": "BSD-2-Clause"}, ] row_class = ContentLicenseRow # the following tests are disabled because licenses behaves differently @pytest.mark.skip def test_add__update_in_place_duplicate(self): pass @pytest.mark.skip def test_add_deadlock(self): pass # content_fossology_license_missing does not exist @pytest.mark.skip def test_missing(self): pass def test_content_fossology_license_add__new_license_added( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool = data.tools["nomos"] tool_id = tool["id"] license1 = ContentLicenseRow( id=data.sha1_1, license="Apache-2.0", indexer_configuration_id=tool_id, ) # given storage.content_fossology_license_add([license1]) # conflict does nothing storage.content_fossology_license_add([license1]) # when actual_licenses = list(storage.content_fossology_license_get([data.sha1_1])) # then expected_licenses = [ ContentLicenseRow(id=data.sha1_1, license="Apache-2.0", tool=tool,) ] assert actual_licenses == expected_licenses # given license2 = ContentLicenseRow( id=data.sha1_1, license="BSD-2-Clause", indexer_configuration_id=tool_id, ) storage.content_fossology_license_add([license2]) actual_licenses = list(storage.content_fossology_license_get([data.sha1_1])) expected_licenses.append( ContentLicenseRow(id=data.sha1_1, license="BSD-2-Clause", tool=tool,) ) # first license was not removed when the second one was added assert sorted(actual_licenses) == sorted(expected_licenses) def test_generate_content_fossology_license_get_partition_failure( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition call with wrong limit input should fail""" storage, data = swh_indexer_storage_with_data indexer_configuration_id = 42 with pytest.raises( IndexerStorageArgumentException, match="limit should not be None" ): storage.content_fossology_license_get_partition( indexer_configuration_id, 0, 3, limit=None, # type: ignore ) def test_generate_content_fossology_license_get_partition_no_limit( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition should return results""" storage, data = swh_indexer_storage_with_data # craft some consistent mimetypes fossology_licenses = data.fossology_licenses mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db expected_ids = set([c.id for c in fossology_licenses]) assert len(fossology_licenses) == 10 assert 
len(mimetypes) == 10 nb_partitions = 4 actual_ids = [] for partition_id in range(nb_partitions): actual_result = storage.content_fossology_license_get_partition( indexer_configuration_id, partition_id, nb_partitions ) assert actual_result.next_page_token is None actual_ids.extend(actual_result.results) assert len(set(actual_ids)) == len(expected_ids) for actual_id in actual_ids: assert actual_id in expected_ids def test_generate_content_fossology_license_get_partition_full( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition for a single partition should return available ids """ storage, data = swh_indexer_storage_with_data # craft some consistent mimetypes fossology_licenses = data.fossology_licenses mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db expected_ids = set([c.id for c in fossology_licenses]) actual_result = storage.content_fossology_license_get_partition( indexer_configuration_id, 0, 1 ) assert actual_result.next_page_token is None actual_ids = actual_result.results assert len(set(actual_ids)) == len(expected_ids) for actual_id in actual_ids: assert actual_id in expected_ids def test_generate_content_fossology_license_get_partition_empty( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition when at least one of the partitions is empty""" storage, data = swh_indexer_storage_with_data # craft some consistent mimetypes fossology_licenses = data.fossology_licenses mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db expected_ids = set([c.id for c in fossology_licenses]) # nb_partitions = smallest power of 2 such that at least one of # the partitions is empty nb_licenses = len(fossology_licenses) nb_partitions = 1 << math.floor(math.log2(nb_licenses) + 1) seen_ids = [] for partition_id in range(nb_partitions): actual_result = storage.content_fossology_license_get_partition( indexer_configuration_id, partition_id, nb_partitions, limit=nb_licenses + 1, ) for actual_id in actual_result.results: seen_ids.append(actual_id) # Limit is higher than the max number of results assert actual_result.next_page_token is None assert set(seen_ids) == expected_ids def test_generate_content_fossology_license_get_partition_with_pagination( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: """get_partition should return ids provided with paginationv """ storage, data = swh_indexer_storage_with_data # craft some consistent mimetypes fossology_licenses = data.fossology_licenses mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db expected_ids = [c.id for c in fossology_licenses] nb_partitions = 4 actual_ids = [] for partition_id in range(nb_partitions): next_page_token = None while True: actual_result = storage.content_fossology_license_get_partition( 
indexer_configuration_id, partition_id, nb_partitions, limit=2, page_token=next_page_token, ) actual_ids.extend(actual_result.results) next_page_token = actual_result.next_page_token if next_page_token is None: break assert len(set(actual_ids)) == len(set(expected_ids)) for actual_id in actual_ids: assert actual_id in expected_ids def test_add_empty( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: (storage, data) = swh_indexer_storage_with_data etype = self.endpoint_type summary = endpoint(storage, etype, "add")([]) assert summary == {"content_fossology_license:add": 0} actual_license = list(endpoint(storage, etype, "get")([data.sha1_2])) assert actual_license == [] def test_get_unknown( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: (storage, data) = swh_indexer_storage_with_data etype = self.endpoint_type actual_license = list(endpoint(storage, etype, "get")([data.sha1_2])) assert actual_license == [] class TestIndexerStorageOriginIntrinsicMetadata: def test_origin_intrinsic_metadata_add( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool_id = data.tools["swh-metadata-detector"]["id"] metadata = { "version": None, "name": None, } metadata_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_2, metadata=metadata, mappings=["mapping1"], indexer_configuration_id=tool_id, ) metadata_origin = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata, indexer_configuration_id=tool_id, mappings=["mapping1"], from_revision=data.revision_id_2, ) # when storage.revision_intrinsic_metadata_add([metadata_rev]) storage.origin_intrinsic_metadata_add([metadata_origin]) # then actual_metadata = list( storage.origin_intrinsic_metadata_get([data.origin_url_1, "no://where"]) ) expected_metadata = [ OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata, tool=data.tools["swh-metadata-detector"], from_revision=data.revision_id_2, mappings=["mapping1"], ) ] assert actual_metadata == expected_metadata journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_metadata = [ obj for (obj_type, obj) in journal_objects if obj_type == "origin_intrinsic_metadata" ] assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata)) def test_origin_intrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool_id = data.tools["swh-metadata-detector"]["id"] metadata_v1: Dict[str, Any] = { "version": None, "name": None, } metadata_rev_v1 = RevisionIntrinsicMetadataRow( id=data.revision_id_2, metadata=metadata_v1, mappings=[], indexer_configuration_id=tool_id, ) metadata_origin_v1 = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata_v1.copy(), indexer_configuration_id=tool_id, mappings=[], from_revision=data.revision_id_2, ) # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list( storage.origin_intrinsic_metadata_get([data.origin_url_1]) ) # then expected_metadata_v1 = [ OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata_v1, tool=data.tools["swh-metadata-detector"], from_revision=data.revision_id_2, mappings=[], ) ] assert actual_metadata == expected_metadata_v1 # given metadata_v2 = metadata_v1.copy() metadata_v2.update( {"name": 
"test_update_duplicated_metadata", "author": "MG",} ) metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2) metadata_origin_v2 = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata_v2.copy(), indexer_configuration_id=tool_id, mappings=["npm"], from_revision=data.revision_id_1, ) storage.revision_intrinsic_metadata_add([metadata_rev_v2]) storage.origin_intrinsic_metadata_add([metadata_origin_v2]) actual_metadata = list( storage.origin_intrinsic_metadata_get([data.origin_url_1]) ) expected_metadata_v2 = [ OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata_v2, tool=data.tools["swh-metadata-detector"], from_revision=data.revision_id_1, mappings=["npm"], ) ] # metadata did change as the v2 was used to overwrite v1 assert actual_metadata == expected_metadata_v2 def test_origin_intrinsic_metadata_add__deadlock( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool_id = data.tools["swh-metadata-detector"]["id"] origins = ["file:///tmp/origin{:02d}".format(i) for i in range(100)] example_data1: Dict[str, Any] = { "metadata": {"version": None, "name": None,}, "mappings": [], } example_data2: Dict[str, Any] = { "metadata": {"version": "v1.1.1", "name": "foo",}, "mappings": [], } metadata_rev_v1 = RevisionIntrinsicMetadataRow( id=data.revision_id_2, metadata={"version": None, "name": None,}, mappings=[], indexer_configuration_id=tool_id, ) data_v1 = [ OriginIntrinsicMetadataRow( id=origin, from_revision=data.revision_id_2, indexer_configuration_id=tool_id, **example_data1, ) for origin in origins ] data_v2 = [ OriginIntrinsicMetadataRow( id=origin, from_revision=data.revision_id_2, indexer_configuration_id=tool_id, **example_data2, ) for origin in origins ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. 
data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add(data_v1) # when actual_data = list(storage.origin_intrinsic_metadata_get(origins)) expected_data_v1 = [ OriginIntrinsicMetadataRow( id=origin, from_revision=data.revision_id_2, tool=data.tools["swh-metadata-detector"], **example_data1, ) for origin in origins ] # then assert actual_data == expected_data_v1 # given def f1() -> None: storage.origin_intrinsic_metadata_add(data_v2a) def f2() -> None: storage.origin_intrinsic_metadata_add(data_v2b) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(storage.origin_intrinsic_metadata_get(origins)) expected_data_v2 = [ OriginIntrinsicMetadataRow( id=origin, from_revision=data.revision_id_2, tool=data.tools["swh-metadata-detector"], **example_data2, ) for origin in origins ] actual_data.sort(key=lambda item: item.id) assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2) for (item, expected_item_v1, expected_item_v2) in zip( actual_data, expected_data_v1, expected_data_v2 ): assert item in (expected_item_v1, expected_item_v2) def test_origin_intrinsic_metadata_add__duplicate_twice( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool_id = data.tools["swh-metadata-detector"]["id"] metadata = { "developmentStatus": None, "name": None, } metadata_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_2, metadata=metadata, mappings=["mapping1"], indexer_configuration_id=tool_id, ) metadata_origin = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata, indexer_configuration_id=tool_id, mappings=["mapping1"], from_revision=data.revision_id_2, ) # when storage.revision_intrinsic_metadata_add([metadata_rev]) with pytest.raises(DuplicateId): storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin]) def test_origin_intrinsic_metadata_search_fulltext( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool_id = data.tools["swh-metadata-detector"]["id"] metadata1 = { "author": "John Doe", } metadata1_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, ) metadata1_origin = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, from_revision=data.revision_id_1, ) metadata2 = { "author": "Jane Doe", } metadata2_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_2, metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, ) metadata2_origin = OriginIntrinsicMetadataRow( id=data.origin_url_2, metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, from_revision=data.revision_id_2, ) # when storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = storage.origin_intrinsic_metadata_search_fulltext assert set([res.id for res in search(["Doe"])]) == set( [data.origin_url_1, data.origin_url_2] ) assert [res.id for res in search(["John", "Doe"])] == [data.origin_url_1] assert [res.id for res in search(["John"])] == [data.origin_url_1] assert not 
list(search(["John", "Jane"])) def test_origin_intrinsic_metadata_search_fulltext_rank( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data # given tool_id = data.tools["swh-metadata-detector"]["id"] # The following authors have "Random Person" to add some more content # to the JSON data, to work around normalization quirks when there # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words # for small values of nb_words). metadata1 = {"author": ["Random Person", "John Doe", "Jane Doe",]} metadata1_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, ) metadata1_origin = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, from_revision=data.revision_id_1, ) metadata2 = {"author": ["Random Person", "Jane Doe",]} metadata2_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_2, metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, ) metadata2_origin = OriginIntrinsicMetadataRow( id=data.origin_url_2, metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, from_revision=data.revision_id_2, ) # when storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = storage.origin_intrinsic_metadata_search_fulltext assert [res.id for res in search(["Doe"])] == [ data.origin_url_1, data.origin_url_2, ] assert [res.id for res in search(["Doe"], limit=1)] == [data.origin_url_1] assert [res.id for res in search(["John"])] == [data.origin_url_1] assert [res.id for res in search(["Jane"])] == [ data.origin_url_2, data.origin_url_1, ] assert [res.id for res in search(["John", "Jane"])] == [data.origin_url_1] def _fill_origin_intrinsic_metadata( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data tool1_id = data.tools["swh-metadata-detector"]["id"] tool2_id = data.tools["swh-metadata-detector2"]["id"] metadata1 = { "@context": "foo", "author": "John Doe", } metadata1_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_1, metadata=metadata1, mappings=["npm"], indexer_configuration_id=tool1_id, ) metadata1_origin = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata1, mappings=["npm"], indexer_configuration_id=tool1_id, from_revision=data.revision_id_1, ) metadata2 = { "@context": "foo", "author": "Jane Doe", } metadata2_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_2, metadata=metadata2, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, ) metadata2_origin = OriginIntrinsicMetadataRow( id=data.origin_url_2, metadata=metadata2, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, from_revision=data.revision_id_2, ) metadata3 = { "@context": "foo", } metadata3_rev = RevisionIntrinsicMetadataRow( id=data.revision_id_3, metadata=metadata3, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, ) metadata3_origin = OriginIntrinsicMetadataRow( id=data.origin_url_3, metadata=metadata3, mappings=["pkg-info"], indexer_configuration_id=tool2_id, from_revision=data.revision_id_3, ) storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) 
storage.origin_intrinsic_metadata_add([metadata2_origin]) storage.revision_intrinsic_metadata_add([metadata3_rev]) storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data self._fill_origin_intrinsic_metadata(swh_indexer_storage_with_data) tool1 = data.tools["swh-metadata-detector"] tool2 = data.tools["swh-metadata-detector2"] endpoint = storage.origin_intrinsic_metadata_search_by_producer # test pagination # no 'page_token' param, return all origins result = endpoint(ids_only=True) assert result == PagedResult( results=[data.origin_url_1, data.origin_url_2, data.origin_url_3,], next_page_token=None, ) # 'page_token' is < than origin_1, return everything result = endpoint(page_token=data.origin_url_1[:-1], ids_only=True) assert result == PagedResult( results=[data.origin_url_1, data.origin_url_2, data.origin_url_3,], next_page_token=None, ) # 'page_token' is origin_3, return nothing result = endpoint(page_token=data.origin_url_3, ids_only=True) assert result == PagedResult(results=[], next_page_token=None) # test limit argument result = endpoint(page_token=data.origin_url_1[:-1], limit=2, ids_only=True) assert result == PagedResult( results=[data.origin_url_1, data.origin_url_2], next_page_token=data.origin_url_2, ) result = endpoint(page_token=data.origin_url_1, limit=2, ids_only=True) assert result == PagedResult( results=[data.origin_url_2, data.origin_url_3], next_page_token=None, ) result = endpoint(page_token=data.origin_url_2, limit=2, ids_only=True) assert result == PagedResult(results=[data.origin_url_3], next_page_token=None,) # test mappings filtering result = endpoint(mappings=["npm"], ids_only=True) assert result == PagedResult( results=[data.origin_url_1, data.origin_url_2], next_page_token=None, ) result = endpoint(mappings=["npm", "gemspec"], ids_only=True) assert result == PagedResult( results=[data.origin_url_1, data.origin_url_2], next_page_token=None, ) result = endpoint(mappings=["gemspec"], ids_only=True) assert result == PagedResult(results=[data.origin_url_2], next_page_token=None,) result = endpoint(mappings=["pkg-info"], ids_only=True) assert result == PagedResult(results=[data.origin_url_3], next_page_token=None,) result = endpoint(mappings=["foobar"], ids_only=True) assert result == PagedResult(results=[], next_page_token=None,) # test pagination + mappings result = endpoint(mappings=["npm"], limit=1, ids_only=True) assert result == PagedResult( results=[data.origin_url_1], next_page_token=data.origin_url_1, ) # test tool filtering result = endpoint(tool_ids=[tool1["id"]], ids_only=True) assert result == PagedResult(results=[data.origin_url_1], next_page_token=None,) result = endpoint(tool_ids=[tool2["id"]], ids_only=True) assert sorted(result.results) == [data.origin_url_2, data.origin_url_3] assert result.next_page_token is None result = endpoint(tool_ids=[tool1["id"], tool2["id"]], ids_only=True) assert sorted(result.results) == [ data.origin_url_1, data.origin_url_2, data.origin_url_3, ] assert result.next_page_token is None # test ids_only=False assert endpoint(mappings=["gemspec"]) == PagedResult( results=[ OriginIntrinsicMetadataRow( id=data.origin_url_2, metadata={"@context": "foo", "author": "Jane Doe",}, mappings=["npm", "gemspec"], tool=tool2, from_revision=data.revision_id_2, ) ], next_page_token=None, ) def test_origin_intrinsic_metadata_stats( self, 
swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data self._fill_origin_intrinsic_metadata(swh_indexer_storage_with_data) result = storage.origin_intrinsic_metadata_stats() assert result == { "per_mapping": { + "cff": 0, "gemspec": 1, "npm": 2, "pkg-info": 1, "codemeta": 0, "maven": 0, }, "total": 3, "non_empty": 2, } class TestIndexerStorageIndexerConfiguration: def test_indexer_configuration_add( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data tool = { "tool_name": "some-unknown-tool", "tool_version": "some-version", "tool_configuration": {"debian-package": "some-package"}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool is None # does not exist # add it actual_tools = list(storage.indexer_configuration_add([tool])) assert len(actual_tools) == 1 actual_tool = actual_tools[0] assert actual_tool is not None # now it exists new_id = actual_tool.pop("id") assert actual_tool == tool actual_tools2 = list(storage.indexer_configuration_add([tool])) actual_tool2 = actual_tools2[0] assert actual_tool2 is not None # now it exists new_id2 = actual_tool2.pop("id") assert new_id == new_id2 assert actual_tool == actual_tool2 def test_indexer_configuration_add_multiple( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data tool = { "tool_name": "some-unknown-tool", "tool_version": "some-version", "tool_configuration": {"debian-package": "some-package"}, } actual_tools = list(storage.indexer_configuration_add([tool])) assert len(actual_tools) == 1 new_tools = [ tool, { "tool_name": "yet-another-tool", "tool_version": "version", "tool_configuration": {}, }, ] actual_tools = list(storage.indexer_configuration_add(new_tools)) assert len(actual_tools) == 2 # order not guaranteed, so we iterate over results to check for tool in actual_tools: _id = tool.pop("id") assert _id is not None assert tool in new_tools def test_indexer_configuration_get_missing( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data tool = { "tool_name": "unknown-tool", "tool_version": "3.1.0rc2-31-ga2cbb8c", "tool_configuration": {"command_line": "nomossa "}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool is None def test_indexer_configuration_get( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data tool = { "tool_name": "nomos", "tool_version": "3.1.0rc2-31-ga2cbb8c", "tool_configuration": {"command_line": "nomossa "}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool expected_tool = tool.copy() del actual_tool["id"] assert expected_tool == actual_tool def test_indexer_configuration_metadata_get_missing_context( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data tool = { "tool_name": "swh-metadata-translator", "tool_version": "0.0.1", "tool_configuration": {"context": "unknown-context"}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool is None def test_indexer_configuration_metadata_get( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data tool = { "tool_name": "swh-metadata-translator", "tool_version": 
"0.0.1", "tool_configuration": {"type": "local", "context": "NpmMapping"}, } storage.indexer_configuration_add([tool]) actual_tool = storage.indexer_configuration_get(tool) assert actual_tool expected_tool = tool.copy() expected_tool["id"] = actual_tool["id"] assert expected_tool == actual_tool diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index 265fbea..97698d3 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -1,493 +1,501 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from functools import reduce import re from typing import Any, Dict, List from unittest.mock import patch from click.testing import CliRunner from confluent_kafka import Consumer import pytest from swh.indexer.cli import indexer_cli_group from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow, ) from swh.journal.writer import get_journal_writer from swh.model.hashutil import hash_to_bytes from swh.model.model import OriginVisitStatus def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]: tools: List[Dict[str, Any]] = [ {"tool_name": "tool %d" % i, "tool_version": "0.0.1", "tool_configuration": {},} for i in range(2) ] tools = idx_storage.indexer_configuration_add(tools) origin_metadata = [ OriginIntrinsicMetadataRow( id="file://dev/%04d" % origin_id, from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)), indexer_configuration_id=tools[origin_id % 2]["id"], metadata={"name": "origin %d" % origin_id}, mappings=["mapping%d" % (origin_id % 10)], ) for origin_id in range(nb_rows) ] revision_metadata = [ RevisionIntrinsicMetadataRow( id=hash_to_bytes("abcd{:0>36}".format(origin_id)), indexer_configuration_id=tools[origin_id % 2]["id"], metadata={"name": "origin %d" % origin_id}, mappings=["mapping%d" % (origin_id % 10)], ) for origin_id in range(nb_rows) ] idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool["id"] for tool in tools] def _origins_in_task_args(tasks): """Returns the set of origins contained in the arguments of the provided tasks (assumed to be of type index-origin-metadata).""" return reduce( set.union, (set(task["arguments"]["args"][0]) for task in tasks), set() ) def _assert_tasks_for_origins(tasks, origins): expected_kwargs = {} assert {task["type"] for task in tasks} == {"index-origin-metadata"} assert all(len(task["arguments"]["args"]) == 1 for task in tasks) for task in tasks: assert task["arguments"]["kwargs"] == expected_kwargs, task assert _origins_in_task_args(tasks) == set(["file://dev/%04d" % i for i in origins]) @pytest.fixture def cli_runner(): return CliRunner() def test_cli_mapping_list(cli_runner, swh_config): result = cli_runner.invoke( indexer_cli_group, ["-C", swh_config, "mapping", "list"], catch_exceptions=False, ) expected_output = "\n".join( - ["codemeta", "gemspec", "maven", "npm", "pkg-info", "",] + [ + "cff", + "codemeta", + "gemspec", + "maven", + "npm", + "pkg-info", + "", + ] # must be sorted for test to pass ) assert result.exit_code == 0, result.output assert result.output == expected_output def test_cli_mapping_list_terms(cli_runner, swh_config): result = cli_runner.invoke( 
indexer_cli_group, ["-C", swh_config, "mapping", "list-terms"], catch_exceptions=False, ) assert result.exit_code == 0, result.output assert re.search(r"http://schema.org/url:\n.*npm", result.output) assert re.search(r"http://schema.org/url:\n.*codemeta", result.output) assert re.search( r"https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta", result.output, ) def test_cli_mapping_list_terms_exclude(cli_runner, swh_config): result = cli_runner.invoke( indexer_cli_group, ["-C", swh_config, "mapping", "list-terms", "--exclude-mapping", "codemeta"], catch_exceptions=False, ) assert result.exit_code == 0, result.output assert re.search(r"http://schema.org/url:\n.*npm", result.output) assert not re.search(r"http://schema.org/url:\n.*codemeta", result.output) assert not re.search( r"https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta", result.output, ) @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) @patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3) def test_cli_origin_metadata_reindex_empty_db( cli_runner, swh_config, indexer_scheduler, idx_storage, storage ): result = cli_runner.invoke( indexer_cli_group, ["-C", swh_config, "schedule", "reindex_origin_metadata",], catch_exceptions=False, ) expected_output = "Nothing to do (no origin metadata matched the criteria).\n" assert result.exit_code == 0, result.output assert result.output == expected_output tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) @patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3) def test_cli_origin_metadata_reindex_divisor( cli_runner, swh_config, indexer_scheduler, idx_storage, storage ): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = cli_runner.invoke( indexer_cli_group, ["-C", swh_config, "schedule", "reindex_origin_metadata",], catch_exceptions=False, ) # Check the output expected_output = ( "Scheduled 3 tasks (30 origins).\n" "Scheduled 6 tasks (60 origins).\n" "Scheduled 9 tasks (90 origins).\n" "Done.\n" ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 9 _assert_tasks_for_origins(tasks, range(90)) @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) @patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3) def test_cli_origin_metadata_reindex_dry_run( cli_runner, swh_config, indexer_scheduler, idx_storage, storage ): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 90) result = cli_runner.invoke( indexer_cli_group, ["-C", swh_config, "schedule", "--dry-run", "reindex_origin_metadata",], catch_exceptions=False, ) # Check the output expected_output = ( "Scheduled 3 tasks (30 origins).\n" "Scheduled 6 tasks (60 origins).\n" "Scheduled 9 tasks (90 origins).\n" "Done.\n" ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 0 @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) @patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3) def test_cli_origin_metadata_reindex_nondivisor( cli_runner, swh_config, indexer_scheduler, idx_storage, storage ): """Tests the re-indexing when neither origin_batch_size or task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 70) result = cli_runner.invoke( indexer_cli_group, [ 
"-C", swh_config, "schedule", "reindex_origin_metadata", "--batch-size", "20", ], catch_exceptions=False, ) # Check the output expected_output = ( "Scheduled 3 tasks (60 origins).\n" "Scheduled 4 tasks (70 origins).\n" "Done.\n" ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 4 _assert_tasks_for_origins(tasks, range(70)) @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) @patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3) def test_cli_origin_metadata_reindex_filter_one_mapping( cli_runner, swh_config, indexer_scheduler, idx_storage, storage ): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = cli_runner.invoke( indexer_cli_group, [ "-C", swh_config, "schedule", "reindex_origin_metadata", "--mapping", "mapping1", ], catch_exceptions=False, ) # Check the output expected_output = "Scheduled 2 tasks (11 origins).\nDone.\n" assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 2 _assert_tasks_for_origins(tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]) @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) @patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3) def test_cli_origin_metadata_reindex_filter_two_mappings( cli_runner, swh_config, indexer_scheduler, idx_storage, storage ): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" fill_idx_storage(idx_storage, 110) result = cli_runner.invoke( indexer_cli_group, [ "--config-file", swh_config, "schedule", "reindex_origin_metadata", "--mapping", "mapping1", "--mapping", "mapping2", ], catch_exceptions=False, ) # Check the output expected_output = "Scheduled 3 tasks (22 origins).\nDone.\n" assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 3 _assert_tasks_for_origins( tasks, [ 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102, ], ) @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) @patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3) def test_cli_origin_metadata_reindex_filter_one_tool( cli_runner, swh_config, indexer_scheduler, idx_storage, storage ): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" tool_ids = fill_idx_storage(idx_storage, 110) result = cli_runner.invoke( indexer_cli_group, [ "-C", swh_config, "schedule", "reindex_origin_metadata", "--tool-id", str(tool_ids[0]), ], catch_exceptions=False, ) # Check the output expected_output = ( "Scheduled 3 tasks (30 origins).\n" "Scheduled 6 tasks (55 origins).\n" "Done.\n" ) assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks() assert len(tasks) == 6 _assert_tasks_for_origins(tasks, [x * 2 for x in range(55)]) def now(): return datetime.datetime.now(tz=datetime.timezone.utc) def test_cli_journal_client( cli_runner, swh_config, indexer_scheduler, kafka_prefix: str, kafka_server, consumer: Consumer, ): """Test the 'swh indexer journal-client' cli tool.""" journal_writer = get_journal_writer( "kafka", brokers=[kafka_server], prefix=kafka_prefix, client_id="test producer", value_sanitizer=lambda object_type, value: value, 
flush_timeout=3, # fail early if something is going wrong ) visit_statuses = [ OriginVisitStatus( origin="file:///dev/zero", visit=1, date=now(), status="full", snapshot=None, ), OriginVisitStatus( origin="file:///dev/foobar", visit=2, date=now(), status="full", snapshot=None, ), OriginVisitStatus( origin="file:///tmp/spamegg", visit=3, date=now(), status="full", snapshot=None, ), OriginVisitStatus( origin="file:///dev/0002", visit=6, date=now(), status="full", snapshot=None, ), OriginVisitStatus( # will be filtered out due to its 'partial' status origin="file:///dev/0000", visit=4, date=now(), status="partial", snapshot=None, ), OriginVisitStatus( # will be filtered out due to its 'ongoing' status origin="file:///dev/0001", visit=5, date=now(), status="ongoing", snapshot=None, ), ] journal_writer.write_additions("origin_visit_status", visit_statuses) visit_statuses_full = [vs for vs in visit_statuses if vs.status == "full"] result = cli_runner.invoke( indexer_cli_group, [ "-C", swh_config, "journal-client", "--broker", kafka_server, "--prefix", kafka_prefix, "--group-id", "test-consumer", "--stop-after-objects", len(visit_statuses), "--origin-metadata-task-type", "index-origin-metadata", ], catch_exceptions=False, ) # Check the output expected_output = "Done.\n" assert result.exit_code == 0, result.output assert result.output == expected_output # Check scheduled tasks tasks = indexer_scheduler.search_tasks(task_type="index-origin-metadata") # This can be split into multiple tasks but no more than the origin-visit-statuses # written in the journal assert len(tasks) <= len(visit_statuses_full) actual_origins = [] for task in tasks: actual_task = dict(task) assert actual_task["type"] == "index-origin-metadata" scheduled_origins = actual_task["arguments"]["args"][0] actual_origins.extend(scheduled_origins) assert set(actual_origins) == {vs.origin for vs in visit_statuses_full} def test_cli_journal_client_without_brokers( cli_runner, swh_config, kafka_prefix: str, kafka_server, consumer: Consumer ): """Without brokers configuration, the cli fails.""" with pytest.raises(ValueError, match="brokers"): cli_runner.invoke( indexer_cli_group, ["-C", swh_config, "journal-client",], catch_exceptions=False, ) diff --git a/swh/indexer/tests/test_codemeta.py b/swh/indexer/tests/test_codemeta.py index 7214e73..cadc35f 100644 --- a/swh/indexer/tests/test_codemeta.py +++ b/swh/indexer/tests/test_codemeta.py @@ -1,258 +1,258 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.indexer.codemeta import CROSSWALK_TABLE, merge_documents, merge_values def test_crosstable(): assert CROSSWALK_TABLE["NodeJS"] == { "repository": "http://schema.org/codeRepository", "os": "http://schema.org/operatingSystem", "cpu": "http://schema.org/processorRequirements", - "engines": "http://schema.org/processorRequirements", + "engines": "http://schema.org/runtimePlatform", "author": "http://schema.org/author", "author.email": "http://schema.org/email", "author.name": "http://schema.org/name", - "contributor": "http://schema.org/contributor", + "contributors": "http://schema.org/contributor", "keywords": "http://schema.org/keywords", "license": "http://schema.org/license", "version": "http://schema.org/version", "description": "http://schema.org/description", "name": "http://schema.org/name", "bugs": 
"https://codemeta.github.io/terms/issueTracker", "homepage": "http://schema.org/url", } def test_merge_values(): assert merge_values("a", "b") == ["a", "b"] assert merge_values(["a", "b"], "c") == ["a", "b", "c"] assert merge_values("a", ["b", "c"]) == ["a", "b", "c"] assert merge_values({"@list": ["a"]}, {"@list": ["b"]}) == {"@list": ["a", "b"]} assert merge_values({"@list": ["a", "b"]}, {"@list": ["c"]}) == { "@list": ["a", "b", "c"] } with pytest.raises(ValueError): merge_values({"@list": ["a"]}, "b") with pytest.raises(ValueError): merge_values("a", {"@list": ["b"]}) with pytest.raises(ValueError): merge_values({"@list": ["a"]}, ["b"]) with pytest.raises(ValueError): merge_values(["a"], {"@list": ["b"]}) assert merge_values("a", None) == "a" assert merge_values(["a", "b"], None) == ["a", "b"] assert merge_values(None, ["b", "c"]) == ["b", "c"] assert merge_values({"@list": ["a"]}, None) == {"@list": ["a"]} assert merge_values(None, {"@list": ["a"]}) == {"@list": ["a"]} def test_merge_documents(): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [ { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "test_1", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "test_0_1", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "test_metadata", "version": "0.0.2", "author": {"type": "Person", "name": "moranegg",}, }, ] # when results = merge_documents(metadata_list) # then expected_results = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "version": "0.0.2", "description": "Simple package.json test for indexer", "name": ["test_1", "test_0_1", "test_metadata"], "author": [{"type": "Person", "name": "moranegg"}], "codeRepository": "git+https://github.com/moranegg/metadata_test", } assert results == expected_results def test_merge_documents_ids(): # given metadata_list = [ { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/test1", "name": "test_1", }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/test2", "name": "test_2", }, ] # when results = merge_documents(metadata_list) # then expected_results = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/test1", "schema:sameAs": "http://example.org/test2", "name": ["test_1", "test_2"], } assert results == expected_results def test_merge_documents_duplicate_ids(): # given metadata_list = [ { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/test1", "name": "test_1", }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/test1", "name": "test_1b", }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/test2", "name": "test_2", }, ] # when results = merge_documents(metadata_list) # then expected_results = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/test1", "schema:sameAs": "http://example.org/test2", "name": ["test_1", "test_1b", "test_2"], } assert results == expected_results def test_merge_documents_lists(): """Tests merging two @list elements.""" # given metadata_list = [ { "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", "author": {"@list": [{"name": "test_1"},]}, }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"@list": [{"name": "test_2"},]}, }, ] # when results = merge_documents(metadata_list) # then expected_results = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [{"name": "test_1"}, {"name": "test_2"},], } assert results == expected_results def test_merge_documents_lists_duplicates(): """Tests merging two @list elements with a duplicate subelement.""" # given metadata_list = [ { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"@list": [{"name": "test_1"},]}, }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"@list": [{"name": "test_2"}, {"name": "test_1"},]}, }, ] # when results = merge_documents(metadata_list) # then expected_results = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [{"name": "test_1"}, {"name": "test_2"},], } assert results == expected_results def test_merge_documents_list_left(): """Tests merging a singleton with an @list.""" # given metadata_list = [ { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"name": "test_1"}, }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"@list": [{"name": "test_2"},]}, }, ] # when results = merge_documents(metadata_list) # then expected_results = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [{"name": "test_1"}, {"name": "test_2"},], } assert results == expected_results def test_merge_documents_list_right(): """Tests merging an @list with a singleton.""" # given metadata_list = [ { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"@list": [{"name": "test_1"},]}, }, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"name": "test_2"}, }, ] # when results = merge_documents(metadata_list) # then expected_results = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [{"name": "test_1"}, {"name": "test_2"},], } assert results == expected_results diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 0505934..d3bddc5 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,1205 +1,1318 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest from hypothesis import HealthCheck, given, settings, strategies from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow from swh.indexer.tests.utils import DIRECTORY2, REVISION from swh.model.hashutil import hash_to_bytes from swh.model.model import Directory, DirectoryEntry, Revision from .utils import ( BASE_TEST_CONFIG, YARN_PARSER_METADATA, fill_obj_storage, fill_storage, json_document_strategy, xml_document_strategy, ) TRANSLATOR_TOOL = { "name": "swh-metadata-translator", 
"version": "0.0.2", "configuration": {"type": "local", "context": "NpmMapping"}, } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the rev indexer configures it." REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS["NpmMapping"]() self.codemeta_mapping = MAPPINGS["CodemetaMapping"]() self.maven_mapping = MAPPINGS["MavenMapping"]() self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]() self.gemspec_mapping = MAPPINGS["GemspecMapping"]() + self.cff_mapping = MAPPINGS["CffMapping"]() def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) + def test_compute_metadata_cff(self): + """ + testing CITATION.cff translation + """ + # given + content = """# YAML 1.2 +--- +abstract: "Command line program to convert from Citation File \ +Format to various other formats such as BibTeX, EndNote, RIS, \ +schema.org, CodeMeta, and .zenodo.json." +authors: + - + affiliation: "Netherlands eScience Center" + family-names: Klaver + given-names: Tom + - + affiliation: "Humboldt-Universität zu Berlin" + family-names: Druskat + given-names: Stephan + orcid: https://orcid.org/0000-0003-4925-7248 +cff-version: "1.0.3" +date-released: 2019-11-12 +doi: 10.5281/zenodo.1162057 +keywords: + - "citation" + - "bibliography" + - "cff" + - "CITATION.cff" +license: Apache-2.0 +message: "If you use this software, please cite it using these metadata." 
+repository-code: "https://github.com/citation-file-format/cff-converter-python" +title: cffconvert +version: "1.4.0-alpha0" + """.encode( + "utf-8" + ) + + expected = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [ + { + "type": "Person", + "affiliation": { + "type": "Organization", + "name": "Netherlands eScience Center", + }, + "familyName": "Klaver", + "givenName": "Tom", + }, + { + "id": "https://orcid.org/0000-0003-4925-7248", + "type": "Person", + "affiliation": { + "type": "Organization", + "name": "Humboldt-Universität zu Berlin", + }, + "familyName": "Druskat", + "givenName": "Stephan", + }, + ], + "codeRepository": ( + "https://github.com/citation-file-format/cff-converter-python" + ), + "datePublished": "2019-11-12", + "description": """Command line program to convert from \ +Citation File Format to various other formats such as BibTeX, EndNote, \ +RIS, schema.org, CodeMeta, and .zenodo.json.""", + "identifier": "https://doi.org/10.5281/zenodo.1162057", + "keywords": ["citation", "bibliography", "cff", "CITATION.cff"], + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "1.4.0-alpha0", + } + + # when + result = self.cff_mapping.translate(content) + # then + self.assertEqual(expected, result) + def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", "author": [ {"type": "Person", "name": "Morane G", "email": "moranegg@example.com",} ], } # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s) results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) expected_results = [ ContentMetadataRow( id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "codeRepository": "git+https://github.com/moranegg/metadata_test", "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, ), ContentMetadataRow( id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": 
"SoftwareSourceCode", "issueTracker": "https://github.com/npm/npm/issues", "author": [ { "type": "Person", "name": "Isaac Z. Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "keywords": [ "install", "modules", "package manager", "package.json", ], "url": "https://docs.npmjs.com/", }, ), ] for result in results: del result.tool["id"] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", }, ) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", }, ) def test_detect_metadata_package_json(self): # given df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": 
b"package.json", "target": b"aab", "length": 712, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"cde", }, ] # when results = detect_metadata(df) expected_results = {"NpmMapping": [b"cde"]} # then self.assertEqual(expected_results, results) + def test_detect_metadata_codemeta_json_uppercase(self): + # given + df = [ + { + "sha1_git": b"abc", + "name": b"index.html", + "target": b"abc", + "length": 897, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"bcd", + }, + { + "sha1_git": b"aab", + "name": b"CODEMETA.json", + "target": b"aab", + "length": 712, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"bcd", + }, + ] + # when + results = detect_metadata(df) + + expected_results = {"CodemetaMapping": [b"bcd"]} + # then + self.assertEqual(expected_results, results) + def test_compute_metadata_valid_codemeta(self): raw_content = b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738", }, ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": 
"http://orcid.org/0000-0002-1642-628X", }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation", }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": ["metadata", "software"], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_codemeta_alternate_context(self): raw_content = b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", "codeRepository": ( "http://repo1.maven.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error parsing XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error detecting XML encoding from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = 
MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error unidecoding XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "version": "1.2.3", }, ) def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 foo """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_multiple(self): """Tests when there are multiple code repos and licenses.""" raw_content 
= b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": [ "https://www.apache.org/licenses/LICENSE-2.0.txt", "https://opensource.org/licenses/MIT", ], "codeRepository": [ "http://repo1.maven.org/maven2/com/mycompany/app/my-app", "http://example.org/maven2/com/mycompany/app/my-app", ], }, ) def test_compute_metadata_pkginfo(self): raw_content = b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual( result["description"], [ "Software Heritage core utilities", # note the comma here "swh-core\n" "========\n" "\n" "core library for swh's modules:\n" "- config parser\n" "- hash computations\n" "- serialization\n" "- logging mechanism\n" "", ], result, ) del result["description"] self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "url": "https://forge.softwareheritage.org/diffusion/DCORE/", "name": "swh.core", "author": [ { "type": "Person", "name": "Software Heritage developers", "email": "swh-devel@inria.fr", } ], "version": "0.0.49", }, ) def test_compute_metadata_pkginfo_utf8(self): raw_content = b"""\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "snowpyt", "description": "foo\nHydrology N°83", }, ) def test_compute_metadata_pkginfo_keywords(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo Keywords: foo bar baz """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "keywords": ["foo", "bar", "baz"], }, ) def test_compute_metadata_pkginfo_license(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo License: MIT """ # noqa result = self.pkginfo_mapping.translate(raw_content) 
        self.assertEqual(
            result,
            {
                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                "type": "SoftwareSourceCode",
                "name": "foo",
                "license": "MIT",
            },
        )

    def test_gemspec_base(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.name = 'example'
  s.version = '0.1.0'
  s.licenses = ['MIT']
  s.summary = "This is an example!"
  s.description = "Much longer explanation of the example!"
  s.authors = ["Ruby Coder"]
  s.email = 'rubycoder@example.com'
  s.files = ["lib/example.rb"]
  s.homepage = 'https://rubygems.org/gems/example'
  s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertCountEqual(
            result.pop("description"),
            ["This is an example!", "Much longer explanation of the example!"],
        )
        self.assertEqual(
            result,
            {
                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                "type": "SoftwareSourceCode",
                "author": [{"type": "Person", "name": "Ruby Coder"}],
                "name": "example",
                "license": "https://spdx.org/licenses/MIT",
                "codeRepository": "https://rubygems.org/gems/example",
                "email": "rubycoder@example.com",
                "version": "0.1.0",
            },
        )

    def test_gemspec_two_author_fields(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1"]
  s.author = "Ruby Coder2"
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertCountEqual(
            result.pop("author"),
            [
                {"type": "Person", "name": "Ruby Coder1"},
                {"type": "Person", "name": "Ruby Coder2"},
            ],
        )
        self.assertEqual(
            result,
            {
                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                "type": "SoftwareSourceCode",
            },
        )

    def test_gemspec_invalid_author(self):
        raw_content = b"""
Gem::Specification.new do |s|
  s.author = ["Ruby Coder"]
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(
            result,
            {
                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                "type": "SoftwareSourceCode",
            },
        )

        raw_content = b"""
Gem::Specification.new do |s|
  s.author = "Ruby Coder1",
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(
            result,
            {
                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                "type": "SoftwareSourceCode",
            },
        )

        raw_content = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(
            result,
            {
                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                "type": "SoftwareSourceCode",
                "author": [{"type": "Person", "name": "Ruby Coder1"}],
            },
        )

    def test_gemspec_alternative_header(self):
        raw_content = b"""
require './lib/version'

Gem::Specification.new { |s|
  s.name = 'rb-system-with-aliases'
  s.summary = 'execute system commands with aliases'
}
"""
        result = self.gemspec_mapping.translate(raw_content)
        self.assertEqual(
            result,
            {
                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                "type": "SoftwareSourceCode",
                "name": "rb-system-with-aliases",
                "description": "execute system commands with aliases",
            },
        )

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(keys=list(NpmMapping.mapping)))
    def test_npm_adversarial(self, doc):
        raw = json.dumps(doc).encode()
        self.npm_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(json_document_strategy(keys=CODEMETA_TERMS))
    def test_codemeta_adversarial(self, doc):
        raw = json.dumps(doc).encode()
        self.codemeta_mapping.translate(raw)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(
        xml_document_strategy(
            keys=list(MavenMapping.mapping),
            root="project",
xmlns="http://maven.apache.org/POM/4.0.0", ) ) def test_maven_adversarial(self, doc): self.maven_mapping.translate(doc) @settings(suppress_health_check=[HealthCheck.too_slow]) @given( strategies.dictionaries( # keys strategies.one_of( strategies.text(), *map(strategies.just, GemspecMapping.mapping) ), # values strategies.recursive( strategies.characters(), lambda children: strategies.lists(children, min_size=1), ), ) ) def test_gemspec_adversarial(self, doc): parts = [b"Gem::Specification.new do |s|\n"] for (k, v) in doc.items(): parts.append(" s.{} = {}\n".format(k, repr(v)).encode()) parts.append(b"end\n") self.gemspec_mapping.translate(b"".join(parts)) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None rev = REVISION assert rev.directory == DIRECTORY2.id metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=DIRECTORY2.entries[0].target, indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([rev.id]) results = list( metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id]) ) expected_results = [ RevisionIntrinsicMetadataRow( id=rev.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] # then self.assertEqual(results, expected_results) def test_revision_metadata_indexer_single_root_dir(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the revision rev = REVISION assert rev.directory == DIRECTORY2.id directory = Directory( entries=( DirectoryEntry( name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384, ), ), ) assert directory.id is not None metadata_indexer.storage.directory_add([directory]) new_rev_dict = {**rev.to_dict(), "directory": directory.id} new_rev_dict.pop("id") new_rev = Revision.from_dict(new_rev_dict) metadata_indexer.storage.revision_add([new_rev]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=DIRECTORY2.entries[0].target, indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([new_rev.id]) results = list( metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id]) ) expected_results = [ RevisionIntrinsicMetadataRow( id=new_rev.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] # then self.assertEqual(results, expected_results) diff --git a/tox.ini b/tox.ini index 56898f0..3afda02 100644 --- a/tox.ini +++ b/tox.ini @@ -1,40 +1,78 @@ [tox] envlist=black,flake8,mypy,py3 [testenv] extras = testing deps = pytest-cov swh-scheduler[testing] >= 0.5.0 swh-storage[testing] >= 0.10.0 dev: pdbpp commands = pytest --doctest-modules \ !slow: --hypothesis-profile=fast \ slow: --hypothesis-profile=slow \ {envsitepackagesdir}/swh/indexer \ --cov={envsitepackagesdir}/swh/indexer \ --cov-branch {posargs} [testenv:black] skip_install = true deps = 
  black==19.10b0
commands =
  {envpython} -m black --check swh

[testenv:flake8]
skip_install = true
deps =
  flake8
commands =
  {envpython} -m flake8

[testenv:mypy]
extras =
  testing
deps =
  mypy
commands =
  mypy swh
+
+# build documentation outside swh-environment using the current
+# git HEAD of swh-docs, is executed on CI for each diff to prevent
+# breaking doc build
+[testenv:sphinx]
+whitelist_externals = make
+usedevelop = true
+extras =
+  testing
+deps =
+  # fetch and install swh-docs in develop mode
+  -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs
+
+setenv =
+  SWH_PACKAGE_DOC_TOX_BUILD = 1
+  # turn warnings into errors
+  SPHINXOPTS = -W
+commands =
+  make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs
+
+
+# build documentation only inside swh-environment using local state
+# of swh-docs package
+[testenv:sphinx-dev]
+whitelist_externals = make
+usedevelop = true
+extras =
+  testing
+deps =
+  # install swh-docs in develop mode
+  -e ../swh-docs
+
+setenv =
+  SWH_PACKAGE_DOC_TOX_BUILD = 1
+  # turn warnings into errors
+  SPHINXOPTS = -W
+commands =
+  make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs
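
The two documentation environments added above are selected by name like any other tox env. A minimal local invocation, assuming either network access to the swh-docs forge (for sphinx) or a sibling ../swh-docs checkout (for sphinx-dev), would look roughly like:

    tox -e sphinx        # CI-style build against swh-docs git HEAD
    tox -e sphinx-dev    # build against the local ../swh-docs checkout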