diff --git a/.gitignore b/.gitignore
index 1c279bb..f7a062e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,16 +1,17 @@
*.pyc
*.sw?
*~
/.coverage
/.coverage.*
.eggs/
__pycache__
*.egg-info/
build/
dist/
version.txt
/sql/createdb-stamp
/sql/filldb-stamp
.tox/
.hypothesis/
.mypy_cache/
+.vscode/
\ No newline at end of file
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index a1a7b45..d040f10 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -1,2 +1,3 @@
+Kumar Shivendu
Siddharth Ravikumar
Thibault Allançon
diff --git a/PKG-INFO b/PKG-INFO
index a1441e2..57272cb 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,75 @@
Metadata-Version: 2.1
Name: swh.indexer
-Version: 0.7.0
+Version: 0.8.0
Summary: Software Heritage Content Indexer
Home-page: https://forge.softwareheritage.org/diffusion/78/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
-Description: swh-indexer
- ============
-
- Tools to compute multiple indexes on SWH's raw contents:
- - content:
- - mimetype
- - ctags
- - language
- - fossology-license
- - metadata
- - revision:
- - metadata
-
- An indexer is in charge of:
- - looking up objects
- - extracting information from those objects
- - store those information in the swh-indexer db
-
- There are multiple indexers working on different object types:
- - content indexer: works with content sha1 hashes
- - revision indexer: works with revision sha1 hashes
- - origin indexer: works with origin identifiers
-
- Indexation procedure:
- - receive batch of ids
- - retrieve the associated data depending on object type
- - compute for that object some index
- - store the result to swh's storage
-
- Current content indexers:
-
- - mimetype (queue swh_indexer_content_mimetype): detect the encoding
- and mimetype
-
- - language (queue swh_indexer_content_language): detect the
- programming language
-
- - ctags (queue swh_indexer_content_ctags): compute tags information
-
- - fossology-license (queue swh_indexer_fossology_license): compute the
- license
-
- - metadata: translate file into translated_metadata dict
-
- Current revision indexers:
-
- - metadata: detects files containing metadata and retrieves translated_metadata
- in content_metadata table in storage or run content indexer to translate
- files.
-
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
+License-File: LICENSE
+License-File: AUTHORS
+
+swh-indexer
+============
+
+Tools to compute multiple indexes on SWH's raw contents:
+- content:
+ - mimetype
+ - ctags
+ - language
+ - fossology-license
+ - metadata
+- revision:
+ - metadata
+
+An indexer is in charge of:
+- looking up objects
+- extracting information from those objects
+- storing this information in the swh-indexer db
+
+There are multiple indexers working on different object types:
+ - content indexer: works with content sha1 hashes
+ - revision indexer: works with revision sha1 hashes
+ - origin indexer: works with origin identifiers
+
+Indexation procedure:
+- receive a batch of ids
+- retrieve the associated data depending on object type
+- compute some index for that object
+- store the result in swh's storage
+
+Current content indexers:
+
+- mimetype (queue swh_indexer_content_mimetype): detect the encoding
+ and mimetype
+
+- language (queue swh_indexer_content_language): detect the
+ programming language
+
+- ctags (queue swh_indexer_content_ctags): compute tags information
+
+- fossology-license (queue swh_indexer_fossology_license): compute the
+ license
+
+- metadata: translate a file into a translated_metadata dict
+
+Current revision indexers:
+
+- metadata: detects files containing metadata and retrieves translated_metadata
+  from the content_metadata table in storage, or runs the content indexer to
+  translate the files.
+
+
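The indexation procedure described above boils down to a fetch/compute/store loop. A minimal sketch, assuming hypothetical `retrieve`, `compute_index` and `store` callables (the real indexers in swh/indexer/indexer.py use the storage, objstorage and indexer-storage APIs instead):

```python
# Minimal sketch of the batch indexation loop described above; all three
# callables are hypothetical stand-ins, not the actual swh-indexer API.
from typing import Any, Callable, Dict, Iterable, List

def index_batch(
    ids: Iterable[bytes],
    retrieve: Callable[[bytes], Any],
    compute_index: Callable[[bytes, Any], Dict],
    store: Callable[[List[Dict]], Dict],
) -> Dict:
    results = []
    for obj_id in ids:                                # receive a batch of ids
        data = retrieve(obj_id)                       # retrieve associated data
        results.append(compute_index(obj_id, data))   # compute some index
    return store(results)                             # store in swh's storage
```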
diff --git a/requirements-swh.txt b/requirements-swh.txt
index 4f72e9b..fd8a344 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,6 +1,6 @@
-swh.core[db,http] >= 0.9.1
+swh.core[db,http] >= 0.14.0
swh.model >= 0.0.15
swh.objstorage >= 0.2.2
swh.scheduler >= 0.5.2
swh.storage >= 0.22.0
swh.journal >= 0.1.0
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index a1441e2..57272cb 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,71 +1,75 @@
Metadata-Version: 2.1
Name: swh.indexer
-Version: 0.7.0
+Version: 0.8.0
Summary: Software Heritage Content Indexer
Home-page: https://forge.softwareheritage.org/diffusion/78/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
-Description: swh-indexer
- ============
-
- Tools to compute multiple indexes on SWH's raw contents:
- - content:
- - mimetype
- - ctags
- - language
- - fossology-license
- - metadata
- - revision:
- - metadata
-
- An indexer is in charge of:
- - looking up objects
- - extracting information from those objects
- - store those information in the swh-indexer db
-
- There are multiple indexers working on different object types:
- - content indexer: works with content sha1 hashes
- - revision indexer: works with revision sha1 hashes
- - origin indexer: works with origin identifiers
-
- Indexation procedure:
- - receive batch of ids
- - retrieve the associated data depending on object type
- - compute for that object some index
- - store the result to swh's storage
-
- Current content indexers:
-
- - mimetype (queue swh_indexer_content_mimetype): detect the encoding
- and mimetype
-
- - language (queue swh_indexer_content_language): detect the
- programming language
-
- - ctags (queue swh_indexer_content_ctags): compute tags information
-
- - fossology-license (queue swh_indexer_fossology_license): compute the
- license
-
- - metadata: translate file into translated_metadata dict
-
- Current revision indexers:
-
- - metadata: detects files containing metadata and retrieves translated_metadata
- in content_metadata table in storage or run content indexer to translate
- files.
-
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
+License-File: LICENSE
+License-File: AUTHORS
+
+swh-indexer
+============
+
+Tools to compute multiple indexes on SWH's raw contents:
+- content:
+ - mimetype
+ - ctags
+ - language
+ - fossology-license
+ - metadata
+- revision:
+ - metadata
+
+An indexer is in charge of:
+- looking up objects
+- extracting information from those objects
+- storing this information in the swh-indexer db
+
+There are multiple indexers working on different object types:
+ - content indexer: works with content sha1 hashes
+ - revision indexer: works with revision sha1 hashes
+ - origin indexer: works with origin identifiers
+
+Indexation procedure:
+- receive a batch of ids
+- retrieve the associated data depending on object type
+- compute some index for that object
+- store the result in swh's storage
+
+Current content indexers:
+
+- mimetype (queue swh_indexer_content_mimetype): detect the encoding
+ and mimetype
+
+- language (queue swh_indexer_content_language): detect the
+ programming language
+
+- ctags (queue swh_indexer_content_ctags): compute tags information
+
+- fossology-license (queue swh_indexer_fossology_license): compute the
+ license
+
+- metadata: translate a file into a translated_metadata dict
+
+Current revision indexers:
+
+- metadata: detects files containing metadata and retrieves translated_metadata
+  from the content_metadata table in storage, or runs the content indexer to
+  translate the files.
+
+
diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt
index 9f4e312..55e837c 100644
--- a/swh.indexer.egg-info/SOURCES.txt
+++ b/swh.indexer.egg-info/SOURCES.txt
@@ -1,142 +1,143 @@
.gitignore
.pre-commit-config.yaml
AUTHORS
CODE_OF_CONDUCT.md
CONTRIBUTORS
LICENSE
MANIFEST.in
Makefile
Makefile.local
README.md
codemeta.json
conftest.py
mypy.ini
pyproject.toml
pytest.ini
requirements-swh.txt
requirements-test.txt
requirements.txt
setup.cfg
setup.py
tox.ini
docs/.gitignore
docs/Makefile
docs/Makefile.local
docs/README.md
docs/cli.rst
docs/conf.py
docs/dev-info.rst
docs/index.rst
docs/metadata-workflow.rst
docs/_static/.placeholder
docs/_templates/.placeholder
docs/images/.gitignore
docs/images/Makefile
docs/images/tasks-metadata-indexers.uml
sql/bin/db-upgrade
sql/bin/dot_add_content
sql/doc/json
sql/doc/json/.gitignore
sql/doc/json/Makefile
sql/doc/json/indexer_configuration.tool_configuration.schema.json
sql/doc/json/revision_metadata.translated_metadata.json
sql/json/.gitignore
sql/json/Makefile
sql/json/indexer_configuration.tool_configuration.schema.json
sql/json/revision_metadata.translated_metadata.json
sql/upgrades/115.sql
sql/upgrades/116.sql
sql/upgrades/117.sql
sql/upgrades/118.sql
sql/upgrades/119.sql
sql/upgrades/120.sql
sql/upgrades/121.sql
sql/upgrades/122.sql
sql/upgrades/123.sql
sql/upgrades/124.sql
sql/upgrades/125.sql
sql/upgrades/126.sql
sql/upgrades/127.sql
sql/upgrades/128.sql
sql/upgrades/129.sql
sql/upgrades/130.sql
sql/upgrades/131.sql
sql/upgrades/132.sql
sql/upgrades/133.sql
swh/__init__.py
swh.indexer.egg-info/PKG-INFO
swh.indexer.egg-info/SOURCES.txt
swh.indexer.egg-info/dependency_links.txt
swh.indexer.egg-info/entry_points.txt
swh.indexer.egg-info/requires.txt
swh.indexer.egg-info/top_level.txt
swh/indexer/__init__.py
swh/indexer/cli.py
swh/indexer/codemeta.py
swh/indexer/ctags.py
swh/indexer/fossology_license.py
swh/indexer/indexer.py
swh/indexer/journal_client.py
swh/indexer/metadata.py
swh/indexer/metadata_detector.py
swh/indexer/mimetype.py
swh/indexer/origin_head.py
swh/indexer/py.typed
swh/indexer/rehash.py
swh/indexer/tasks.py
swh/indexer/data/codemeta/CITATION
swh/indexer/data/codemeta/LICENSE
swh/indexer/data/codemeta/codemeta.jsonld
swh/indexer/data/codemeta/crosswalk.csv
swh/indexer/metadata_dictionary/__init__.py
swh/indexer/metadata_dictionary/base.py
+swh/indexer/metadata_dictionary/cff.py
swh/indexer/metadata_dictionary/codemeta.py
swh/indexer/metadata_dictionary/maven.py
swh/indexer/metadata_dictionary/npm.py
swh/indexer/metadata_dictionary/python.py
swh/indexer/metadata_dictionary/ruby.py
swh/indexer/sql/10-superuser-init.sql
swh/indexer/sql/20-enums.sql
swh/indexer/sql/30-schema.sql
swh/indexer/sql/50-data.sql
swh/indexer/sql/50-func.sql
swh/indexer/sql/60-indexes.sql
swh/indexer/storage/__init__.py
swh/indexer/storage/converters.py
swh/indexer/storage/db.py
swh/indexer/storage/exc.py
swh/indexer/storage/in_memory.py
swh/indexer/storage/interface.py
swh/indexer/storage/metrics.py
swh/indexer/storage/model.py
swh/indexer/storage/writer.py
swh/indexer/storage/api/__init__.py
swh/indexer/storage/api/client.py
swh/indexer/storage/api/serializers.py
swh/indexer/storage/api/server.py
swh/indexer/tests/__init__.py
swh/indexer/tests/conftest.py
swh/indexer/tests/tasks.py
swh/indexer/tests/test_cli.py
swh/indexer/tests/test_codemeta.py
swh/indexer/tests/test_ctags.py
swh/indexer/tests/test_fossology_license.py
swh/indexer/tests/test_indexer.py
swh/indexer/tests/test_journal_client.py
swh/indexer/tests/test_metadata.py
swh/indexer/tests/test_mimetype.py
swh/indexer/tests/test_origin_head.py
swh/indexer/tests/test_origin_metadata.py
swh/indexer/tests/test_tasks.py
swh/indexer/tests/utils.py
swh/indexer/tests/storage/__init__.py
swh/indexer/tests/storage/conftest.py
swh/indexer/tests/storage/generate_data_test.py
swh/indexer/tests/storage/test_api_client.py
swh/indexer/tests/storage/test_converters.py
swh/indexer/tests/storage/test_in_memory.py
swh/indexer/tests/storage/test_init.py
swh/indexer/tests/storage/test_metrics.py
swh/indexer/tests/storage/test_model.py
swh/indexer/tests/storage/test_server.py
swh/indexer/tests/storage/test_storage.py
\ No newline at end of file
diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt
index 2f9c7ae..48a5c82 100644
--- a/swh.indexer.egg-info/requires.txt
+++ b/swh.indexer.egg-info/requires.txt
@@ -1,19 +1,19 @@
click
python-magic>=0.4.13
pyld
xmltodict
typing-extensions
-swh.core[db,http]>=0.9.1
+swh.core[db,http]>=0.14.0
swh.model>=0.0.15
swh.objstorage>=0.2.2
swh.scheduler>=0.5.2
swh.storage>=0.22.0
swh.journal>=0.1.0
[testing]
confluent-kafka
pytest
pytest-mock
hypothesis>=3.11.0
swh.scheduler[testing]>=0.5.0
swh.storage[testing]>=0.10.0
diff --git a/swh/indexer/data/codemeta/CITATION b/swh/indexer/data/codemeta/CITATION
index 92290ce..87ea0e9 100644
--- a/swh/indexer/data/codemeta/CITATION
+++ b/swh/indexer/data/codemeta/CITATION
@@ -1,2 +1,2 @@
Matthew B. Jones, Carl Boettiger, Abby Cabunoc Mayes, Arfon Smith, Peter Slaughter, Kyle Niemeyer, Yolanda Gil, Martin Fenner, Krzysztof Nowak, Mark Hahnel, Luke Coy, Alice Allen, Mercè Crosas, Ashley Sands, Neil Chue Hong, Patricia Cruse, Daniel S. Katz, Carole Goble. 2017. CodeMeta: an exchange schema for software metadata. Version 2.0. KNB Data Repository. doi:10.5063/schema/codemeta-2.0
-swh:1:dir:21824b406ede4d40ac37c693ca9aebd02c85229e;origin=https://github.com/codemeta/codemeta
+swh:1:dir:f39a0ef0005ad0dee50dcd546231ed568cf8705d;origin=https://github.com/codemeta/codemeta
diff --git a/swh/indexer/data/codemeta/codemeta.jsonld b/swh/indexer/data/codemeta/codemeta.jsonld
index 5e19122..5fc4022 100644
--- a/swh/indexer/data/codemeta/codemeta.jsonld
+++ b/swh/indexer/data/codemeta/codemeta.jsonld
@@ -1,80 +1,79 @@
{
"@context": {
"type": "@type",
"id": "@id",
"schema":"http://schema.org/",
"codemeta": "https://codemeta.github.io/terms/",
"Organization": {"@id": "schema:Organization"},
"Person": {"@id": "schema:Person"},
"SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"},
"SoftwareApplication": {"@id": "schema:SoftwareApplication"},
"Text": {"@id": "schema:Text"},
"URL": {"@id": "schema:URL"},
"address": { "@id": "schema:address"},
"affiliation": { "@id": "schema:affiliation"},
"applicationCategory": { "@id": "schema:applicationCategory", "@type": "@id"},
"applicationSubCategory": { "@id": "schema:applicationSubCategory", "@type": "@id"},
"citation": { "@id": "schema:citation"},
"codeRepository": { "@id": "schema:codeRepository", "@type": "@id"},
"contributor": { "@id": "schema:contributor"},
"copyrightHolder": { "@id": "schema:copyrightHolder"},
"copyrightYear": { "@id": "schema:copyrightYear"},
- "creator": { "@id": "schema:creator"},
"dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date" },
"dateModified": {"@id": "schema:dateModified", "@type": "schema:Date" },
"datePublished": {"@id": "schema:datePublished", "@type": "schema:Date" },
"description": { "@id": "schema:description"},
"downloadUrl": { "@id": "schema:downloadUrl", "@type": "@id"},
"email": { "@id": "schema:email"},
"editor": { "@id": "schema:editor"},
"encoding": { "@id": "schema:encoding"},
"familyName": { "@id": "schema:familyName"},
"fileFormat": { "@id": "schema:fileFormat", "@type": "@id"},
"fileSize": { "@id": "schema:fileSize"},
"funder": { "@id": "schema:funder"},
"givenName": { "@id": "schema:givenName"},
"hasPart": { "@id": "schema:hasPart" },
"identifier": { "@id": "schema:identifier", "@type": "@id"},
"installUrl": { "@id": "schema:installUrl", "@type": "@id"},
"isAccessibleForFree": { "@id": "schema:isAccessibleForFree"},
"isPartOf": { "@id": "schema:isPartOf"},
"keywords": { "@id": "schema:keywords"},
"license": { "@id": "schema:license", "@type": "@id"},
"memoryRequirements": { "@id": "schema:memoryRequirements", "@type": "@id"},
"name": { "@id": "schema:name"},
"operatingSystem": { "@id": "schema:operatingSystem"},
"permissions": { "@id": "schema:permissions"},
"position": { "@id": "schema:position"},
"processorRequirements": { "@id": "schema:processorRequirements"},
"producer": { "@id": "schema:producer"},
"programmingLanguage": { "@id": "schema:programmingLanguage"},
"provider": { "@id": "schema:provider"},
"publisher": { "@id": "schema:publisher"},
"relatedLink": { "@id": "schema:relatedLink", "@type": "@id"},
"releaseNotes": { "@id": "schema:releaseNotes", "@type": "@id"},
"runtimePlatform": { "@id": "schema:runtimePlatform"},
"sameAs": { "@id": "schema:sameAs", "@type": "@id"},
"softwareHelp": { "@id": "schema:softwareHelp"},
"softwareRequirements": { "@id": "schema:softwareRequirements", "@type": "@id"},
"softwareVersion": { "@id": "schema:softwareVersion"},
"sponsor": { "@id": "schema:sponsor"},
"storageRequirements": { "@id": "schema:storageRequirements", "@type": "@id"},
"supportingData": { "@id": "schema:supportingData"},
"targetProduct": { "@id": "schema:targetProduct"},
"url": { "@id": "schema:url", "@type": "@id"},
"version": { "@id": "schema:version"},
"author": { "@id": "schema:author", "@container": "@list" },
"softwareSuggestions": { "@id": "codemeta:softwareSuggestions", "@type": "@id"},
"contIntegration": { "@id": "codemeta:contIntegration", "@type": "@id"},
"buildInstructions": { "@id": "codemeta:buildInstructions", "@type": "@id"},
"developmentStatus": { "@id": "codemeta:developmentStatus", "@type": "@id"},
"embargoDate": { "@id":"codemeta:embargoDate", "@type": "schema:Date" },
"funding": { "@id": "codemeta:funding" },
"readme": { "@id":"codemeta:readme", "@type": "@id" },
"issueTracker": { "@id":"codemeta:issueTracker", "@type": "@id" },
"referencePublication": { "@id": "codemeta:referencePublication", "@type": "@id"},
"maintainer": { "@id": "codemeta:maintainer" }
}
}
diff --git a/swh/indexer/data/codemeta/crosswalk.csv b/swh/indexer/data/codemeta/crosswalk.csv
index d5fb64d..fb0a688 100644
--- a/swh/indexer/data/codemeta/crosswalk.csv
+++ b/swh/indexer/data/codemeta/crosswalk.csv
@@ -1,77 +1,76 @@
Parent Type,Property,Type,Description,codemeta-V1,DataCite,OntoSoft,Zenodo,GitHub,Figshare,Software Ontology,Software Discovery Index,Dublin Core,R Package Description,Debian Package,Python Distutils (PyPI),Python PKG-INFO,Trove Software Map,Perl Module Description (CPAN::Meta),NodeJS,Java (Maven),Octave,Ruby Gem,ASCL,DOAP,Wikidata,Citation File Format Core (CFF-Core) 1.0.2
-schema:SoftwareSourceCode,codeRepository,URL,"Link to the repository where the un-compiled, human readable code and related code is located (SVN, github, CodePlex).",codeRepository,,,relatedLink,html_url,relatedLink,,,,URL,HomePage,,,,resources.repository,repository,repositories,,homepage,site_list,repository,source code repository,repository-code
-schema:SoftwareSourceCode,programmingLanguage,ComputerLanguage or Text,The computer programming language.,programmingLanguage,Format,hasProgrammingLanguage,,languages_url,,programming language,,,,,classifiers['Programming Language'],,Programming Language,,,,,,,programming-language,programming language,
-schema:SoftwareSourceCode,runtimePlatform,Text,"Runtime platform or script interpreter dependencies (Example - Java v1, Python2.3, .Net Framework 3.0). Supersedes runtime.",,,,,,,,,,,,,,,,,,,platform,,platform,,
+schema:SoftwareSourceCode,codeRepository,URL,"Link to the repository where the un-compiled, human readable code and related code is located (SVN, github, CodePlex).",codeRepository,,,relatedLink,html_url,relatedLink,,,,URL,HomePage,,,,resources.repository,repository,scm / repositories,,homepage,site_list,repository,P1324,repository-code
+schema:SoftwareSourceCode,programmingLanguage,ComputerLanguage or Text,The computer programming language.,programmingLanguage,Format,hasProgrammingLanguage,,languages_url,,programming language,,,,,classifiers['Programming Language'],,Programming Language,,,,,,,programming-language,P277,
+schema:SoftwareSourceCode,runtimePlatform,Text,"Runtime platform or script interpreter dependencies (Example - Java v1, Python2.3, .Net Framework 3.0). Supersedes runtime.",,,,,,,,,,,,,,,,engines,,,platform,,platform,P400,
schema:SoftwareSourceCode,targetProduct,SoftwareApplication,"Target Operating System / Product to which the code applies. If applies to several versions, just the product name can be used.",,,,,,,,,,,,,,,,,,,,,,,
schema:SoftwareApplication,applicationCategory,Text or URL,"Type of software application, e.g. 'Game, Multimedia'.",,,hasSoftwareCategory,communities,,categories,,,,,,classifiers['Topic'],,Topic,Categories,,,Categories,,,,,
schema:SoftwareApplication,applicationSubCategory,Text or URL,"Subcategory of the application, e.g. 'Arcade Game'.",,,,,,,,,,,,,,,,,,,,,,,
-schema:SoftwareApplication,downloadUrl,URL,"If the file can be downloaded, URL to download the binary.",downloadLink,,,,archive_url,,,,,,,download_url,Download-URL,,,,,,,,download-page,,repository-artifact
-schema:SoftwareApplication,fileSize,Text,"Size of the application / package (e.g. 18MB). In the absence of a unit (MB, KB etc.), KB will be assumed.",,,,,,,,,,,,,,,,,,,,,,,
+schema:SoftwareApplication,downloadUrl,URL,"If the file can be downloaded, URL to download the binary.",downloadLink,,,,archive_url,,,,,,,download_url,Download-URL,,,,,,,,download-page,P4945,repository-artifact
+schema:SoftwareApplication,fileSize,Text,"Size of the application / package (e.g. 18MB). In the absence of a unit (MB, KB etc.), KB will be assumed.",,size,,,,,,,,,,,,,,,,,,,,P3575,
schema:SoftwareApplication,installUrl,URL,"URL at which the app may be installed, if different from the URL of the item.",,,,,,,,,,,,,,,,,,,,,download-mirror,,
schema:SoftwareApplication,memoryRequirements,Text or URL,Minimum memory requirements.,,,,,,,,,,,,,,,,,,,,,,,
-schema:SoftwareApplication,operatingSystem,Text,"Operating systems supported (Windows 7, OSX 10.6, Android 1.6).",operatingSystems,,SupportsOperatingSystem,,,,,,,,,classifiers['Operating System'],,Operating System,OSNAMES,os,,,,,os,operating system,
+schema:SoftwareApplication,operatingSystem,Text,"Operating systems supported (Windows 7, OSX 10.6, Android 1.6).",operatingSystems,,SupportsOperatingSystem,,,,,,,,,classifiers['Operating System'],,Operating System,OSNAMES,os,,,,,os,P306,
schema:SoftwareApplication,permissions,Text,"Permission(s) required to run the app (for example, a mobile app may require full internet access or may run only on wifi).",,,,,,,,,,,,,,,,,,,,,,,
-schema:SoftwareApplication,processorRequirements,Text,Processor architecture required to run the application (e.g. IA64).,,,,,,,,,,,,,,,,cpu / engines,,,,,,,
+schema:SoftwareApplication,processorRequirements,Text,Processor architecture required to run the application (e.g. IA64).,,,,,,,,,,,,,,,,cpu,,,,,,,
schema:SoftwareApplication,releaseNotes,Text or URL,Description of what changed in this version.,,,,,,,,,,,,,,,,,,,,,,,
schema:SoftwareApplication,softwareHelp,CreativeWork,Software application help.,,,,,,,,,,,,,,,,,,,,,,,
-schema:SoftwareApplication,softwareRequirements,SoftwareSourceCode,Required software dependencies,depends,,hasDependency->Software,,,,,"""Platform, environment, and dependencies""",,"Depends, SystemRequirements",,install_requires,Requires,Database Environment,prereqs,dependencies / bundledDependencies / bundleDependencies / peerDependencies,prerequisites,"Depends, SystemRequirements","requirements, add_runtime_dependency",,,depends on software,
-schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,software version,
+schema:SoftwareApplication,softwareRequirements,SoftwareSourceCode,Required software dependencies,depends,,hasDependency->Software,,,,,"""Platform, environment, and dependencies""",,"Depends, SystemRequirements",,install_requires,Requires,Database Environment,prereqs,dependencies / bundledDependencies / bundleDependencies / peerDependencies,prerequisites,"Depends, SystemRequirements","requirements, add_runtime_dependency",,,P1547,
+schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,P348,
schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,,,
schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author/authors,,developer,,authors
-schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,,,,,,,,,[ctb] in Author,,,,,,contributor,,,,,developer,,
+schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,creator,[aut] in Author,,author,Author,,,author,,,author/authors,,developer,P50,authors
+schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,,P2860,
+schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,contributor,,,,,,,,[ctb] in Author,,,,,,contributors,,,,,developer,P767,
schema:CreativeWork,copyrightHolder,Organization or Person,The party holding the legal copyright to the CreativeWork.,agents [role=copyrightHolder],,,,,,,,,,,,,,,,,,,,,,
schema:CreativeWork,copyrightYear,Number,The year during which the claimed copyright for the CreativeWork was first asserted.,,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,creator,Organization or Person,The creator/author of this CreativeWork. This is the same as the Author property for CreativeWork.,agent,,,,,,,,creator,[cre] in Author,,,,,,author,,,,,,,
-schema:CreativeWork,dateCreated,Date or DateTime,The date on which the CreativeWork was created or the item was added to a DataFeed.,dateCreated,date,,,created_at,,,,created,,Date,,,,,,,,,,,,
-schema:CreativeWork,dateModified,Date or DateTime,The date on which the CreativeWork was most recently modified or when the item's entry was modified within a DataFeed.,dateModified,date,,,updated_at,,,,,,,,,last-updated,,,,,,,,,
-schema:CreativeWork,datePublished,Date,Date of first broadcast/publication.,datePublished,publicationYear,,date_published,,date_retrieved,,,date,Date,,,,,,,,Date,,,,publication date,date-released
-schema:CreativeWork,editor,Person,Specifies the Person who edited the CreativeWork.,,,,,,,,,,,,,,,,,,,,,,editor,
+schema:CreativeWork,dateCreated,Date or DateTime,The date on which the CreativeWork was created or the item was added to a DataFeed.,dateCreated,date,,,created_at,,,,created,,Date,,,,,,,,,,,P571,
+schema:CreativeWork,dateModified,Date or DateTime,The date on which the CreativeWork was most recently modified or when the item's entry was modified within a DataFeed.,dateModified,date,,,updated_at,,,,,,,,,last-updated,,,,,,,,P5017,
+schema:CreativeWork,datePublished,Date,Date of first broadcast/publication.,datePublished,publicationYear,,date_published,,date_retrieved,,,date,Date,,,,,,,,Date,,,,P577,date-released
+schema:CreativeWork,editor,Person,Specifies the Person who edited the CreativeWork.,,,,,,,,,,,,,,,,,,,,,,P98,
schema:CreativeWork,encoding,MediaObject,A media object that encodes this CreativeWork. This property is a synonym for associatedMedia. Supersedes encodings.,,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,fileFormat,Text or URL,"Media type, typically MIME format (see IANA site) of the content e.g. application/zip of a SoftwareApplication binary. In cases where a CreativeWork has several media type representations, 'encoding' can be used to indicate each MediaObject alongside particular fileFormat information. Unregistered or niche file formats can be indicated instead via the most appropriate URL, e.g. defining Web page or a Wikipedia entry.",,Format,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,funder,Organization or Person,A person or organization that supports (sponsors) something through some kind of financial contribution.,fundingReference.funderName,,,contributors.Funder,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,keywords,Text,Keywords or tags used to describe this content. Multiple entries in a keywords list are typically delimited by commas.,controlledTerms,subject,hasDomainKeywords,keywords,,tags,,,,,,keywords,Keywords,,keywords,keywords,,,,,category,,keywords
-schema:CreativeWork,license,CreativeWork or URL,"A license document that applies to this content, typically indicated by URL.",licenseId,rights,License,license,license,License,software license,Software license,license,License,,license,License,license,license,license,license,License,license/licenses,,license,license,license/license-url
-schema:CreativeWork,producer,Organization or Person,"The person or organization who produced the work (e.g. music album, movie, tv/radio series etc.).",,,,,,,,,,,,,,,,,,,,,,,
+schema:CreativeWork,fileFormat,Text or URL,"Media type, typically MIME format (see IANA site) of the content e.g. application/zip of a SoftwareApplication binary. In cases where a CreativeWork has several media type representations, 'encoding' can be used to indicate each MediaObject alongside particular fileFormat information. Unregistered or niche file formats can be indicated instead via the most appropriate URL, e.g. defining Web page or a Wikipedia entry.",,Format,,,,,,,,,,,,,,,,,,,,P2701,
+schema:CreativeWork,funder,Organization or Person,A person or organization that supports (sponsors) something through some kind of financial contribution.,fundingReference.funderName,funderName,,contributors.Funder,,,,,,,,,,,,,,,,,,P859,
+schema:CreativeWork,keywords,Text,Keywords or tags used to describe this content. Multiple entries in a keywords list are typically delimited by commas.,controlledTerms,subject,hasDomainKeywords,keywords,,tags,,,,,,keywords,Keywords,,keywords,keywords,,,,,category,P921,keywords
+schema:CreativeWork,license,CreativeWork or URL,"A license document that applies to this content, typically indicated by URL.",licenseId,rights,License,license,license,License,software license,Software license,license,License,,license,License,license,license,license,license,License,license/licenses,,license,P275,license/license-url
+schema:CreativeWork,producer,Organization or Person,"The person or organization who produced the work (e.g. music album, movie, tv/radio series etc.).",,,,,,,,,,,,,,,,,,,,,,P162,
schema:CreativeWork,provider,Organization or Person,"The service provider, service operator, or service performer; the goods producer. Another party (a seller) may offer those services or goods on behalf of the provider. A provider may also serve as the seller. Supersedes carrier.",,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,publisher,Organization or Person,The publisher of the creative work.,publisher,publisher,os:hasPublisher,,,,software publisher organization,,publisher,,,,,,,,,,,,vendor,,
+schema:CreativeWork,publisher,Organization or Person,The publisher of the creative work.,publisher,publisher,os:hasPublisher,,,,software publisher organization,,publisher,,,,,,,,,,,,vendor,P123,
schema:CreativeWork,sponsor,Organization or Person,"A person or organization that supports a thing through a pledge, promise, or financial contribution. e.g. a sponsor of a Medical Study or a corporate sponsor of an event.",,,,,,,,,,,,,,,,,,,,,,,
schema:CreativeWork,version,Number or Text,The version of the CreativeWork embodied by a specified resource.,version,version,hasSoftwareVersion,,,,Version,Software version,dcterms:hasVersion,,numeric_version,Version,Version,version,,version,version,version,version,,,,version
schema:CreativeWork,isAccessibleForFree,Boolean,A flag to signal that the publication is accessible for free.,,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,isPartOf,CreativeWork,Indicates a CreativeWork that this CreativeWork is (in some sense) part of. Reverse property hasPart,,,,,,,,,,,,,,,,,,,,,,,references
-schema:CreativeWork,hasPart,CreativeWork,Indicates a CreativeWork that is (in some sense) a part of this CreativeWork. Reverse property isPartOf,,,,,,,,,,,,,,,,,,,,,,,
+schema:CreativeWork,isPartOf,CreativeWork,Indicates a CreativeWork that this CreativeWork is (in some sense) part of. Reverse property hasPart,,,,,,,,,,,,,,,,,,,,,,P361,references
+schema:CreativeWork,hasPart,CreativeWork,Indicates a CreativeWork that is (in some sense) a part of this CreativeWork. Reverse property isPartOf,,,,,,,,,,,,,,,,,,,,,,P527,
schema:CreativeWork,position,Integer or Text,"The position of an item in a series or sequence of items. (While schema.org considers this a property of CreativeWork, it is also the way to indicate ordering in any list (e.g. the Authors list). By default arrays are unordered in JSON-LD",,,,,,,,,,,,,,,,,,,,,,,
schema:Thing,description,Text,A description of the item.,description,description,hasShortDescription,description/notes,description,Description,software,,description,Description,Description,"description, long_description",Summary / Description,description,"abstract, description",description,description,Description,"summary, description",abstract,,,abstract
schema:Thing,identifier,PropertyValue or URL,"The identifier property represents any kind of identifier for any kind of Thing, such as ISBNs, GTIN codes, UUIDs etc. Schema.org provides dedicated properties for representing many of these, either as textual strings or as URL (URI) links. See background notes for more details.",identifier,identifier,hasUniqueId,id,id,,,Persistent Identifier,identifier,Package,Package,,,,,name,groupId,,,ascl_id,,,doi
-schema:Thing,name,Text,"The name of the item (software, Organization)",name,,hasName,title,full_name,Title,SoftwareTitle,Software title,title,Title,,name,Name,Title,name,name,name,name,name,title,,,title
-schema:Thing,sameAs,URL,"URL of a reference Web page that unambiguously indicates the item's identity. E.g. the URL of the item's Wikipedia page, Wikidata entry, or official website.",,,,,,,,,,,,,,,,,,,,,,,
-schema:Thing,url,URL,URL of the item.,URL,,,,,,,,,URL,,url,Home-Page,,,homepage,,URL,,,homepage,official website,url
-schema:Thing,relatedLink,URL,"A link related to this object, e.g. related web pages",,RelateIdentifier,,,,,,,,,,,,,,,,,,,,,
-schema:Person,givenName,Text,"Given name. In the U.S., the first name of a Person. This can be used along with familyName instead of the name property",,givenName,,,,,,,,givenName,,,,,,,,,,,,,person.given-names
-schema:Person,familyName,Text,"Family name. In the U.S., the last name of an Person. This can be used along with givenName instead of the name property.",,familyName,,,,,,,,familyName,,,,,,,,,,,,,person.name-particle + person.family-names + person.name-suffix
-schema:Person,email,Text,Email address,email,,,,,,,,,email,,author_email,Author-email,,email-address,author.email,,,email,email,,,person.email/entity.email
+schema:Thing,name,Text,"The name of the item (software, Organization)",name,title ,hasName,title,full_name,Title,SoftwareTitle,Software title,title,Title,,name,Name,Title,name,name,name,name,name,title,,,title
+schema:Thing,sameAs,URL,"URL of a reference Web page that unambiguously indicates the item's identity. E.g. the URL of the item's Wikipedia page, Wikidata entry, or official website.",,,,,,,,,,,,,,,,,,,,,,P2888,
+schema:Thing,url,URL,URL of the item.,URL,,,,,,,,,URL,,url,Home-Page,,,homepage,,URL,,,homepage,P856,url
+schema:Thing,relatedLink,URL,"A link related to this object, e.g. related web pages",,RelatedIdentifier,,,,,,,,,,,,,,,,,,,,,
+schema:Person,givenName,Text,"Given name. In the U.S., the first name of a Person. This can be used along with familyName instead of the name property",,givenName,,,,,,,,givenName,,,,,,,,,,,,P735,person.given-names
+schema:Person,familyName,Text,"Family name. In the U.S., the last name of an Person. This can be used along with givenName instead of the name property.",,familyName,,,,,,,,familyName,,,,,,,,,,,,P734,person.name-particle + person.family-names + person.name-suffix
+schema:Person,email,Text,Email address,email,,,,,,,,,email,,author_email,Author-email,,email-address,author.email,,,email,email,,P968,person.email/entity.email
schema:Person,affiliation,Text,"An organization that this person is affiliated with. For example, a school/university",affiliation,affiliation,,affiliation,,,,,,,,,,,,,,,,,,,person.affiliation
schema:Person,identifier,URL,"URL identifer, ideally an ORCID ID for individuals, a FundRef ID for funders",identifier,nameIdentifier,,ORCID,,ORCID,,,,,,,,,,,,,,,,,person.orcid / entity.orcid
-schema:Person,name,Text,"The name of an Organization, or if separate given and family names cannot be resolved for a Person",,,,name,,name,,,,,,,,,author:contact-name,author.name,,,,,,,entity.name
+schema:Person,name,Text,"The name of an Organization, or if separate given and family names cannot be resolved for a Person",,creatorName ,,name,,name,,,,,,,,,author:contact-name,author.name,,,,,,,entity.name
schema:Person,address,PostalAddress or Text,Physical address of the item.,,,,,,,,,,,,,,,,,,,,,,,person.address + person.city + person.region + person.post-code + person.country / entity.address + entity.city + entity.region + entity.post-code + entity.country
schema,type,Object Type (from context or URI),"The object type (e.g. ""Person"", ""Organization"", ""ScientificArticle"", ""SoftwareApplication"", etc).",,,,,,,,,,,,,,,,,,,,,,,reference.type
schema,id,URL,Primary identifier for an object. Must be a resolvable URL or a string used to refer to this node elsewhere in the same document,,,,,,,,,,,,,,,,,,,,,,,
codemeta:SoftwareSourceCode,softwareSuggestions,SoftwareSourceCode,"Optional dependencies , e.g. for optional features, code development, etc",suggests,,,,,,,,,Suggests,,,,,,devDependencies / optionalDependencies,,BuildDepends,add_development_dependency,,,,
codemeta:SoftwareSourceCode,maintainer,Person,Individual responsible for maintaining the software (usually includes an email contact address),uploadedBy,,,,,,,,,Maintainer,,maintainer / maintainer_email,,,,,,,,,maintainer,,
codemeta:SoftwareSourceCode,contIntegration,URL,link to continuous integration service,contIntegration,,,,,,,,,,,,,,,,ciManagement,,,,,,
codemeta:SoftwareSourceCode,buildInstructions,URL,link to installation instructions/documentation,buildInstructions,,,,,,,,,,,,,,,,,,,,,,
codemeta:SoftwareSourceCode,developmentStatus,Text,"Description of development status, e.g. Active, inactive, supsended. See repostatus.org",developmentStatus,,activeDevelopment,,,,,,,,,classifiers['Development Status'],,Development Status,release_status,,,,,,,,
codemeta:SoftwareSourceCode,embargoDate,Date,"Software may be embargoed from public access until a specified date (e.g. pending publication, 1 year from publication)",embargoDate,,,,,embargo_date,,,,,,,,,,,,,,,,,
-codemeta:SoftwareSourceCode,funding,Text,Funding source (e.g. specific grant),funding,,fundingReference.awardTitle or fundingReference.awardNumber,,,,,,,,,,,,,,,,,,,,
-codemeta:SoftwareSourceCode,issueTracker,URL,link to software bug reporting or issue tracking system,issueTracker,,,,issues_url,,,,,BugReports,,,,,resources.bugtracker,bugs,issuesManagement,Problems,,,bug-database,bug tracking system,repository
+codemeta:SoftwareSourceCode,funding,Text,Funding source (e.g. specific grant),funding,awardNumber ,fundingReference.awardTitle or fundingReference.awardNumber,,,,,,,,,,,,,,,,,,,,
+codemeta:SoftwareSourceCode,issueTracker,URL,link to software bug reporting or issue tracking system,issueTracker,,,,issues_url,,,,,BugReports,,,,,resources.bugtracker,bugs,issueManagement,Problems,,,bug-database,bug tracking system,repository
codemeta:SoftwareSourceCode,referencePublication,ScholarlyArticle,An academic publication related to the software.,relatedPublications,,,,,,,,,,,,,,,,,,,,blog,,references
codemeta:SoftwareSourceCode,readme,URL,link to software Readme file,readme,,,,,,,,,,,,,,,,,,,,,,
,,,,relatedIdentifer,,,,,,,,,,,,,,,,,,,,,,
,,,,relatedIdentiferType,,,,,,,,,,,,,,,,,,,,,,
,,,,relationshipType,,,,,,,,,,,,,,,,,,,,,,
,,,,title,,,,,,,,,,,,,,,,,,,,,,
,,,,namespace,,,,,,,,,,,,,,,,,,,,,,
,,,,role,,,,,,,,,,,,,,,,,,,,,,
,,,,roleCode,,,,,,,,,,,,,,,,,,,,,,
,,,,softwarePaperCitationIdenifiers,,,,,,,,,,,,,,,,,,,,,,
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index e81466c..8b207a6 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,378 +1,379 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from copy import deepcopy
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Tuple,
TypeVar,
)
from swh.core.config import merge_configs
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
from swh.indexer.storage.model import (
ContentMetadataRow,
OriginIntrinsicMetadataRow,
RevisionIntrinsicMetadataRow,
)
from swh.model import hashutil
from swh.model.model import Revision, Sha1Git
REVISION_GET_BATCH_SIZE = 10
ORIGIN_GET_BATCH_SIZE = 10
T1 = TypeVar("T1")
T2 = TypeVar("T2")
def call_with_batches(
f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int,
) -> Iterator[T2]:
"""Calls a function with batches of args, and concatenates the results.
"""
groups = grouper(args, batch_size)
for group in groups:
yield from f(list(group))
class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing metadata by given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- store result in content_metadata table
"""
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing(
({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids)
)
def index(
self,
id: Sha1,
data: Optional[bytes] = None,
log_suffix="unknown revision",
**kwargs,
) -> List[ContentMetadataRow]:
"""Index sha1s' content and store result.
Args:
- id (bytes): content's identifier
- data (bytes): raw content in bytes
+ id: content's identifier
+ data: raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful the metadata keys will
be returned as None
"""
assert isinstance(id, bytes)
assert data is not None
+ metadata = None
try:
mapping_name = self.tool["tool_configuration"]["context"]
log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id)
metadata = MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
"for content %s" % hashutil.hash_to_hex(id)
)
if metadata is None:
return []
return [
ContentMetadataRow(
id=id, indexer_configuration_id=self.tool["id"], metadata=metadata,
)
]
def persist_index_computations(
self, results: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- metadata (jsonb): detected metadata
"""
return self.idx_storage.content_metadata_add(results)
DEFAULT_CONFIG: Dict[str, Any] = {
"tools": {
"name": "swh-metadata-detector",
"version": "0.0.2",
"configuration": {},
},
}
class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]):
"""Revision-level indexer
This indexer is in charge of:
- filtering revisions already indexed in revision_intrinsic_metadata table
with defined computation tool
- retrieve all entry_files in root directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- store the results for revision
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.config = merge_configs(DEFAULT_CONFIG, self.config)
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_intrinsic_metadata_missing(
(
{"id": sha1_git, "indexer_configuration_id": self.tool["id"],}
for sha1_git in sha1_gits
)
)
def index(
self, id: Sha1Git, data: Optional[Revision], **kwargs
) -> List[RevisionIntrinsicMetadataRow]:
"""Index rev by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- if multiple file detected -> translation needed at revision level
Args:
id: sha1_git of the revision
data: revision model object from storage
Returns:
dict: dictionary representing a revision_intrinsic_metadata, with
keys:
- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata
"""
rev = data
assert isinstance(rev, Revision)
try:
root_dir = rev.directory
dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
if [entry["type"] for entry in dir_ls] == ["dir"]:
# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs
subdir = dir_ls[0]["target"]
dir_ls = list(self.storage.directory_ls(subdir, recursive=False))
files = [entry for entry in dir_ls if entry["type"] == "file"]
detected_files = detect_metadata(files)
(mappings, metadata) = self.translate_revision_intrinsic_metadata(
detected_files, log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id),
)
except Exception as e:
self.log.exception("Problem when indexing rev: %r", e)
return [
RevisionIntrinsicMetadataRow(
id=rev.id,
indexer_configuration_id=self.tool["id"],
mappings=mappings,
metadata=metadata,
)
]
def persist_index_computations(
self, results: List[RevisionIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of content_mimetype, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
"""
# TODO: add functions in storage to keep data in
# revision_intrinsic_metadata
return self.idx_storage.revision_intrinsic_metadata_add(results)
def translate_revision_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
) -> Tuple[List[Any], Any]:
"""
Determine plan of action to translate metadata when containing
one or multiple detected files:
Args:
detected_files: dictionary mapping context names (e.g.,
"npm", "authors") to list of sha1
Returns:
(List[str], dict): list of mappings used and dict with
translated metadata according to the CodeMeta vocabulary
"""
used_mappings = [MAPPINGS[context].name for context in detected_files]
metadata = []
tool = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]}
config["tools"] = [tool]
for context in detected_files.keys():
cfg = deepcopy(config)
cfg["tools"][0]["configuration"]["context"] = context
c_metadata_indexer = ContentMetadataIndexer(config=cfg)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context]
)
for c in metadata_generator:
# extracting metadata
sha1 = c.id
sha1s_in_storage.append(sha1)
local_metadata = c.metadata
# local metadata is aggregated
if local_metadata:
metadata.append(local_metadata)
sha1s_filtered = [
item for item in detected_files[context] if item not in sha1s_in_storage
]
if sha1s_filtered:
# content indexing
try:
c_metadata_indexer.run(
sha1s_filtered, log_suffix=log_suffix,
)
# on the fly possibility:
for result in c_metadata_indexer.results:
local_metadata = result.metadata
metadata.append(local_metadata)
except Exception:
self.log.exception("Exception while indexing metadata on contents")
metadata = merge_documents(metadata)
return (used_mappings, metadata)
class OriginMetadataIndexer(
OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]
):
USE_TOOLS = False
def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
self.origin_head_indexer = OriginHeadIndexer(config=config)
self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
def index_list(
self, origin_urls: List[str], **kwargs
) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]:
head_rev_ids = []
origins_with_head = []
origins = list(
call_with_batches(
self.storage.origin_get, origin_urls, ORIGIN_GET_BATCH_SIZE,
)
)
for origin in origins:
if origin is None:
continue
head_results = self.origin_head_indexer.index(origin.url)
if head_results:
(head_result,) = head_results
origins_with_head.append(origin)
head_rev_ids.append(head_result["revision_id"])
head_revs = list(
call_with_batches(
self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
)
)
assert len(head_revs) == len(head_rev_ids)
results = []
for (origin, rev) in zip(origins_with_head, head_revs):
if not rev:
self.log.warning("Missing head revision of origin %r", origin.url)
continue
for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev):
# There is at most one rev_metadata
orig_metadata = OriginIntrinsicMetadataRow(
from_revision=rev_metadata.id,
id=origin.url,
metadata=rev_metadata.metadata,
mappings=rev_metadata.mappings,
indexer_configuration_id=rev_metadata.indexer_configuration_id,
)
results.append((orig_metadata, rev_metadata))
return results
def persist_index_computations(
self,
results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
) -> Dict[str, int]:
# Deduplicate revisions
rev_metadata: List[RevisionIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
summary: Dict = {}
for (orig_item, rev_item) in results:
assert rev_item.metadata == orig_item.metadata
if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
if rev_item not in rev_metadata:
rev_metadata.append(rev_item)
if orig_item not in orig_metadata:
orig_metadata.append(orig_item)
if rev_metadata:
summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
summary.update(summary_rev)
if orig_metadata:
summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
return summary
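The `call_with_batches` helper defined at the top of metadata.py is what lets `OriginMetadataIndexer.index_list` page through `storage.origin_get` and `storage.revision_get` in groups of `ORIGIN_GET_BATCH_SIZE` / `REVISION_GET_BATCH_SIZE`. A small usage sketch, with a toy fetch function standing in for the storage API:

```python
# Usage sketch for call_with_batches (swh/indexer/metadata.py); `fetch` is a
# toy stand-in for storage.origin_get / storage.revision_get.
from swh.indexer.metadata import call_with_batches

def fetch(ids):
    # invoked once per batch of at most batch_size ids
    return [("fetched", i) for i in ids]

results = list(call_with_batches(fetch, list(range(25)), batch_size=10))
assert len(results) == 25  # three underlying calls: 10 + 10 + 5 ids
```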
diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py
index a0668df..81df82a 100644
--- a/swh/indexer/metadata_dictionary/__init__.py
+++ b/swh/indexer/metadata_dictionary/__init__.py
@@ -1,39 +1,40 @@
import collections
import click
-from . import codemeta, maven, npm, python, ruby
+from . import cff, codemeta, maven, npm, python, ruby
MAPPINGS = {
"CodemetaMapping": codemeta.CodemetaMapping,
"MavenMapping": maven.MavenMapping,
"NpmMapping": npm.NpmMapping,
"PythonPkginfoMapping": python.PythonPkginfoMapping,
"GemspecMapping": ruby.GemspecMapping,
+ "CffMapping": cff.CffMapping,
}
def list_terms():
"""Returns a dictionary with all supported CodeMeta terms as keys,
and the mappings that support each of them as values."""
d = collections.defaultdict(set)
for mapping in MAPPINGS.values():
for term in mapping.supported_terms():
d[term].add(mapping)
return d
@click.command()
@click.argument("mapping_name")
@click.argument("file_name")
def main(mapping_name, file_name):
from pprint import pprint
with open(file_name, "rb") as fd:
file_content = fd.read()
res = MAPPINGS[mapping_name]().translate(file_content)
pprint(res)
if __name__ == "__main__":
main()
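With the registry extended, the new CFF mapping is discoverable like the others, both via `MAPPINGS` and via `list_terms()`. A hedged sketch:

```python
# Sketch: asking which mappings (now including CffMapping) can produce a
# given CodeMeta term, using list_terms() defined above.
from swh.indexer.codemeta import SCHEMA_URI
from swh.indexer.metadata_dictionary import MAPPINGS, list_terms

terms = list_terms()
author_mappings = terms.get(SCHEMA_URI + "author", set())
print(sorted(m.__name__ for m in author_mappings))
# expected to include "CffMapping", since CITATION.cff's `authors` field is
# crosswalked to schema:author and CffMapping defines normalize_authors()
```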
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
index 0ddd975..0a2becf 100644
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -1,177 +1,177 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
from typing import List
from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values
class BaseMapping:
"""Base class for mappings to inherit from
To implement a new mapping:
- inherit this class
- override translate function
"""
def __init__(self, log_suffix=""):
self.log_suffix = log_suffix
self.log = logging.getLogger(
"%s.%s" % (self.__class__.__module__, self.__class__.__name__)
)
@property
def name(self):
"""A name of this mapping, used as an identifier in the
indexer storage."""
raise NotImplementedError(f"{self.__class__.__name__}.name")
@classmethod
def detect_metadata_files(cls, files):
"""
Detects files potentially containing metadata
Args:
file_entries (list): list of files
Returns:
list: list of sha1 (possibly empty)
"""
raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
def translate(self, file_content):
raise NotImplementedError(f"{self.__class__.__name__}.translate")
def normalize_translation(self, metadata):
return compact(metadata)
class SingleFileMapping(BaseMapping):
"""Base class for all mappings that use a single file as input."""
@property
def filename(self):
"""The .json file to extract metadata from."""
raise NotImplementedError(f"{self.__class__.__name__}.filename")
@classmethod
def detect_metadata_files(cls, file_entries):
for entry in file_entries:
- if entry["name"] == cls.filename:
+ if entry["name"].lower() == cls.filename.lower():
return [entry["sha1"]]
return []
class DictMapping(BaseMapping):
"""Base class for mappings that take as input a file that is mostly
a key-value store (eg. a shallow JSON dict)."""
string_fields = [] # type: List[str]
"""List of fields that are simple strings, and don't need any
normalization."""
@property
def mapping(self):
"""A translation dict to map dict keys into a canonical name."""
raise NotImplementedError(f"{self.__class__.__name__}.mapping")
@staticmethod
def _normalize_method_name(name):
return name.replace("-", "_")
@classmethod
def supported_terms(cls):
return {
term
for (key, term) in cls.mapping.items()
if key in cls.string_fields
or hasattr(cls, "translate_" + cls._normalize_method_name(key))
or hasattr(cls, "normalize_" + cls._normalize_method_name(key))
}
def _translate_dict(self, content_dict, *, normalize=True):
"""
Translates content by parsing content from a dict object
and translating with the appropriate mapping
Args:
content_dict (dict): content dict to translate
Returns:
dict: translated metadata in json-friendly form needed for
the indexer
"""
translated_metadata = {"@type": SCHEMA_URI + "SoftwareSourceCode"}
for k, v in content_dict.items():
# First, check if there is a specific translation
# method for this key
translation_method = getattr(
self, "translate_" + self._normalize_method_name(k), None
)
if translation_method:
translation_method(translated_metadata, v)
elif k in self.mapping:
# if there is no method, but the key is known from the
# crosswalk table
codemeta_key = self.mapping[k]
# if there is a normalization method, use it on the value
normalization_method = getattr(
self, "normalize_" + self._normalize_method_name(k), None
)
if normalization_method:
v = normalization_method(v)
elif k in self.string_fields and isinstance(v, str):
pass
elif k in self.string_fields and isinstance(v, list):
v = [x for x in v if isinstance(x, str)]
else:
continue
# set the translation metadata with the normalized value
if codemeta_key in translated_metadata:
translated_metadata[codemeta_key] = merge_values(
translated_metadata[codemeta_key], v
)
else:
translated_metadata[codemeta_key] = v
if normalize:
return self.normalize_translation(translated_metadata)
else:
return translated_metadata
class JsonMapping(DictMapping, SingleFileMapping):
"""Base class for all mappings that use a JSON file as input."""
def translate(self, raw_content):
"""
Translates content by parsing content from a bytestring containing
json data and translating with the appropriate mapping
Args:
raw_content (bytes): raw content to translate
Returns:
dict: translated metadata in json-friendly form needed for
the indexer
"""
try:
raw_content = raw_content.decode()
except UnicodeDecodeError:
self.log.warning("Error unidecoding from %s", self.log_suffix)
return
try:
content_dict = json.loads(raw_content)
except json.JSONDecodeError:
self.log.warning("Error unjsoning from %s", self.log_suffix)
return
if isinstance(content_dict, dict):
return self._translate_dict(content_dict)
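Note on the base.py change above: SingleFileMapping.detect_metadata_files now lower-cases both the directory entry name and the expected filename, so e.g. citation.cff is detected as well as CITATION.cff. A minimal sketch of the behaviour, using a hypothetical subclass that is not part of this diff:

    from swh.indexer.metadata_dictionary.base import SingleFileMapping

    class DemoMapping(SingleFileMapping):  # hypothetical, for illustration only
        name = "demo"
        filename = b"CITATION.cff"

    entries = [
        {"name": b"citation.CFF", "sha1": b"\x01" * 20},  # case differs from filename
        {"name": b"README.md", "sha1": b"\x02" * 20},
    ]

    # Before this change the comparison was exact, so this returned [].
    # With the lower-cased comparison it returns the first entry's sha1.
    print(DemoMapping.detect_metadata_files(entries))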
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
new file mode 100644
index 0000000..43f944d
--- /dev/null
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -0,0 +1,65 @@
+import yaml
+
+from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CROSSWALK_TABLE, SCHEMA_URI
+
+from .base import DictMapping, SingleFileMapping
+
+yaml.SafeLoader.yaml_implicit_resolvers = {
+ k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
+ for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
+}
+
+
+class CffMapping(DictMapping, SingleFileMapping):
+ """Dedicated class for Citation (CITATION.cff) mapping and translation"""
+
+ name = "cff"
+ filename = b"CITATION.cff"
+ mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
+ string_fields = ["keywords", "license", "abstract", "version", "doi"]
+
+ def translate(self, raw_content):
+ raw_content = raw_content.decode()
+ content_dict = yaml.load(raw_content, Loader=yaml.SafeLoader)
+ metadata = self._translate_dict(content_dict)
+
+ metadata["@context"] = CODEMETA_CONTEXT_URL
+
+ return metadata
+
+ def normalize_authors(self, d):
+ result = []
+ for author in d:
+ author_data = {"@type": SCHEMA_URI + "Person"}
+ if "orcid" in author:
+ author_data["@id"] = author["orcid"]
+ if "affiliation" in author:
+ author_data[SCHEMA_URI + "affiliation"] = {
+ "@type": SCHEMA_URI + "Organization",
+ SCHEMA_URI + "name": author["affiliation"],
+ }
+ if "family-names" in author:
+ author_data[SCHEMA_URI + "familyName"] = author["family-names"]
+ if "given-names" in author:
+ author_data[SCHEMA_URI + "givenName"] = author["given-names"]
+
+ result.append(author_data)
+
+ result = {"@list": result}
+ return result
+
+ def normalize_doi(self, s):
+ if isinstance(s, str):
+ return {"@id": "https://doi.org/" + s}
+
+ def normalize_license(self, s):
+ if isinstance(s, str):
+ return {"@id": "https://spdx.org/licenses/" + s}
+
+ def normalize_repository_code(self, s):
+ if isinstance(s, str):
+ return {"@id": s}
+
+ def normalize_date_released(self, s):
+ if isinstance(s, str):
+ return {"@value": s, "@type": SCHEMA_URI + "Date"}
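The new CFF mapping above also strips the YAML timestamp resolver, so date-released is loaded as a plain string and can be normalized into a schema.org Date. A rough usage sketch follows; it is illustrative only, and the exact CodeMeta keys produced depend on the CFF crosswalk table:

    from swh.indexer.metadata_dictionary.cff import CffMapping

    raw = b"""\
    cff-version: 1.0.3
    title: Demo tool
    authors:
      - family-names: Doe
        given-names: Jane
        orcid: https://orcid.org/0000-0002-1825-0097
    license: MIT
    version: 1.2.3
    date-released: 2021-05-01
    """

    metadata = CffMapping().translate(raw)
    # Expected shape (hedged): a compacted CodeMeta document with "@context"
    # set to CODEMETA_CONTEXT_URL, the license expanded to an SPDX URL, the
    # author wrapped in a schema:Person @list, and the release date typed as
    # schema:Date.
    print(metadata)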
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
index d725f5f..f5f6d9b 100644
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -1,158 +1,161 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import xml.parsers.expat
import xmltodict
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
from .base import DictMapping, SingleFileMapping
class MavenMapping(DictMapping, SingleFileMapping):
"""
Dedicated class for Maven (pom.xml) mapping and translation
"""
name = "maven"
filename = b"pom.xml"
mapping = CROSSWALK_TABLE["Java (Maven)"]
string_fields = ["name", "version", "description", "email"]
def translate(self, content):
try:
d = xmltodict.parse(content).get("project") or {}
except xml.parsers.expat.ExpatError:
self.log.warning("Error parsing XML from %s", self.log_suffix)
return None
except UnicodeDecodeError:
self.log.warning("Error unidecoding XML from %s", self.log_suffix)
return None
except (LookupError, ValueError):
# unknown encoding or multi-byte encoding
self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
return None
+ if not isinstance(d, dict):
+ self.log.warning("Skipping ill-formed XML content: %s", content)
+ return None
metadata = self._translate_dict(d, normalize=False)
metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d)
metadata[SCHEMA_URI + "license"] = self.parse_licenses(d)
return self.normalize_translation(metadata)
_default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
def parse_repositories(self, d):
"""https://maven.apache.org/pom.html#Repositories
>>> import xmltodict
>>> from pprint import pprint
>>> d = xmltodict.parse('''
... <repositories>
...   <repository>
...     <id>codehausSnapshots</id>
...     <name>Codehaus Snapshots</name>
...     <url>http://snapshots.maven.codehaus.org/maven2</url>
...     <layout>default</layout>
...   </repository>
... </repositories>
... ''')
>>> MavenMapping().parse_repositories(d)
"""
repositories = d.get("repositories")
if not repositories:
results = [self.parse_repository(d, self._default_repository)]
elif isinstance(repositories, dict):
repositories = repositories.get("repository") or []
if not isinstance(repositories, list):
repositories = [repositories]
results = [self.parse_repository(d, repo) for repo in repositories]
else:
results = []
return [res for res in results if res] or None
def parse_repository(self, d, repo):
if not isinstance(repo, dict):
return
if repo.get("layout", "default") != "default":
return # TODO ?
url = repo.get("url")
group_id = d.get("groupId")
artifact_id = d.get("artifactId")
if (
isinstance(url, str)
and isinstance(group_id, str)
and isinstance(artifact_id, str)
):
repo = os.path.join(url, *group_id.split("."), artifact_id)
return {"@id": repo}
def normalize_groupId(self, id_):
"""https://maven.apache.org/pom.html#Maven_Coordinates
>>> MavenMapping().normalize_groupId('org.example')
{'@id': 'org.example'}
"""
if isinstance(id_, str):
return {"@id": id_}
def parse_licenses(self, d):
"""https://maven.apache.org/pom.html#Licenses
>>> import xmltodict
>>> import json
>>> d = xmltodict.parse('''
... <licenses>
...   <license>
...     <name>Apache License, Version 2.0</name>
...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
...   </license>
... </licenses>
... ''')
>>> print(json.dumps(d, indent=4))
{
"licenses": {
"license": {
"name": "Apache License, Version 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
}
}
}
>>> MavenMapping().parse_licenses(d)
[{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}]
or, if there is more than one license:
>>> import xmltodict
>>> from pprint import pprint
>>> d = xmltodict.parse('''
... <licenses>
...   <license>
...     <name>Apache License, Version 2.0</name>
...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
...   </license>
...   <license>
...     <name>MIT License</name>
...     <url>https://opensource.org/licenses/MIT</url>
...   </license>
... </licenses>
... ''')
>>> pprint(MavenMapping().parse_licenses(d))
[{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'},
{'@id': 'https://opensource.org/licenses/MIT'}]
"""
licenses = d.get("licenses")
if not isinstance(licenses, dict):
return
licenses = licenses.get("license")
if isinstance(licenses, dict):
licenses = [licenses]
elif not isinstance(licenses, list):
return
return [
{"@id": license["url"]}
for license in licenses
if isinstance(license, dict) and isinstance(license.get("url"), str)
] or None
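The new isinstance(d, dict) guard in MavenMapping.translate covers POM files whose root element parses to something other than a dict; previously such content crashed _translate_dict. A small sketch of the case it handles (assuming the package is importable):

    import xmltodict

    from swh.indexer.metadata_dictionary.maven import MavenMapping

    content = b"<project>not actually a POM</project>"

    # xmltodict flattens an element with only text content into a plain string:
    print(xmltodict.parse(content).get("project"))  # 'not actually a POM'

    # Before this change, _translate_dict() was called on that string and failed
    # with an AttributeError; now the mapping logs a warning and returns None.
    print(MavenMapping().translate(content))  # None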
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
index 8d902ce..df4d5ae 100644
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,723 +1,723 @@
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import Counter
from importlib import import_module
import json
from typing import Dict, Iterable, List, Optional, Tuple, Union
import warnings
import psycopg2
import psycopg2.pool
from swh.core.db.common import db_transaction
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model.model import SHA1_SIZE
from swh.storage.exc import StorageDBError
from swh.storage.utils import get_partition_bounds_bytes
from . import converters
from .db import Db
from .exc import DuplicateId, IndexerStorageArgumentException
from .interface import PagedResult, Sha1
from .metrics import process_metrics, send_metric, timed
from .model import (
ContentCtagsRow,
ContentLanguageRow,
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
OriginIntrinsicMetadataRow,
RevisionIntrinsicMetadataRow,
)
from .writer import JournalWriter
INDEXER_CFG_KEY = "indexer_storage"
-MAPPING_NAMES = ["codemeta", "gemspec", "maven", "npm", "pkg-info"]
+MAPPING_NAMES = ["cff", "codemeta", "gemspec", "maven", "npm", "pkg-info"]
SERVER_IMPLEMENTATIONS: Dict[str, str] = {
"local": ".IndexerStorage",
"remote": ".api.client.RemoteStorage",
"memory": ".in_memory.IndexerStorage",
}
def get_indexer_storage(cls: str, **kwargs) -> IndexerStorageInterface:
"""Instantiate an indexer storage implementation of class `cls` with arguments
`kwargs`.
Args:
cls: indexer storage class (local, remote or memory)
kwargs: dictionary of arguments passed to the
indexer storage class constructor
Returns:
an instance of swh.indexer.storage
Raises:
ValueError if passed an unknown storage class.
"""
if "args" in kwargs:
warnings.warn(
'Explicit "args" key is deprecated, use keys directly instead.',
DeprecationWarning,
)
kwargs = kwargs["args"]
class_path = SERVER_IMPLEMENTATIONS.get(cls)
if class_path is None:
raise ValueError(
f"Unknown indexer storage class `{cls}`. "
f"Supported: {', '.join(SERVER_IMPLEMENTATIONS)}"
)
(module_path, class_name) = class_path.rsplit(".", 1)
module = import_module(module_path if module_path else ".", package=__package__)
BackendClass = getattr(module, class_name)
check_config = kwargs.pop("check_config", {})
idx_storage = BackendClass(**kwargs)
if check_config:
if not idx_storage.check_config(**check_config):
raise EnvironmentError("Indexer storage check config failed")
return idx_storage
def check_id_duplicates(data):
"""
If any two row models in `data` have the same unique key, raises
a `DuplicateId` error.
Values associated with the key must be hashable.
Args:
data (List[dict]): List of dictionaries to be inserted
>>> check_id_duplicates([
... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"),
... ContentLanguageRow(id=b'foo', indexer_configuration_id=32, lang="python"),
... ])
>>> check_id_duplicates([
... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"),
... ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"),
... ])
Traceback (most recent call last):
...
swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'indexer_configuration_id': 42}]
""" # noqa
counter = Counter(tuple(sorted(item.unique_key().items())) for item in data)
duplicates = [id_ for (id_, count) in counter.items() if count >= 2]
if duplicates:
raise DuplicateId(list(map(dict, duplicates)))
class IndexerStorage:
"""SWH Indexer Storage
"""
def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None):
"""
Args:
db: either a libpq connection string, or a psycopg2 connection
journal_writer: configuration passed to
`swh.journal.writer.get_journal_writer`
"""
self.journal_writer = JournalWriter(self._tool_get_from_id, journal_writer)
try:
if isinstance(db, psycopg2.extensions.connection):
self._pool = None
self._db = Db(db)
else:
self._pool = psycopg2.pool.ThreadedConnectionPool(
min_pool_conns, max_pool_conns, db
)
self._db = None
except psycopg2.OperationalError as e:
raise StorageDBError(e)
def get_db(self):
if self._db:
return self._db
return Db.from_pool(self._pool)
def put_db(self, db):
if db is not self._db:
db.put_conn()
@timed
@db_transaction()
def check_config(self, *, check_write, db=None, cur=None):
# Check permissions on one of the tables
if check_write:
check = "INSERT"
else:
check = "SELECT"
cur.execute(
"select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa
(check,),
)
return cur.fetchone()[0]
@timed
@db_transaction()
def content_mimetype_missing(
self, mimetypes: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [obj[0] for obj in db.content_mimetype_missing_from_list(mimetypes, cur)]
@timed
@db_transaction()
def get_partition(
self,
indexer_type: str,
indexer_configuration_id: int,
partition_id: int,
nb_partitions: int,
page_token: Optional[str] = None,
limit: int = 1000,
with_textual_data=False,
db=None,
cur=None,
) -> PagedResult[Sha1]:
"""Retrieve ids of content with `indexer_type` within within partition partition_id
bound by limit.
Args:
**indexer_type**: Type of data content to index (mimetype, language, etc...)
**indexer_configuration_id**: The tool used to index data
**partition_id**: index of the partition to fetch
**nb_partitions**: total number of partitions to split into
**page_token**: opaque token used for pagination
**limit**: Limit result (default to 1000)
**with_textual_data** (bool): Deal only with textual content (True) or with
all content (False, the default)
Raises:
    IndexerStorageArgumentException if:
    - limit is None
    - an unknown indexer_type is provided
Returns:
PagedResult of Sha1. If next_page_token is None, there is no more data to
fetch
"""
if limit is None:
raise IndexerStorageArgumentException("limit should not be None")
if indexer_type not in db.content_indexer_names:
err = f"Wrong type. Should be one of [{','.join(db.content_indexer_names)}]"
raise IndexerStorageArgumentException(err)
start, end = get_partition_bounds_bytes(partition_id, nb_partitions, SHA1_SIZE)
if page_token is not None:
start = hash_to_bytes(page_token)
if end is None:
end = b"\xff" * SHA1_SIZE
next_page_token: Optional[str] = None
ids = [
row[0]
for row in db.content_get_range(
indexer_type,
start,
end,
indexer_configuration_id,
limit=limit + 1,
with_textual_data=with_textual_data,
cur=cur,
)
]
if len(ids) >= limit:
next_page_token = hash_to_hex(ids[-1])
ids = ids[:limit]
assert len(ids) <= limit
return PagedResult(results=ids, next_page_token=next_page_token)
@timed
@db_transaction()
def content_mimetype_get_partition(
self,
indexer_configuration_id: int,
partition_id: int,
nb_partitions: int,
page_token: Optional[str] = None,
limit: int = 1000,
db=None,
cur=None,
) -> PagedResult[Sha1]:
return self.get_partition(
"mimetype",
indexer_configuration_id,
partition_id,
nb_partitions,
page_token=page_token,
limit=limit,
db=db,
cur=cur,
)
@timed
@process_metrics
@db_transaction()
def content_mimetype_add(
self, mimetypes: List[ContentMimetypeRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(mimetypes)
mimetypes.sort(key=lambda m: m.id)
self.journal_writer.write_additions("content_mimetype", mimetypes)
db.mktemp_content_mimetype(cur)
db.copy_to(
[m.to_dict() for m in mimetypes],
"tmp_content_mimetype",
["id", "mimetype", "encoding", "indexer_configuration_id"],
cur,
)
count = db.content_mimetype_add_from_temp(cur)
return {"content_mimetype:add": count}
@timed
@db_transaction()
def content_mimetype_get(
self, ids: Iterable[Sha1], db=None, cur=None
) -> List[ContentMimetypeRow]:
return [
ContentMimetypeRow.from_dict(
converters.db_to_mimetype(dict(zip(db.content_mimetype_cols, c)))
)
for c in db.content_mimetype_get_from_list(ids, cur)
]
@timed
@db_transaction()
def content_language_missing(
self, languages: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [obj[0] for obj in db.content_language_missing_from_list(languages, cur)]
@timed
@db_transaction()
def content_language_get(
self, ids: Iterable[Sha1], db=None, cur=None
) -> List[ContentLanguageRow]:
return [
ContentLanguageRow.from_dict(
converters.db_to_language(dict(zip(db.content_language_cols, c)))
)
for c in db.content_language_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
def content_language_add(
self, languages: List[ContentLanguageRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(languages)
languages.sort(key=lambda m: m.id)
self.journal_writer.write_additions("content_language", languages)
db.mktemp_content_language(cur)
# empty language is mapped to 'unknown'
db.copy_to(
(
{
"id": lang.id,
"lang": lang.lang or "unknown",
"indexer_configuration_id": lang.indexer_configuration_id,
}
for lang in languages
),
"tmp_content_language",
["id", "lang", "indexer_configuration_id"],
cur,
)
count = db.content_language_add_from_temp(cur)
return {"content_language:add": count}
@timed
@db_transaction()
def content_ctags_missing(
self, ctags: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [obj[0] for obj in db.content_ctags_missing_from_list(ctags, cur)]
@timed
@db_transaction()
def content_ctags_get(
self, ids: Iterable[Sha1], db=None, cur=None
) -> List[ContentCtagsRow]:
return [
ContentCtagsRow.from_dict(
converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
)
for c in db.content_ctags_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
def content_ctags_add(
self, ctags: List[ContentCtagsRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(ctags)
ctags.sort(key=lambda m: m.id)
self.journal_writer.write_additions("content_ctags", ctags)
db.mktemp_content_ctags(cur)
db.copy_to(
[ctag.to_dict() for ctag in ctags],
tblname="tmp_content_ctags",
columns=["id", "name", "kind", "line", "lang", "indexer_configuration_id"],
cur=cur,
)
count = db.content_ctags_add_from_temp(cur)
return {"content_ctags:add": count}
@timed
@db_transaction()
def content_ctags_search(
self,
expression: str,
limit: int = 10,
last_sha1: Optional[Sha1] = None,
db=None,
cur=None,
) -> List[ContentCtagsRow]:
return [
ContentCtagsRow.from_dict(
converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj)))
)
for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur)
]
@timed
@db_transaction()
def content_fossology_license_get(
self, ids: Iterable[Sha1], db=None, cur=None
) -> List[ContentLicenseRow]:
return [
ContentLicenseRow.from_dict(
converters.db_to_fossology_license(
dict(zip(db.content_fossology_license_cols, c))
)
)
for c in db.content_fossology_license_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
def content_fossology_license_add(
self, licenses: List[ContentLicenseRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(licenses)
licenses.sort(key=lambda m: m.id)
self.journal_writer.write_additions("content_fossology_license", licenses)
db.mktemp_content_fossology_license(cur)
db.copy_to(
[license.to_dict() for license in licenses],
tblname="tmp_content_fossology_license",
columns=["id", "license", "indexer_configuration_id"],
cur=cur,
)
count = db.content_fossology_license_add_from_temp(cur)
return {"content_fossology_license:add": count}
@timed
@db_transaction()
def content_fossology_license_get_partition(
self,
indexer_configuration_id: int,
partition_id: int,
nb_partitions: int,
page_token: Optional[str] = None,
limit: int = 1000,
db=None,
cur=None,
) -> PagedResult[Sha1]:
return self.get_partition(
"fossology_license",
indexer_configuration_id,
partition_id,
nb_partitions,
page_token=page_token,
limit=limit,
with_textual_data=True,
db=db,
cur=cur,
)
@timed
@db_transaction()
def content_metadata_missing(
self, metadata: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [obj[0] for obj in db.content_metadata_missing_from_list(metadata, cur)]
@timed
@db_transaction()
def content_metadata_get(
self, ids: Iterable[Sha1], db=None, cur=None
) -> List[ContentMetadataRow]:
return [
ContentMetadataRow.from_dict(
converters.db_to_metadata(dict(zip(db.content_metadata_cols, c)))
)
for c in db.content_metadata_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
def content_metadata_add(
self, metadata: List[ContentMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
self.journal_writer.write_additions("content_metadata", metadata)
db.mktemp_content_metadata(cur)
db.copy_to(
[m.to_dict() for m in metadata],
"tmp_content_metadata",
["id", "metadata", "indexer_configuration_id"],
cur,
)
count = db.content_metadata_add_from_temp(cur)
return {
"content_metadata:add": count,
}
@timed
@db_transaction()
def revision_intrinsic_metadata_missing(
self, metadata: Iterable[Dict], db=None, cur=None
) -> List[Tuple[Sha1, int]]:
return [
obj[0]
for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur)
]
@timed
@db_transaction()
def revision_intrinsic_metadata_get(
self, ids: Iterable[Sha1], db=None, cur=None
) -> List[RevisionIntrinsicMetadataRow]:
return [
RevisionIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
dict(zip(db.revision_intrinsic_metadata_cols, c))
)
)
for c in db.revision_intrinsic_metadata_get_from_list(ids, cur)
]
@timed
@process_metrics
@db_transaction()
def revision_intrinsic_metadata_add(
self, metadata: List[RevisionIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
self.journal_writer.write_additions("revision_intrinsic_metadata", metadata)
db.mktemp_revision_intrinsic_metadata(cur)
db.copy_to(
[m.to_dict() for m in metadata],
"tmp_revision_intrinsic_metadata",
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
count = db.revision_intrinsic_metadata_add_from_temp(cur)
return {
"revision_intrinsic_metadata:add": count,
}
@timed
@db_transaction()
def origin_intrinsic_metadata_get(
self, urls: Iterable[str], db=None, cur=None
) -> List[OriginIntrinsicMetadataRow]:
return [
OriginIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c))
)
)
for c in db.origin_intrinsic_metadata_get_from_list(urls, cur)
]
@timed
@process_metrics
@db_transaction()
def origin_intrinsic_metadata_add(
self, metadata: List[OriginIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
self.journal_writer.write_additions("origin_intrinsic_metadata", metadata)
db.mktemp_origin_intrinsic_metadata(cur)
db.copy_to(
[m.to_dict() for m in metadata],
"tmp_origin_intrinsic_metadata",
["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
cur,
)
count = db.origin_intrinsic_metadata_add_from_temp(cur)
return {
"origin_intrinsic_metadata:add": count,
}
@timed
@db_transaction()
def origin_intrinsic_metadata_search_fulltext(
self, conjunction: List[str], limit: int = 100, db=None, cur=None
) -> List[OriginIntrinsicMetadataRow]:
return [
OriginIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c))
)
)
for c in db.origin_intrinsic_metadata_search_fulltext(
conjunction, limit=limit, cur=cur
)
]
@timed
@db_transaction()
def origin_intrinsic_metadata_search_by_producer(
self,
page_token: str = "",
limit: int = 100,
ids_only: bool = False,
mappings: Optional[List[str]] = None,
tool_ids: Optional[List[int]] = None,
db=None,
cur=None,
) -> PagedResult[Union[str, OriginIntrinsicMetadataRow]]:
assert isinstance(page_token, str)
# we go to limit+1 to check whether we should add next_page_token in
# the response
rows = db.origin_intrinsic_metadata_search_by_producer(
page_token, limit + 1, ids_only, mappings, tool_ids, cur
)
next_page_token = None
if ids_only:
results = [origin for (origin,) in rows]
if len(results) > limit:
results[limit:] = []
next_page_token = results[-1]
else:
results = [
OriginIntrinsicMetadataRow.from_dict(
converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, row))
)
)
for row in rows
]
if len(results) > limit:
results[limit:] = []
next_page_token = results[-1].id
return PagedResult(results=results, next_page_token=next_page_token,)
@timed
@db_transaction()
def origin_intrinsic_metadata_stats(self, db=None, cur=None):
mapping_names = [m for m in MAPPING_NAMES]
select_parts = []
# Count rows for each mapping
for mapping_name in mapping_names:
select_parts.append(
(
"sum(case when (mappings @> ARRAY['%s']) "
" then 1 else 0 end)"
)
% mapping_name
)
# Total
select_parts.append("sum(1)")
# Rows whose metadata has at least one key that is not '@context'
select_parts.append(
"sum(case when ('{}'::jsonb @> (metadata - '@context')) "
" then 0 else 1 end)"
)
cur.execute(
"select " + ", ".join(select_parts) + " from origin_intrinsic_metadata"
)
results = dict(zip(mapping_names + ["total", "non_empty"], cur.fetchone()))
return {
"total": results.pop("total"),
"non_empty": results.pop("non_empty"),
"per_mapping": results,
}
@timed
@db_transaction()
def indexer_configuration_add(self, tools, db=None, cur=None):
db.mktemp_indexer_configuration(cur)
db.copy_to(
tools,
"tmp_indexer_configuration",
["tool_name", "tool_version", "tool_configuration"],
cur,
)
tools = db.indexer_configuration_add_from_temp(cur)
results = [dict(zip(db.indexer_configuration_cols, line)) for line in tools]
send_metric(
"indexer_configuration:add",
len(results),
method_name="indexer_configuration_add",
)
return results
@timed
@db_transaction()
def indexer_configuration_get(self, tool, db=None, cur=None):
tool_conf = tool["tool_configuration"]
if isinstance(tool_conf, dict):
tool_conf = json.dumps(tool_conf)
idx = db.indexer_configuration_get(
tool["tool_name"], tool["tool_version"], tool_conf
)
if not idx:
return None
return dict(zip(db.indexer_configuration_cols, idx))
@db_transaction()
def _tool_get_from_id(self, id_, db, cur):
tool = dict(
zip(
db.indexer_configuration_cols,
db.indexer_configuration_get_from_id(id_, cur),
)
)
return {
"id": tool["id"],
"name": tool["tool_name"],
"version": tool["tool_version"],
"configuration": tool["tool_configuration"],
}
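With "cff" added to MAPPING_NAMES above, origin_intrinsic_metadata_stats now reports a per-mapping count for CFF-derived metadata as well. A quick, hedged sketch of how the factory in this module is typically used (the in-memory backend is used here to avoid needing a database):

    from swh.indexer.storage import MAPPING_NAMES, get_indexer_storage

    assert "cff" in MAPPING_NAMES

    # "memory" needs no extra arguments; "local" would take db="dbname=...",
    # and "remote" a url=... pointing at an RPC server.
    idx_storage = get_indexer_storage(cls="memory")
    print(idx_storage.check_config(check_write=True))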
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
index 270812d..5ae6aeb 100644
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -1,127 +1,127 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import timedelta
import os
from os import path
from typing import List, Tuple
from unittest.mock import patch
import pytest
import yaml
from swh.core.db.pytest_plugin import postgresql_fact
import swh.indexer
from swh.indexer.storage import get_indexer_storage
from swh.objstorage.factory import get_objstorage
from swh.storage import get_storage
from .utils import fill_obj_storage, fill_storage
TASK_NAMES: List[Tuple[str, str]] = [
# (scheduler-task-type, task-class-test-name)
("index-revision-metadata", "revision_intrinsic_metadata"),
("index-origin-metadata", "origin_intrinsic_metadata"),
]
SQL_FILES = path.join(path.dirname(swh.indexer.__file__), "sql", "*.sql")
idx_storage_postgresql = postgresql_fact(
- "postgresql_proc", db_name="indexer_storage", dump_files=SQL_FILES,
+ "postgresql_proc", dbname="indexer_storage", dump_files=SQL_FILES,
)
@pytest.fixture
def indexer_scheduler(swh_scheduler):
# Insert the expected task types within the scheduler
for task_name, task_class_name in TASK_NAMES:
swh_scheduler.create_task_type(
{
"type": task_name,
"description": f"The {task_class_name} indexer testing task",
"backend_name": f"swh.indexer.tests.tasks.{task_class_name}",
"default_interval": timedelta(days=1),
"min_interval": timedelta(hours=6),
"max_interval": timedelta(days=12),
"num_retries": 3,
}
)
return swh_scheduler
@pytest.fixture
def idx_storage_backend_config(idx_storage_postgresql):
"""Basic pg storage configuration with no journal collaborator for the indexer
storage (to avoid pulling optional dependency on clients of this fixture)
"""
return {
"cls": "local",
"db": idx_storage_postgresql.dsn,
}
@pytest.fixture
def swh_indexer_config(
swh_storage_backend_config, idx_storage_backend_config, swh_scheduler_config
):
return {
"storage": swh_storage_backend_config,
"objstorage": {"cls": "memory"},
"indexer_storage": idx_storage_backend_config,
"scheduler": {"cls": "local", **swh_scheduler_config},
"tools": {
"name": "file",
"version": "1:5.30-1+deb9u1",
"configuration": {"type": "library", "debian-package": "python3-magic"},
},
"compute_checksums": ["blake2b512"], # for rehash indexer
}
@pytest.fixture
def idx_storage(swh_indexer_config):
"""An instance of in-memory indexer storage that gets injected into all
indexers classes.
"""
idx_storage_config = swh_indexer_config["indexer_storage"]
return get_indexer_storage(**idx_storage_config)
@pytest.fixture
def storage(swh_indexer_config):
"""An instance of in-memory storage that gets injected into all indexers
classes.
"""
storage = get_storage(**swh_indexer_config["storage"])
fill_storage(storage)
return storage
@pytest.fixture
def obj_storage(swh_indexer_config):
"""An instance of in-memory objstorage that gets injected into all indexers
classes.
"""
objstorage = get_objstorage(**swh_indexer_config["objstorage"])
fill_obj_storage(objstorage)
with patch.dict(
"swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage}
):
yield objstorage
@pytest.fixture
def swh_config(swh_indexer_config, monkeypatch, tmp_path):
conffile = os.path.join(str(tmp_path), "indexer.yml")
with open(conffile, "w") as f:
f.write(yaml.dump(swh_indexer_config))
monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile)
return conffile
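The fixtures above chain together: idx_storage_postgresql (now passing dbname= to postgresql_fact) feeds idx_storage_backend_config, which swh_indexer_config and idx_storage consume in turn. A hypothetical test consuming them could look like this sketch:

    def test_tools_roundtrip(idx_storage):
        # Register a tool through the fixture-provided storage, then check
        # that the storage assigned it an id.
        tools = idx_storage.indexer_configuration_add(
            [
                {
                    "tool_name": "file",
                    "tool_version": "5.30",
                    "tool_configuration": {"type": "library"},
                }
            ]
        )
        assert tools and "id" in tools[0]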
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
index c17c3e8..3067dc2 100644
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1,1722 +1,1723 @@
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import math
import threading
from typing import Any, Dict, List, Tuple, Type
import attr
import pytest
from swh.indexer.storage.exc import DuplicateId, IndexerStorageArgumentException
from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult
from swh.indexer.storage.model import (
BaseRow,
ContentCtagsRow,
ContentLanguageRow,
ContentLicenseRow,
ContentMetadataRow,
ContentMimetypeRow,
OriginIntrinsicMetadataRow,
RevisionIntrinsicMetadataRow,
)
from swh.model.hashutil import hash_to_bytes
def prepare_mimetypes_from_licenses(
fossology_licenses: List[ContentLicenseRow],
) -> List[ContentMimetypeRow]:
"""Fossology license needs some consistent data in db to run.
"""
mimetypes = []
for c in fossology_licenses:
mimetypes.append(
ContentMimetypeRow(
id=c.id,
mimetype="text/plain", # for filtering on textual data to work
encoding="utf-8",
indexer_configuration_id=c.indexer_configuration_id,
)
)
return mimetypes
def endpoint_name(etype: str, ename: str) -> str:
"""Compute the storage's endpoint's name
>>> endpoint_name('content_mimetype', 'add')
'content_mimetype_add'
>>> endpoint_name('content_fosso_license', 'delete')
'content_fosso_license_delete'
"""
return f"{etype}_{ename}"
def endpoint(storage, etype: str, ename: str):
return getattr(storage, endpoint_name(etype, ename))
def expected_summary(count: int, etype: str, ename: str = "add") -> Dict[str, int]:
"""Compute the expected summary
The key is determined from etype and ename
>>> expected_summary(10, 'content_mimetype', 'add')
{'content_mimetype:add': 10}
>>> expected_summary(9, 'origin_intrinsic_metadata', 'delete')
{'origin_intrinsic_metadata:del': 9}
"""
pattern = ename[0:3]
key = endpoint_name(etype, ename).replace(f"_{ename}", f":{pattern}")
return {key: count}
def test_check_config(swh_indexer_storage) -> None:
assert swh_indexer_storage.check_config(check_write=True)
assert swh_indexer_storage.check_config(check_write=False)
class StorageETypeTester:
"""Base class for testing a series of common behaviour between a bunch of
endpoint types supported by an IndexerStorage.
This is supposed to be inherited with the following class attributes:
- endpoint_type
- tool_name
- example_data
See below for example usage.
"""
endpoint_type: str
tool_name: str
example_data: List[Dict]
row_class: Type[BaseRow]
def test_missing(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
etype = self.endpoint_type
tool_id = data.tools[self.tool_name]["id"]
# given 2 (hopefully) unknown objects
query = [
{"id": data.sha1_1, "indexer_configuration_id": tool_id,},
{"id": data.sha1_2, "indexer_configuration_id": tool_id,},
]
# we expect these are both returned by the xxx_missing endpoint
actual_missing = endpoint(storage, etype, "missing")(query)
assert list(actual_missing) == [
data.sha1_1,
data.sha1_2,
]
# now, when we add one of them
summary = endpoint(storage, etype, "add")(
[
self.row_class.from_dict(
{
"id": data.sha1_2,
**self.example_data[0],
"indexer_configuration_id": tool_id,
}
)
]
)
assert summary == expected_summary(1, etype)
# we expect only the other one returned
actual_missing = endpoint(storage, etype, "missing")(query)
assert list(actual_missing) == [data.sha1_1]
def test_add__update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
etype = self.endpoint_type
tool = data.tools[self.tool_name]
data_v1 = {
"id": data.sha1_2,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
# given
summary = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v1)])
assert summary == expected_summary(1, etype) # not added
# when
actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
expected_data_v1 = [
self.row_class.from_dict(
{"id": data.sha1_2, **self.example_data[0], "tool": tool}
)
]
# then
assert actual_data == expected_data_v1
# given
data_v2 = data_v1.copy()
data_v2.update(self.example_data[1])
endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
assert summary == expected_summary(1, etype) # modified so counted
actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
expected_data_v2 = [
self.row_class.from_dict(
{"id": data.sha1_2, **self.example_data[1], "tool": tool,}
)
]
# data did change as the v2 was used to overwrite v1
assert actual_data == expected_data_v2
def test_add_deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
etype = self.endpoint_type
tool = data.tools[self.tool_name]
hashes = [
hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}".format(i))
for i in range(1000)
]
data_v1 = [
self.row_class.from_dict(
{
"id": hash_,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
)
for hash_ in hashes
]
data_v2 = [
self.row_class.from_dict(
{
"id": hash_,
**self.example_data[1],
"indexer_configuration_id": tool["id"],
}
)
for hash_ in hashes
]
# Remove one item from each, so that both queries have to succeed for
# all items to be in the DB.
data_v2a = data_v2[1:]
data_v2b = list(reversed(data_v2[0:-1]))
# given
endpoint(storage, etype, "add")(data_v1)
# when
actual_data = sorted(
endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
)
expected_data_v1 = [
self.row_class.from_dict(
{"id": hash_, **self.example_data[0], "tool": tool}
)
for hash_ in hashes
]
# then
assert actual_data == expected_data_v1
# given
def f1() -> None:
endpoint(storage, etype, "add")(data_v2a)
def f2() -> None:
endpoint(storage, etype, "add")(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
t2.start()
t1.start()
t1.join()
t2.join()
actual_data = sorted(
endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
)
expected_data_v2 = [
self.row_class.from_dict(
{"id": hash_, **self.example_data[1], "tool": tool}
)
for hash_ in hashes
]
assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
for (item, expected_item_v1, expected_item_v2) in zip(
actual_data, expected_data_v1, expected_data_v2
):
assert item in (expected_item_v1, expected_item_v2)
def test_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
etype = self.endpoint_type
tool = data.tools[self.tool_name]
data_rev1 = self.row_class.from_dict(
{
"id": data.revision_id_2,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
)
data_rev2 = self.row_class.from_dict(
{
"id": data.revision_id_2,
**self.example_data[1],
"indexer_configuration_id": tool["id"],
}
)
# when
summary = endpoint(storage, etype, "add")([data_rev1])
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
endpoint(storage, etype, "add")([data_rev2, data_rev2])
# then
actual_data = list(
endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1])
)
expected_data = [
self.row_class.from_dict(
{"id": data.revision_id_2, **self.example_data[0], "tool": tool}
)
]
assert actual_data == expected_data
def test_add(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
etype = self.endpoint_type
tool = data.tools[self.tool_name]
# conftest fills it with mimetypes
storage.journal_writer.journal.objects = [] # type: ignore
query = [data.sha1_2, data.sha1_1]
data1 = self.row_class.from_dict(
{
"id": data.sha1_2,
**self.example_data[0],
"indexer_configuration_id": tool["id"],
}
)
# when
summary = endpoint(storage, etype, "add")([data1])
assert summary == expected_summary(1, etype)
# then
actual_data = list(endpoint(storage, etype, "get")(query))
# then
expected_data = [
self.row_class.from_dict(
{"id": data.sha1_2, **self.example_data[0], "tool": tool}
)
]
assert actual_data == expected_data
journal_objects = storage.journal_writer.journal.objects # type: ignore
actual_journal_data = [
obj for (obj_type, obj) in journal_objects if obj_type == self.endpoint_type
]
assert list(sorted(actual_journal_data)) == list(sorted(expected_data))
class TestIndexerStorageContentMimetypes(StorageETypeTester):
"""Test Indexer Storage content_mimetype related methods
"""
endpoint_type = "content_mimetype"
tool_name = "file"
example_data = [
{"mimetype": "text/plain", "encoding": "utf-8",},
{"mimetype": "text/html", "encoding": "us-ascii",},
]
row_class = ContentMimetypeRow
def test_generate_content_mimetype_get_partition_failure(
self, swh_indexer_storage: IndexerStorageInterface
) -> None:
"""get_partition call with wrong limit input should fail"""
storage = swh_indexer_storage
indexer_configuration_id = 42
with pytest.raises(
IndexerStorageArgumentException, match="limit should not be None"
):
storage.content_mimetype_get_partition(
indexer_configuration_id, 0, 3, limit=None # type: ignore
)
def test_generate_content_mimetype_get_partition_no_limit(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition should return result"""
storage, data = swh_indexer_storage_with_data
mimetypes = data.mimetypes
expected_ids = set([c.id for c in mimetypes])
indexer_configuration_id = mimetypes[0].indexer_configuration_id
assert len(mimetypes) == 16
nb_partitions = 16
actual_ids = []
for partition_id in range(nb_partitions):
actual_result = storage.content_mimetype_get_partition(
indexer_configuration_id, partition_id, nb_partitions
)
assert actual_result.next_page_token is None
actual_ids.extend(actual_result.results)
assert len(actual_ids) == len(expected_ids)
for actual_id in actual_ids:
assert actual_id in expected_ids
def test_generate_content_mimetype_get_partition_full(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition for a single partition should return available ids
"""
storage, data = swh_indexer_storage_with_data
mimetypes = data.mimetypes
expected_ids = set([c.id for c in mimetypes])
indexer_configuration_id = mimetypes[0].indexer_configuration_id
actual_result = storage.content_mimetype_get_partition(
indexer_configuration_id, 0, 1
)
assert actual_result.next_page_token is None
actual_ids = actual_result.results
assert len(actual_ids) == len(expected_ids)
for actual_id in actual_ids:
assert actual_id in expected_ids
def test_generate_content_mimetype_get_partition_empty(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition when at least one of the partitions is empty"""
storage, data = swh_indexer_storage_with_data
mimetypes = data.mimetypes
expected_ids = set([c.id for c in mimetypes])
indexer_configuration_id = mimetypes[0].indexer_configuration_id
# nb_partitions = smallest power of 2 such that at least one of
# the partitions is empty
nb_mimetypes = len(mimetypes)
nb_partitions = 1 << math.floor(math.log2(nb_mimetypes) + 1)
seen_ids = []
for partition_id in range(nb_partitions):
actual_result = storage.content_mimetype_get_partition(
indexer_configuration_id,
partition_id,
nb_partitions,
limit=nb_mimetypes + 1,
)
for actual_id in actual_result.results:
seen_ids.append(actual_id)
# Limit is higher than the max number of results
assert actual_result.next_page_token is None
assert set(seen_ids) == expected_ids
def test_generate_content_mimetype_get_partition_with_pagination(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition should return ids provided with pagination
"""
storage, data = swh_indexer_storage_with_data
mimetypes = data.mimetypes
expected_ids = set([c.id for c in mimetypes])
indexer_configuration_id = mimetypes[0].indexer_configuration_id
nb_partitions = 4
actual_ids = []
for partition_id in range(nb_partitions):
next_page_token = None
while True:
actual_result = storage.content_mimetype_get_partition(
indexer_configuration_id,
partition_id,
nb_partitions,
limit=2,
page_token=next_page_token,
)
actual_ids.extend(actual_result.results)
next_page_token = actual_result.next_page_token
if next_page_token is None:
break
assert len(set(actual_ids)) == len(set(expected_ids))
for actual_id in actual_ids:
assert actual_id in expected_ids
class TestIndexerStorageContentLanguage(StorageETypeTester):
"""Test Indexer Storage content_language related methods
"""
endpoint_type = "content_language"
tool_name = "pygments"
example_data = [
{"lang": "haskell",},
{"lang": "common-lisp",},
]
row_class = ContentLanguageRow
class TestIndexerStorageContentCTags(StorageETypeTester):
"""Test Indexer Storage content_ctags related methods
"""
endpoint_type = "content_ctags"
tool_name = "universal-ctags"
example_data = [
{"name": "done", "kind": "variable", "line": 119, "lang": "OCaml",},
{"name": "done", "kind": "variable", "line": 100, "lang": "Python",},
{"name": "main", "kind": "function", "line": 119, "lang": "Python",},
]
row_class = ContentCtagsRow
# the following tests are disabled because CTAGS behaves differently
@pytest.mark.skip
def test_add__update_in_place_duplicate(self):
pass
@pytest.mark.skip
def test_add_deadlock(self):
pass
def test_content_ctags_search(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# 1. given
tool = data.tools["universal-ctags"]
tool_id = tool["id"]
ctags1 = [
ContentCtagsRow(
id=data.sha1_1,
indexer_configuration_id=tool_id,
**kwargs, # type: ignore
)
for kwargs in [
{"name": "hello", "kind": "function", "line": 133, "lang": "Python",},
{"name": "counter", "kind": "variable", "line": 119, "lang": "Python",},
{"name": "hello", "kind": "variable", "line": 210, "lang": "Python",},
]
]
ctags1_with_tool = [
attr.evolve(ctag, indexer_configuration_id=None, tool=tool)
for ctag in ctags1
]
ctags2 = [
ContentCtagsRow(
id=data.sha1_2,
indexer_configuration_id=tool_id,
**kwargs, # type: ignore
)
for kwargs in [
{"name": "hello", "kind": "variable", "line": 100, "lang": "C",},
{"name": "result", "kind": "variable", "line": 120, "lang": "C",},
]
]
ctags2_with_tool = [
attr.evolve(ctag, indexer_configuration_id=None, tool=tool)
for ctag in ctags2
]
storage.content_ctags_add(ctags1 + ctags2)
# 1. when
actual_ctags = list(storage.content_ctags_search("hello", limit=1))
# 1. then
assert actual_ctags == [ctags1_with_tool[0]]
# 2. when
actual_ctags = list(
storage.content_ctags_search("hello", limit=1, last_sha1=data.sha1_1)
)
# 2. then
assert actual_ctags == [ctags2_with_tool[0]]
# 3. when
actual_ctags = list(storage.content_ctags_search("hello"))
# 3. then
assert actual_ctags == [
ctags1_with_tool[0],
ctags1_with_tool[2],
ctags2_with_tool[0],
]
# 4. when
actual_ctags = list(storage.content_ctags_search("counter"))
# then
assert actual_ctags == [ctags1_with_tool[1]]
# 5. when
actual_ctags = list(storage.content_ctags_search("result", limit=1))
# then
assert actual_ctags == [ctags2_with_tool[1]]
def test_content_ctags_search_no_result(
self, swh_indexer_storage: IndexerStorageInterface
) -> None:
storage = swh_indexer_storage
actual_ctags = list(storage.content_ctags_search("counter"))
assert not actual_ctags
def test_content_ctags_add__add_new_ctags_added(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool = data.tools["universal-ctags"]
tool_id = tool["id"]
ctag1 = ContentCtagsRow(
id=data.sha1_2,
indexer_configuration_id=tool_id,
name="done",
kind="variable",
line=100,
lang="Scheme",
)
ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool)
# given
storage.content_ctags_add([ctag1])
storage.content_ctags_add([ctag1]) # conflict does nothing
# when
actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
# then
assert actual_ctags == [ctag1_with_tool]
# given
ctag2 = ContentCtagsRow(
id=data.sha1_2,
indexer_configuration_id=tool_id,
name="defn",
kind="function",
line=120,
lang="Scheme",
)
ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool)
storage.content_ctags_add([ctag2])
actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
assert actual_ctags == [ctag1_with_tool, ctag2_with_tool]
def test_content_ctags_add__update_in_place(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool = data.tools["universal-ctags"]
tool_id = tool["id"]
ctag1 = ContentCtagsRow(
id=data.sha1_2,
indexer_configuration_id=tool_id,
name="done",
kind="variable",
line=100,
lang="Scheme",
)
ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool)
# given
storage.content_ctags_add([ctag1])
# when
actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
# then
assert actual_ctags == [ctag1_with_tool]
# given
ctag2 = ContentCtagsRow(
id=data.sha1_2,
indexer_configuration_id=tool_id,
name="defn",
kind="function",
line=120,
lang="Scheme",
)
ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool)
storage.content_ctags_add([ctag1, ctag2])
actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
assert actual_ctags == [ctag1_with_tool, ctag2_with_tool]
def test_add_empty(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
(storage, data) = swh_indexer_storage_with_data
etype = self.endpoint_type
summary = endpoint(storage, etype, "add")([])
assert summary == {"content_ctags:add": 0}
actual_ctags = list(endpoint(storage, etype, "get")([data.sha1_2]))
assert actual_ctags == []
def test_get_unknown(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
(storage, data) = swh_indexer_storage_with_data
etype = self.endpoint_type
actual_ctags = list(endpoint(storage, etype, "get")([data.sha1_2]))
assert actual_ctags == []
class TestIndexerStorageContentMetadata(StorageETypeTester):
"""Test Indexer Storage content_metadata related methods
"""
tool_name = "swh-metadata-detector"
endpoint_type = "content_metadata"
example_data = [
{
"metadata": {
"other": {},
"codeRepository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test",
},
"description": "Simple package.json test for indexer",
"name": "test_metadata",
"version": "0.0.1",
},
},
{"metadata": {"other": {}, "name": "test_metadata", "version": "0.0.1"},},
]
row_class = ContentMetadataRow
class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester):
"""Test Indexer Storage revision_intrinsic_metadata related methods
"""
tool_name = "swh-metadata-detector"
endpoint_type = "revision_intrinsic_metadata"
example_data = [
{
"metadata": {
"other": {},
"codeRepository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test",
},
"description": "Simple package.json test for indexer",
"name": "test_metadata",
"version": "0.0.1",
},
"mappings": ["mapping1"],
},
{
"metadata": {"other": {}, "name": "test_metadata", "version": "0.0.1"},
"mappings": ["mapping2"],
},
]
row_class = RevisionIntrinsicMetadataRow
class TestIndexerStorageContentFossologyLicense(StorageETypeTester):
endpoint_type = "content_fossology_license"
tool_name = "nomos"
example_data = [
{"license": "Apache-2.0"},
{"license": "BSD-2-Clause"},
]
row_class = ContentLicenseRow
# the following tests are disabled because the licenses endpoint behaves differently
@pytest.mark.skip
def test_add__update_in_place_duplicate(self):
pass
@pytest.mark.skip
def test_add_deadlock(self):
pass
# content_fossology_license_missing does not exist
@pytest.mark.skip
def test_missing(self):
pass
def test_content_fossology_license_add__new_license_added(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool = data.tools["nomos"]
tool_id = tool["id"]
license1 = ContentLicenseRow(
id=data.sha1_1, license="Apache-2.0", indexer_configuration_id=tool_id,
)
# given
storage.content_fossology_license_add([license1])
# conflict does nothing
storage.content_fossology_license_add([license1])
# when
actual_licenses = list(storage.content_fossology_license_get([data.sha1_1]))
# then
expected_licenses = [
ContentLicenseRow(id=data.sha1_1, license="Apache-2.0", tool=tool,)
]
assert actual_licenses == expected_licenses
# given
license2 = ContentLicenseRow(
id=data.sha1_1, license="BSD-2-Clause", indexer_configuration_id=tool_id,
)
storage.content_fossology_license_add([license2])
actual_licenses = list(storage.content_fossology_license_get([data.sha1_1]))
expected_licenses.append(
ContentLicenseRow(id=data.sha1_1, license="BSD-2-Clause", tool=tool,)
)
# first license was not removed when the second one was added
assert sorted(actual_licenses) == sorted(expected_licenses)
def test_generate_content_fossology_license_get_partition_failure(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition call with wrong limit input should fail"""
storage, data = swh_indexer_storage_with_data
indexer_configuration_id = 42
with pytest.raises(
IndexerStorageArgumentException, match="limit should not be None"
):
storage.content_fossology_license_get_partition(
indexer_configuration_id, 0, 3, limit=None, # type: ignore
)
def test_generate_content_fossology_license_get_partition_no_limit(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition should return results"""
storage, data = swh_indexer_storage_with_data
# craft some consistent mimetypes
fossology_licenses = data.fossology_licenses
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
# All ids from the db
expected_ids = set([c.id for c in fossology_licenses])
assert len(fossology_licenses) == 10
assert len(mimetypes) == 10
nb_partitions = 4
actual_ids = []
for partition_id in range(nb_partitions):
actual_result = storage.content_fossology_license_get_partition(
indexer_configuration_id, partition_id, nb_partitions
)
assert actual_result.next_page_token is None
actual_ids.extend(actual_result.results)
assert len(set(actual_ids)) == len(expected_ids)
for actual_id in actual_ids:
assert actual_id in expected_ids
def test_generate_content_fossology_license_get_partition_full(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition for a single partition should return available ids
"""
storage, data = swh_indexer_storage_with_data
# craft some consistent mimetypes
fossology_licenses = data.fossology_licenses
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
# All ids from the db
expected_ids = set([c.id for c in fossology_licenses])
actual_result = storage.content_fossology_license_get_partition(
indexer_configuration_id, 0, 1
)
assert actual_result.next_page_token is None
actual_ids = actual_result.results
assert len(set(actual_ids)) == len(expected_ids)
for actual_id in actual_ids:
assert actual_id in expected_ids
def test_generate_content_fossology_license_get_partition_empty(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition when at least one of the partitions is empty"""
storage, data = swh_indexer_storage_with_data
# craft some consistent mimetypes
fossology_licenses = data.fossology_licenses
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
# All ids from the db
expected_ids = set([c.id for c in fossology_licenses])
# nb_partitions = smallest power of 2 such that at least one of
# the partitions is empty
nb_licenses = len(fossology_licenses)
nb_partitions = 1 << math.floor(math.log2(nb_licenses) + 1)
seen_ids = []
for partition_id in range(nb_partitions):
actual_result = storage.content_fossology_license_get_partition(
indexer_configuration_id,
partition_id,
nb_partitions,
limit=nb_licenses + 1,
)
for actual_id in actual_result.results:
seen_ids.append(actual_id)
# Limit is higher than the max number of results
assert actual_result.next_page_token is None
assert set(seen_ids) == expected_ids
def test_generate_content_fossology_license_get_partition_with_pagination(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
"""get_partition should return ids provided with paginationv
"""
storage, data = swh_indexer_storage_with_data
# craft some consistent mimetypes
fossology_licenses = data.fossology_licenses
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
# All ids from the db
expected_ids = [c.id for c in fossology_licenses]
nb_partitions = 4
actual_ids = []
for partition_id in range(nb_partitions):
next_page_token = None
while True:
actual_result = storage.content_fossology_license_get_partition(
indexer_configuration_id,
partition_id,
nb_partitions,
limit=2,
page_token=next_page_token,
)
actual_ids.extend(actual_result.results)
next_page_token = actual_result.next_page_token
if next_page_token is None:
break
assert len(set(actual_ids)) == len(set(expected_ids))
for actual_id in actual_ids:
assert actual_id in expected_ids
def test_add_empty(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
(storage, data) = swh_indexer_storage_with_data
etype = self.endpoint_type
summary = endpoint(storage, etype, "add")([])
assert summary == {"content_fossology_license:add": 0}
actual_license = list(endpoint(storage, etype, "get")([data.sha1_2]))
assert actual_license == []
def test_get_unknown(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
(storage, data) = swh_indexer_storage_with_data
etype = self.endpoint_type
actual_license = list(endpoint(storage, etype, "get")([data.sha1_2]))
assert actual_license == []
class TestIndexerStorageOriginIntrinsicMetadata:
def test_origin_intrinsic_metadata_add(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool_id = data.tools["swh-metadata-detector"]["id"]
metadata = {
"version": None,
"name": None,
}
metadata_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
)
metadata_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
from_revision=data.revision_id_2,
)
# when
storage.revision_intrinsic_metadata_add([metadata_rev])
storage.origin_intrinsic_metadata_add([metadata_origin])
# then
actual_metadata = list(
storage.origin_intrinsic_metadata_get([data.origin_url_1, "no://where"])
)
expected_metadata = [
OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata,
tool=data.tools["swh-metadata-detector"],
from_revision=data.revision_id_2,
mappings=["mapping1"],
)
]
assert actual_metadata == expected_metadata
journal_objects = storage.journal_writer.journal.objects # type: ignore
actual_journal_metadata = [
obj
for (obj_type, obj) in journal_objects
if obj_type == "origin_intrinsic_metadata"
]
assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata))
def test_origin_intrinsic_metadata_add_update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool_id = data.tools["swh-metadata-detector"]["id"]
metadata_v1: Dict[str, Any] = {
"version": None,
"name": None,
}
metadata_rev_v1 = RevisionIntrinsicMetadataRow(
id=data.revision_id_2,
metadata=metadata_v1,
mappings=[],
indexer_configuration_id=tool_id,
)
metadata_origin_v1 = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v1.copy(),
indexer_configuration_id=tool_id,
mappings=[],
from_revision=data.revision_id_2,
)
# given
storage.revision_intrinsic_metadata_add([metadata_rev_v1])
storage.origin_intrinsic_metadata_add([metadata_origin_v1])
# when
actual_metadata = list(
storage.origin_intrinsic_metadata_get([data.origin_url_1])
)
# then
expected_metadata_v1 = [
OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v1,
tool=data.tools["swh-metadata-detector"],
from_revision=data.revision_id_2,
mappings=[],
)
]
assert actual_metadata == expected_metadata_v1
# given
metadata_v2 = metadata_v1.copy()
metadata_v2.update(
{"name": "test_update_duplicated_metadata", "author": "MG",}
)
metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
metadata_origin_v2 = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v2.copy(),
indexer_configuration_id=tool_id,
mappings=["npm"],
from_revision=data.revision_id_1,
)
storage.revision_intrinsic_metadata_add([metadata_rev_v2])
storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
storage.origin_intrinsic_metadata_get([data.origin_url_1])
)
expected_metadata_v2 = [
OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata_v2,
tool=data.tools["swh-metadata-detector"],
from_revision=data.revision_id_1,
mappings=["npm"],
)
]
# metadata did change as the v2 was used to overwrite v1
assert actual_metadata == expected_metadata_v2
def test_origin_intrinsic_metadata_add__deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool_id = data.tools["swh-metadata-detector"]["id"]
origins = ["file:///tmp/origin{:02d}".format(i) for i in range(100)]
example_data1: Dict[str, Any] = {
"metadata": {"version": None, "name": None,},
"mappings": [],
}
example_data2: Dict[str, Any] = {
"metadata": {"version": "v1.1.1", "name": "foo",},
"mappings": [],
}
metadata_rev_v1 = RevisionIntrinsicMetadataRow(
id=data.revision_id_2,
metadata={"version": None, "name": None,},
mappings=[],
indexer_configuration_id=tool_id,
)
data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
from_revision=data.revision_id_2,
indexer_configuration_id=tool_id,
**example_data1,
)
for origin in origins
]
data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
from_revision=data.revision_id_2,
indexer_configuration_id=tool_id,
**example_data2,
)
for origin in origins
]
# Remove one item from each, so that both queries have to succeed for
# all items to be in the DB.
data_v2a = data_v2[1:]
data_v2b = list(reversed(data_v2[0:-1]))
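        # data_v2b is reversed so the two concurrent adds below touch the rows
        # in opposite orders, which is what can trigger a deadlock server-side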
# given
storage.revision_intrinsic_metadata_add([metadata_rev_v1])
storage.origin_intrinsic_metadata_add(data_v1)
# when
actual_data = list(storage.origin_intrinsic_metadata_get(origins))
expected_data_v1 = [
OriginIntrinsicMetadataRow(
id=origin,
from_revision=data.revision_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data1,
)
for origin in origins
]
# then
assert actual_data == expected_data_v1
# given
def f1() -> None:
storage.origin_intrinsic_metadata_add(data_v2a)
def f2() -> None:
storage.origin_intrinsic_metadata_add(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
t2.start()
t1.start()
t1.join()
t2.join()
actual_data = list(storage.origin_intrinsic_metadata_get(origins))
expected_data_v2 = [
OriginIntrinsicMetadataRow(
id=origin,
from_revision=data.revision_id_2,
tool=data.tools["swh-metadata-detector"],
**example_data2,
)
for origin in origins
]
actual_data.sort(key=lambda item: item.id)
assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
for (item, expected_item_v1, expected_item_v2) in zip(
actual_data, expected_data_v1, expected_data_v2
):
assert item in (expected_item_v1, expected_item_v2)
def test_origin_intrinsic_metadata_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool_id = data.tools["swh-metadata-detector"]["id"]
metadata = {
"developmentStatus": None,
"name": None,
}
metadata_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_2,
metadata=metadata,
mappings=["mapping1"],
indexer_configuration_id=tool_id,
)
metadata_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata,
indexer_configuration_id=tool_id,
mappings=["mapping1"],
from_revision=data.revision_id_2,
)
# when
storage.revision_intrinsic_metadata_add([metadata_rev])
with pytest.raises(DuplicateId):
storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin])
def test_origin_intrinsic_metadata_search_fulltext(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool_id = data.tools["swh-metadata-detector"]["id"]
metadata1 = {
"author": "John Doe",
}
metadata1_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
)
metadata1_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
from_revision=data.revision_id_1,
)
metadata2 = {
"author": "Jane Doe",
}
metadata2_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
)
metadata2_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
from_revision=data.revision_id_2,
)
# when
storage.revision_intrinsic_metadata_add([metadata1_rev])
storage.origin_intrinsic_metadata_add([metadata1_origin])
storage.revision_intrinsic_metadata_add([metadata2_rev])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
search = storage.origin_intrinsic_metadata_search_fulltext
assert set([res.id for res in search(["Doe"])]) == set(
[data.origin_url_1, data.origin_url_2]
)
assert [res.id for res in search(["John", "Doe"])] == [data.origin_url_1]
assert [res.id for res in search(["John"])] == [data.origin_url_1]
assert not list(search(["John", "Jane"]))
def test_origin_intrinsic_metadata_search_fulltext_rank(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
# given
tool_id = data.tools["swh-metadata-detector"]["id"]
# The following authors have "Random Person" to add some more content
# to the JSON data, to work around normalization quirks when there
# are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words
# for small values of nb_words).
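        # (e.g. the divisor is 1 + ln(3) ≈ 2.10 for three words, but already
        # 1 + ln(6) ≈ 2.79 for six words)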
metadata1 = {"author": ["Random Person", "John Doe", "Jane Doe",]}
metadata1_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
)
metadata1_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata1,
mappings=[],
indexer_configuration_id=tool_id,
from_revision=data.revision_id_1,
)
metadata2 = {"author": ["Random Person", "Jane Doe",]}
metadata2_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
)
metadata2_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_2,
metadata=metadata2,
mappings=[],
indexer_configuration_id=tool_id,
from_revision=data.revision_id_2,
)
# when
storage.revision_intrinsic_metadata_add([metadata1_rev])
storage.origin_intrinsic_metadata_add([metadata1_origin])
storage.revision_intrinsic_metadata_add([metadata2_rev])
storage.origin_intrinsic_metadata_add([metadata2_origin])
# then
search = storage.origin_intrinsic_metadata_search_fulltext
assert [res.id for res in search(["Doe"])] == [
data.origin_url_1,
data.origin_url_2,
]
assert [res.id for res in search(["Doe"], limit=1)] == [data.origin_url_1]
assert [res.id for res in search(["John"])] == [data.origin_url_1]
assert [res.id for res in search(["Jane"])] == [
data.origin_url_2,
data.origin_url_1,
]
assert [res.id for res in search(["John", "Jane"])] == [data.origin_url_1]
def _fill_origin_intrinsic_metadata(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
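        """Insert origin intrinsic metadata for three origins, produced by two
        different tools and with different mappings, for use by the
        search_by_producer and stats tests below."""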
storage, data = swh_indexer_storage_with_data
tool1_id = data.tools["swh-metadata-detector"]["id"]
tool2_id = data.tools["swh-metadata-detector2"]["id"]
metadata1 = {
"@context": "foo",
"author": "John Doe",
}
metadata1_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_1,
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
)
metadata1_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_1,
metadata=metadata1,
mappings=["npm"],
indexer_configuration_id=tool1_id,
from_revision=data.revision_id_1,
)
metadata2 = {
"@context": "foo",
"author": "Jane Doe",
}
metadata2_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_2,
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
)
metadata2_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_2,
metadata=metadata2,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
from_revision=data.revision_id_2,
)
metadata3 = {
"@context": "foo",
}
metadata3_rev = RevisionIntrinsicMetadataRow(
id=data.revision_id_3,
metadata=metadata3,
mappings=["npm", "gemspec"],
indexer_configuration_id=tool2_id,
)
metadata3_origin = OriginIntrinsicMetadataRow(
id=data.origin_url_3,
metadata=metadata3,
mappings=["pkg-info"],
indexer_configuration_id=tool2_id,
from_revision=data.revision_id_3,
)
storage.revision_intrinsic_metadata_add([metadata1_rev])
storage.origin_intrinsic_metadata_add([metadata1_origin])
storage.revision_intrinsic_metadata_add([metadata2_rev])
storage.origin_intrinsic_metadata_add([metadata2_origin])
storage.revision_intrinsic_metadata_add([metadata3_rev])
storage.origin_intrinsic_metadata_add([metadata3_origin])
def test_origin_intrinsic_metadata_search_by_producer(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
self._fill_origin_intrinsic_metadata(swh_indexer_storage_with_data)
tool1 = data.tools["swh-metadata-detector"]
tool2 = data.tools["swh-metadata-detector2"]
endpoint = storage.origin_intrinsic_metadata_search_by_producer
# test pagination
# no 'page_token' param, return all origins
result = endpoint(ids_only=True)
assert result == PagedResult(
results=[data.origin_url_1, data.origin_url_2, data.origin_url_3,],
next_page_token=None,
)
        # 'page_token' is lower than origin_1, return everything
result = endpoint(page_token=data.origin_url_1[:-1], ids_only=True)
assert result == PagedResult(
results=[data.origin_url_1, data.origin_url_2, data.origin_url_3,],
next_page_token=None,
)
# 'page_token' is origin_3, return nothing
result = endpoint(page_token=data.origin_url_3, ids_only=True)
assert result == PagedResult(results=[], next_page_token=None)
# test limit argument
result = endpoint(page_token=data.origin_url_1[:-1], limit=2, ids_only=True)
assert result == PagedResult(
results=[data.origin_url_1, data.origin_url_2],
next_page_token=data.origin_url_2,
)
result = endpoint(page_token=data.origin_url_1, limit=2, ids_only=True)
assert result == PagedResult(
results=[data.origin_url_2, data.origin_url_3], next_page_token=None,
)
result = endpoint(page_token=data.origin_url_2, limit=2, ids_only=True)
assert result == PagedResult(results=[data.origin_url_3], next_page_token=None,)
# test mappings filtering
result = endpoint(mappings=["npm"], ids_only=True)
assert result == PagedResult(
results=[data.origin_url_1, data.origin_url_2], next_page_token=None,
)
result = endpoint(mappings=["npm", "gemspec"], ids_only=True)
assert result == PagedResult(
results=[data.origin_url_1, data.origin_url_2], next_page_token=None,
)
result = endpoint(mappings=["gemspec"], ids_only=True)
assert result == PagedResult(results=[data.origin_url_2], next_page_token=None,)
result = endpoint(mappings=["pkg-info"], ids_only=True)
assert result == PagedResult(results=[data.origin_url_3], next_page_token=None,)
result = endpoint(mappings=["foobar"], ids_only=True)
assert result == PagedResult(results=[], next_page_token=None,)
# test pagination + mappings
result = endpoint(mappings=["npm"], limit=1, ids_only=True)
assert result == PagedResult(
results=[data.origin_url_1], next_page_token=data.origin_url_1,
)
# test tool filtering
result = endpoint(tool_ids=[tool1["id"]], ids_only=True)
assert result == PagedResult(results=[data.origin_url_1], next_page_token=None,)
result = endpoint(tool_ids=[tool2["id"]], ids_only=True)
assert sorted(result.results) == [data.origin_url_2, data.origin_url_3]
assert result.next_page_token is None
result = endpoint(tool_ids=[tool1["id"], tool2["id"]], ids_only=True)
assert sorted(result.results) == [
data.origin_url_1,
data.origin_url_2,
data.origin_url_3,
]
assert result.next_page_token is None
# test ids_only=False
assert endpoint(mappings=["gemspec"]) == PagedResult(
results=[
OriginIntrinsicMetadataRow(
id=data.origin_url_2,
metadata={"@context": "foo", "author": "Jane Doe",},
mappings=["npm", "gemspec"],
tool=tool2,
from_revision=data.revision_id_2,
)
],
next_page_token=None,
)
def test_origin_intrinsic_metadata_stats(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
self._fill_origin_intrinsic_metadata(swh_indexer_storage_with_data)
result = storage.origin_intrinsic_metadata_stats()
assert result == {
"per_mapping": {
+ "cff": 0,
"gemspec": 1,
"npm": 2,
"pkg-info": 1,
"codemeta": 0,
"maven": 0,
},
"total": 3,
"non_empty": 2,
}
class TestIndexerStorageIndexerConfiguration:
def test_indexer_configuration_add(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
tool = {
"tool_name": "some-unknown-tool",
"tool_version": "some-version",
"tool_configuration": {"debian-package": "some-package"},
}
actual_tool = storage.indexer_configuration_get(tool)
assert actual_tool is None # does not exist
# add it
actual_tools = list(storage.indexer_configuration_add([tool]))
assert len(actual_tools) == 1
actual_tool = actual_tools[0]
assert actual_tool is not None # now it exists
new_id = actual_tool.pop("id")
assert actual_tool == tool
actual_tools2 = list(storage.indexer_configuration_add([tool]))
actual_tool2 = actual_tools2[0]
assert actual_tool2 is not None # now it exists
new_id2 = actual_tool2.pop("id")
assert new_id == new_id2
assert actual_tool == actual_tool2
def test_indexer_configuration_add_multiple(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
tool = {
"tool_name": "some-unknown-tool",
"tool_version": "some-version",
"tool_configuration": {"debian-package": "some-package"},
}
actual_tools = list(storage.indexer_configuration_add([tool]))
assert len(actual_tools) == 1
new_tools = [
tool,
{
"tool_name": "yet-another-tool",
"tool_version": "version",
"tool_configuration": {},
},
]
actual_tools = list(storage.indexer_configuration_add(new_tools))
assert len(actual_tools) == 2
# order not guaranteed, so we iterate over results to check
for tool in actual_tools:
_id = tool.pop("id")
assert _id is not None
assert tool in new_tools
def test_indexer_configuration_get_missing(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
tool = {
"tool_name": "unknown-tool",
"tool_version": "3.1.0rc2-31-ga2cbb8c",
"tool_configuration": {"command_line": "nomossa "},
}
actual_tool = storage.indexer_configuration_get(tool)
assert actual_tool is None
def test_indexer_configuration_get(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
tool = {
"tool_name": "nomos",
"tool_version": "3.1.0rc2-31-ga2cbb8c",
"tool_configuration": {"command_line": "nomossa "},
}
actual_tool = storage.indexer_configuration_get(tool)
assert actual_tool
expected_tool = tool.copy()
del actual_tool["id"]
assert expected_tool == actual_tool
def test_indexer_configuration_metadata_get_missing_context(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
tool = {
"tool_name": "swh-metadata-translator",
"tool_version": "0.0.1",
"tool_configuration": {"context": "unknown-context"},
}
actual_tool = storage.indexer_configuration_get(tool)
assert actual_tool is None
def test_indexer_configuration_metadata_get(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
tool = {
"tool_name": "swh-metadata-translator",
"tool_version": "0.0.1",
"tool_configuration": {"type": "local", "context": "NpmMapping"},
}
storage.indexer_configuration_add([tool])
actual_tool = storage.indexer_configuration_get(tool)
assert actual_tool
expected_tool = tool.copy()
expected_tool["id"] = actual_tool["id"]
assert expected_tool == actual_tool
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
index 265fbea..97698d3 100644
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -1,493 +1,501 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from functools import reduce
import re
from typing import Any, Dict, List
from unittest.mock import patch
from click.testing import CliRunner
from confluent_kafka import Consumer
import pytest
from swh.indexer.cli import indexer_cli_group
from swh.indexer.storage.interface import IndexerStorageInterface
from swh.indexer.storage.model import (
OriginIntrinsicMetadataRow,
RevisionIntrinsicMetadataRow,
)
from swh.journal.writer import get_journal_writer
from swh.model.hashutil import hash_to_bytes
from swh.model.model import OriginVisitStatus
def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
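    """Fill the indexer storage with nb_rows origin and revision intrinsic
    metadata rows, alternating between two registered tools, and return the
    ids of those tools."""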
tools: List[Dict[str, Any]] = [
{"tool_name": "tool %d" % i, "tool_version": "0.0.1", "tool_configuration": {},}
for i in range(2)
]
tools = idx_storage.indexer_configuration_add(tools)
origin_metadata = [
OriginIntrinsicMetadataRow(
id="file://dev/%04d" % origin_id,
from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
mappings=["mapping%d" % (origin_id % 10)],
)
for origin_id in range(nb_rows)
]
revision_metadata = [
RevisionIntrinsicMetadataRow(
id=hash_to_bytes("abcd{:0>36}".format(origin_id)),
indexer_configuration_id=tools[origin_id % 2]["id"],
metadata={"name": "origin %d" % origin_id},
mappings=["mapping%d" % (origin_id % 10)],
)
for origin_id in range(nb_rows)
]
idx_storage.revision_intrinsic_metadata_add(revision_metadata)
idx_storage.origin_intrinsic_metadata_add(origin_metadata)
return [tool["id"] for tool in tools]
def _origins_in_task_args(tasks):
"""Returns the set of origins contained in the arguments of the
provided tasks (assumed to be of type index-origin-metadata)."""
return reduce(
set.union, (set(task["arguments"]["args"][0]) for task in tasks), set()
)
def _assert_tasks_for_origins(tasks, origins):
expected_kwargs = {}
assert {task["type"] for task in tasks} == {"index-origin-metadata"}
assert all(len(task["arguments"]["args"]) == 1 for task in tasks)
for task in tasks:
assert task["arguments"]["kwargs"] == expected_kwargs, task
assert _origins_in_task_args(tasks) == set(["file://dev/%04d" % i for i in origins])
@pytest.fixture
def cli_runner():
return CliRunner()
def test_cli_mapping_list(cli_runner, swh_config):
result = cli_runner.invoke(
indexer_cli_group,
["-C", swh_config, "mapping", "list"],
catch_exceptions=False,
)
expected_output = "\n".join(
- ["codemeta", "gemspec", "maven", "npm", "pkg-info", "",]
+ [
+ "cff",
+ "codemeta",
+ "gemspec",
+ "maven",
+ "npm",
+ "pkg-info",
+ "",
+ ] # must be sorted for test to pass
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
def test_cli_mapping_list_terms(cli_runner, swh_config):
result = cli_runner.invoke(
indexer_cli_group,
["-C", swh_config, "mapping", "list-terms"],
catch_exceptions=False,
)
assert result.exit_code == 0, result.output
assert re.search(r"http://schema.org/url:\n.*npm", result.output)
assert re.search(r"http://schema.org/url:\n.*codemeta", result.output)
assert re.search(
r"https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta",
result.output,
)
def test_cli_mapping_list_terms_exclude(cli_runner, swh_config):
result = cli_runner.invoke(
indexer_cli_group,
["-C", swh_config, "mapping", "list-terms", "--exclude-mapping", "codemeta"],
catch_exceptions=False,
)
assert result.exit_code == 0, result.output
assert re.search(r"http://schema.org/url:\n.*npm", result.output)
assert not re.search(r"http://schema.org/url:\n.*codemeta", result.output)
assert not re.search(
r"https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta",
result.output,
)
@patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3)
@patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3)
def test_cli_origin_metadata_reindex_empty_db(
cli_runner, swh_config, indexer_scheduler, idx_storage, storage
):
result = cli_runner.invoke(
indexer_cli_group,
["-C", swh_config, "schedule", "reindex_origin_metadata",],
catch_exceptions=False,
)
expected_output = "Nothing to do (no origin metadata matched the criteria).\n"
assert result.exit_code == 0, result.output
assert result.output == expected_output
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 0
@patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3)
@patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3)
def test_cli_origin_metadata_reindex_divisor(
cli_runner, swh_config, indexer_scheduler, idx_storage, storage
):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 90)
result = cli_runner.invoke(
indexer_cli_group,
["-C", swh_config, "schedule", "reindex_origin_metadata",],
catch_exceptions=False,
)
# Check the output
expected_output = (
"Scheduled 3 tasks (30 origins).\n"
"Scheduled 6 tasks (60 origins).\n"
"Scheduled 9 tasks (90 origins).\n"
"Done.\n"
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 9
_assert_tasks_for_origins(tasks, range(90))
@patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3)
@patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3)
def test_cli_origin_metadata_reindex_dry_run(
cli_runner, swh_config, indexer_scheduler, idx_storage, storage
):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 90)
result = cli_runner.invoke(
indexer_cli_group,
["-C", swh_config, "schedule", "--dry-run", "reindex_origin_metadata",],
catch_exceptions=False,
)
# Check the output
expected_output = (
"Scheduled 3 tasks (30 origins).\n"
"Scheduled 6 tasks (60 origins).\n"
"Scheduled 9 tasks (90 origins).\n"
"Done.\n"
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 0
@patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3)
@patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3)
def test_cli_origin_metadata_reindex_nondivisor(
cli_runner, swh_config, indexer_scheduler, idx_storage, storage
):
"""Tests the re-indexing when neither origin_batch_size or
task_batch_size is a divisor of nb_origins."""
fill_idx_storage(idx_storage, 70)
result = cli_runner.invoke(
indexer_cli_group,
[
"-C",
swh_config,
"schedule",
"reindex_origin_metadata",
"--batch-size",
"20",
],
catch_exceptions=False,
)
# Check the output
expected_output = (
"Scheduled 3 tasks (60 origins).\n"
"Scheduled 4 tasks (70 origins).\n"
"Done.\n"
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 4
_assert_tasks_for_origins(tasks, range(70))
@patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3)
@patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3)
def test_cli_origin_metadata_reindex_filter_one_mapping(
cli_runner, swh_config, indexer_scheduler, idx_storage, storage
):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 110)
result = cli_runner.invoke(
indexer_cli_group,
[
"-C",
swh_config,
"schedule",
"reindex_origin_metadata",
"--mapping",
"mapping1",
],
catch_exceptions=False,
)
# Check the output
expected_output = "Scheduled 2 tasks (11 origins).\nDone.\n"
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 2
_assert_tasks_for_origins(tasks, [1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101])
@patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3)
@patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3)
def test_cli_origin_metadata_reindex_filter_two_mappings(
cli_runner, swh_config, indexer_scheduler, idx_storage, storage
):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
fill_idx_storage(idx_storage, 110)
result = cli_runner.invoke(
indexer_cli_group,
[
"--config-file",
swh_config,
"schedule",
"reindex_origin_metadata",
"--mapping",
"mapping1",
"--mapping",
"mapping2",
],
catch_exceptions=False,
)
# Check the output
expected_output = "Scheduled 3 tasks (22 origins).\nDone.\n"
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 3
_assert_tasks_for_origins(
tasks,
[
1,
11,
21,
31,
41,
51,
61,
71,
81,
91,
101,
2,
12,
22,
32,
42,
52,
62,
72,
82,
92,
102,
],
)
@patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3)
@patch("swh.scheduler.cli_utils.TASK_BATCH_SIZE", 3)
def test_cli_origin_metadata_reindex_filter_one_tool(
cli_runner, swh_config, indexer_scheduler, idx_storage, storage
):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
tool_ids = fill_idx_storage(idx_storage, 110)
result = cli_runner.invoke(
indexer_cli_group,
[
"-C",
swh_config,
"schedule",
"reindex_origin_metadata",
"--tool-id",
str(tool_ids[0]),
],
catch_exceptions=False,
)
# Check the output
expected_output = (
"Scheduled 3 tasks (30 origins).\n"
"Scheduled 6 tasks (55 origins).\n"
"Done.\n"
)
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks()
assert len(tasks) == 6
_assert_tasks_for_origins(tasks, [x * 2 for x in range(55)])
def now():
return datetime.datetime.now(tz=datetime.timezone.utc)
def test_cli_journal_client(
cli_runner,
swh_config,
indexer_scheduler,
kafka_prefix: str,
kafka_server,
consumer: Consumer,
):
"""Test the 'swh indexer journal-client' cli tool."""
journal_writer = get_journal_writer(
"kafka",
brokers=[kafka_server],
prefix=kafka_prefix,
client_id="test producer",
value_sanitizer=lambda object_type, value: value,
flush_timeout=3, # fail early if something is going wrong
)
visit_statuses = [
OriginVisitStatus(
origin="file:///dev/zero",
visit=1,
date=now(),
status="full",
snapshot=None,
),
OriginVisitStatus(
origin="file:///dev/foobar",
visit=2,
date=now(),
status="full",
snapshot=None,
),
OriginVisitStatus(
origin="file:///tmp/spamegg",
visit=3,
date=now(),
status="full",
snapshot=None,
),
OriginVisitStatus(
origin="file:///dev/0002",
visit=6,
date=now(),
status="full",
snapshot=None,
),
OriginVisitStatus( # will be filtered out due to its 'partial' status
origin="file:///dev/0000",
visit=4,
date=now(),
status="partial",
snapshot=None,
),
OriginVisitStatus( # will be filtered out due to its 'ongoing' status
origin="file:///dev/0001",
visit=5,
date=now(),
status="ongoing",
snapshot=None,
),
]
journal_writer.write_additions("origin_visit_status", visit_statuses)
visit_statuses_full = [vs for vs in visit_statuses if vs.status == "full"]
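    # only visits with a 'full' status are expected to trigger indexing tasks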
result = cli_runner.invoke(
indexer_cli_group,
[
"-C",
swh_config,
"journal-client",
"--broker",
kafka_server,
"--prefix",
kafka_prefix,
"--group-id",
"test-consumer",
"--stop-after-objects",
len(visit_statuses),
"--origin-metadata-task-type",
"index-origin-metadata",
],
catch_exceptions=False,
)
# Check the output
expected_output = "Done.\n"
assert result.exit_code == 0, result.output
assert result.output == expected_output
# Check scheduled tasks
tasks = indexer_scheduler.search_tasks(task_type="index-origin-metadata")
# This can be split into multiple tasks but no more than the origin-visit-statuses
# written in the journal
assert len(tasks) <= len(visit_statuses_full)
actual_origins = []
for task in tasks:
actual_task = dict(task)
assert actual_task["type"] == "index-origin-metadata"
scheduled_origins = actual_task["arguments"]["args"][0]
actual_origins.extend(scheduled_origins)
assert set(actual_origins) == {vs.origin for vs in visit_statuses_full}
def test_cli_journal_client_without_brokers(
cli_runner, swh_config, kafka_prefix: str, kafka_server, consumer: Consumer
):
"""Without brokers configuration, the cli fails."""
with pytest.raises(ValueError, match="brokers"):
cli_runner.invoke(
indexer_cli_group,
["-C", swh_config, "journal-client",],
catch_exceptions=False,
)
diff --git a/swh/indexer/tests/test_codemeta.py b/swh/indexer/tests/test_codemeta.py
index 7214e73..cadc35f 100644
--- a/swh/indexer/tests/test_codemeta.py
+++ b/swh/indexer/tests/test_codemeta.py
@@ -1,258 +1,258 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.indexer.codemeta import CROSSWALK_TABLE, merge_documents, merge_values
def test_crosstable():
assert CROSSWALK_TABLE["NodeJS"] == {
"repository": "http://schema.org/codeRepository",
"os": "http://schema.org/operatingSystem",
"cpu": "http://schema.org/processorRequirements",
- "engines": "http://schema.org/processorRequirements",
+ "engines": "http://schema.org/runtimePlatform",
"author": "http://schema.org/author",
"author.email": "http://schema.org/email",
"author.name": "http://schema.org/name",
- "contributor": "http://schema.org/contributor",
+ "contributors": "http://schema.org/contributor",
"keywords": "http://schema.org/keywords",
"license": "http://schema.org/license",
"version": "http://schema.org/version",
"description": "http://schema.org/description",
"name": "http://schema.org/name",
"bugs": "https://codemeta.github.io/terms/issueTracker",
"homepage": "http://schema.org/url",
}
def test_merge_values():
assert merge_values("a", "b") == ["a", "b"]
assert merge_values(["a", "b"], "c") == ["a", "b", "c"]
assert merge_values("a", ["b", "c"]) == ["a", "b", "c"]
assert merge_values({"@list": ["a"]}, {"@list": ["b"]}) == {"@list": ["a", "b"]}
assert merge_values({"@list": ["a", "b"]}, {"@list": ["c"]}) == {
"@list": ["a", "b", "c"]
}
with pytest.raises(ValueError):
merge_values({"@list": ["a"]}, "b")
with pytest.raises(ValueError):
merge_values("a", {"@list": ["b"]})
with pytest.raises(ValueError):
merge_values({"@list": ["a"]}, ["b"])
with pytest.raises(ValueError):
merge_values(["a"], {"@list": ["b"]})
assert merge_values("a", None) == "a"
assert merge_values(["a", "b"], None) == ["a", "b"]
assert merge_values(None, ["b", "c"]) == ["b", "c"]
assert merge_values({"@list": ["a"]}, None) == {"@list": ["a"]}
assert merge_values(None, {"@list": ["a"]}) == {"@list": ["a"]}
def test_merge_documents():
"""
Test the creation of a coherent minimal metadata set
"""
# given
metadata_list = [
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "test_1",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "test_0_1",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "test_metadata",
"version": "0.0.2",
"author": {"type": "Person", "name": "moranegg",},
},
]
# when
results = merge_documents(metadata_list)
# then
expected_results = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"name": ["test_1", "test_0_1", "test_metadata"],
"author": [{"type": "Person", "name": "moranegg"}],
"codeRepository": "git+https://github.com/moranegg/metadata_test",
}
assert results == expected_results
def test_merge_documents_ids():
# given
metadata_list = [
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"id": "http://example.org/test1",
"name": "test_1",
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"id": "http://example.org/test2",
"name": "test_2",
},
]
# when
results = merge_documents(metadata_list)
# then
expected_results = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"id": "http://example.org/test1",
"schema:sameAs": "http://example.org/test2",
"name": ["test_1", "test_2"],
}
assert results == expected_results
def test_merge_documents_duplicate_ids():
# given
metadata_list = [
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"id": "http://example.org/test1",
"name": "test_1",
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"id": "http://example.org/test1",
"name": "test_1b",
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"id": "http://example.org/test2",
"name": "test_2",
},
]
# when
results = merge_documents(metadata_list)
# then
expected_results = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"id": "http://example.org/test1",
"schema:sameAs": "http://example.org/test2",
"name": ["test_1", "test_1b", "test_2"],
}
assert results == expected_results
def test_merge_documents_lists():
"""Tests merging two @list elements."""
# given
metadata_list = [
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"@list": [{"name": "test_1"},]},
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"@list": [{"name": "test_2"},]},
},
]
# when
results = merge_documents(metadata_list)
# then
expected_results = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": [{"name": "test_1"}, {"name": "test_2"},],
}
assert results == expected_results
def test_merge_documents_lists_duplicates():
"""Tests merging two @list elements with a duplicate subelement."""
# given
metadata_list = [
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"@list": [{"name": "test_1"},]},
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"@list": [{"name": "test_2"}, {"name": "test_1"},]},
},
]
# when
results = merge_documents(metadata_list)
# then
expected_results = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": [{"name": "test_1"}, {"name": "test_2"},],
}
assert results == expected_results
def test_merge_documents_list_left():
"""Tests merging a singleton with an @list."""
# given
metadata_list = [
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"name": "test_1"},
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"@list": [{"name": "test_2"},]},
},
]
# when
results = merge_documents(metadata_list)
# then
expected_results = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": [{"name": "test_1"}, {"name": "test_2"},],
}
assert results == expected_results
def test_merge_documents_list_right():
"""Tests merging an @list with a singleton."""
# given
metadata_list = [
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"@list": [{"name": "test_1"},]},
},
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {"name": "test_2"},
},
]
# when
results = merge_documents(metadata_list)
# then
expected_results = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": [{"name": "test_1"}, {"name": "test_2"},],
}
assert results == expected_results
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 0505934..d3bddc5 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,1205 +1,1318 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import unittest
from hypothesis import HealthCheck, given, settings, strategies
from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow
from swh.indexer.tests.utils import DIRECTORY2, REVISION
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Directory, DirectoryEntry, Revision
from .utils import (
BASE_TEST_CONFIG,
YARN_PARSER_METADATA,
fill_obj_storage,
fill_storage,
json_document_strategy,
xml_document_strategy,
)
TRANSLATOR_TOOL = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {"type": "local", "context": "NpmMapping"},
}
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
assert False, "should not be called; the rev indexer configures it."
REVISION_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
"""
shows the entire diff in the results
"""
self.maxDiff = None
self.npm_mapping = MAPPINGS["NpmMapping"]()
self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
self.maven_mapping = MAPPINGS["MavenMapping"]()
self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]()
self.gemspec_mapping = MAPPINGS["GemspecMapping"]()
+ self.cff_mapping = MAPPINGS["CffMapping"]()
def test_compute_metadata_none(self):
"""
        testing that translating empty content
        returns None (no metadata found)
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
+ def test_compute_metadata_cff(self):
+ """
+ testing CITATION.cff translation
+ """
+ # given
+ content = """# YAML 1.2
+---
+abstract: "Command line program to convert from Citation File \
+Format to various other formats such as BibTeX, EndNote, RIS, \
+schema.org, CodeMeta, and .zenodo.json."
+authors:
+ -
+ affiliation: "Netherlands eScience Center"
+ family-names: Klaver
+ given-names: Tom
+ -
+ affiliation: "Humboldt-Universität zu Berlin"
+ family-names: Druskat
+ given-names: Stephan
+ orcid: https://orcid.org/0000-0003-4925-7248
+cff-version: "1.0.3"
+date-released: 2019-11-12
+doi: 10.5281/zenodo.1162057
+keywords:
+ - "citation"
+ - "bibliography"
+ - "cff"
+ - "CITATION.cff"
+license: Apache-2.0
+message: "If you use this software, please cite it using these metadata."
+repository-code: "https://github.com/citation-file-format/cff-converter-python"
+title: cffconvert
+version: "1.4.0-alpha0"
+ """.encode(
+ "utf-8"
+ )
+
+ expected = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [
+ {
+ "type": "Person",
+ "affiliation": {
+ "type": "Organization",
+ "name": "Netherlands eScience Center",
+ },
+ "familyName": "Klaver",
+ "givenName": "Tom",
+ },
+ {
+ "id": "https://orcid.org/0000-0003-4925-7248",
+ "type": "Person",
+ "affiliation": {
+ "type": "Organization",
+ "name": "Humboldt-Universität zu Berlin",
+ },
+ "familyName": "Druskat",
+ "givenName": "Stephan",
+ },
+ ],
+ "codeRepository": (
+ "https://github.com/citation-file-format/cff-converter-python"
+ ),
+ "datePublished": "2019-11-12",
+ "description": """Command line program to convert from \
+Citation File Format to various other formats such as BibTeX, EndNote, \
+RIS, schema.org, CodeMeta, and .zenodo.json.""",
+ "identifier": "https://doi.org/10.5281/zenodo.1162057",
+ "keywords": ["citation", "bibliography", "cff", "CITATION.cff"],
+ "license": "https://spdx.org/licenses/Apache-2.0",
+ "version": "1.4.0-alpha0",
+ }
+
+ # when
+ result = self.cff_mapping.translate(content)
+ # then
+ self.assertEqual(expected, result)
+
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"author": [
{"type": "Person", "name": "Morane G", "email": "moranegg@example.com",}
],
}
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = [
hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
]
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
config = BASE_TEST_CONFIG.copy()
config["tools"] = [TRANSLATOR_TOOL]
metadata_indexer = ContentMetadataTestIndexer(config=config)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# when
metadata_indexer.run(sha1s)
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
ContentMetadataRow(
id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
tool=TRANSLATOR_TOOL,
metadata={
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"description": "Simple package.json test for indexer",
"name": "test_metadata",
"version": "0.0.1",
},
),
ContentMetadataRow(
id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
tool=TRANSLATOR_TOOL,
metadata={
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"issueTracker": "https://github.com/npm/npm/issues",
"author": [
{
"type": "Person",
"name": "Isaac Z. Schlueter",
"email": "i@izs.me",
"url": "http://blog.izs.me",
}
],
"codeRepository": "git+https://github.com/npm/npm",
"description": "a package manager for JavaScript",
"license": "https://spdx.org/licenses/Artistic-2.0",
"version": "5.0.3",
"name": "npm",
"keywords": [
"install",
"modules",
"package manager",
"package.json",
],
"url": "https://docs.npmjs.com/",
},
),
]
for result in results:
del result.tool["id"]
# The assertion below returns False sometimes because of nested lists
self.assertEqual(expected_results, results)
def test_npm_bugs_normalization(self):
# valid dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
},
)
# "invalid" dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
},
)
# string
package_json = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
},
)
def test_npm_repository_normalization(self):
# normal
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
},
)
# missing url
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
},
)
# github shortcut
package_json = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
self.assertEqual(result, expected_result)
# github shortshortcut
package_json = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, expected_result)
# gitlab shortcut
package_json = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://gitlab.com/user/repo.git",
"type": "SoftwareSourceCode",
},
)
def test_detect_metadata_package_json(self):
# given
df = [
{
"sha1_git": b"abc",
"name": b"index.js",
"target": b"abc",
"length": 897,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"bcd",
},
{
"sha1_git": b"aab",
"name": b"package.json",
"target": b"aab",
"length": 712,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"cde",
},
]
# when
results = detect_metadata(df)
expected_results = {"NpmMapping": [b"cde"]}
# then
self.assertEqual(expected_results, results)
+ def test_detect_metadata_codemeta_json_uppercase(self):
+ # given
+ df = [
+ {
+ "sha1_git": b"abc",
+ "name": b"index.html",
+ "target": b"abc",
+ "length": 897,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"bcd",
+ },
+ {
+ "sha1_git": b"aab",
+ "name": b"CODEMETA.json",
+ "target": b"aab",
+ "length": 712,
+ "status": "visible",
+ "type": "file",
+ "perms": 33188,
+ "dir_id": b"dir_a",
+ "sha1": b"bcd",
+ },
+ ]
+ # when
+ results = detect_metadata(df)
+
+ expected_results = {"CodemetaMapping": [b"bcd"]}
+ # then
+ self.assertEqual(expected_results, results)
+
def test_compute_metadata_valid_codemeta(self):
raw_content = b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738",
},
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation",
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": ["metadata", "software"],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD",
}
result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_codemeta_alternate_context(self):
raw_content = b"""{
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
}
result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"codeRepository": (
"http://repo1.maven.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_compute_metadata_maven_almost_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_compute_metadata_maven_invalid_xml(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error parsing XML from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_unknown_encoding(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error detecting XML encoding from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_invalid_encoding(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error unidecoding XML from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_minimal(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_empty_nodes(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"version": "1.2.3",
},
)
def test_compute_metadata_maven_invalid_licenses(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
foo
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_multiple(self):
"""Tests when there are multiple code repos and licenses."""
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": [
"https://www.apache.org/licenses/LICENSE-2.0.txt",
"https://opensource.org/licenses/MIT",
],
"codeRepository": [
"http://repo1.maven.org/maven2/com/mycompany/app/my-app",
"http://example.org/maven2/com/mycompany/app/my-app",
],
},
)
def test_compute_metadata_pkginfo(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
========
\x20
core library for swh's modules:
- config parser
- hash computations
- serialization
- logging mechanism
\x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertCountEqual(
result["description"],
[
"Software Heritage core utilities", # note the comma here
"swh-core\n"
"========\n"
"\n"
"core library for swh's modules:\n"
"- config parser\n"
"- hash computations\n"
"- serialization\n"
"- logging mechanism\n"
"",
],
result,
)
del result["description"]
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"url": "https://forge.softwareheritage.org/diffusion/DCORE/",
"name": "swh.core",
"author": [
{
"type": "Person",
"name": "Software Heritage developers",
"email": "swh-devel@inria.fr",
}
],
"version": "0.0.49",
},
)
def test_compute_metadata_pkginfo_utf8(self):
raw_content = b"""\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
Hydrology N\xc2\xb083
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "snowpyt",
"description": "foo\nHydrology N°83",
},
)
def test_compute_metadata_pkginfo_keywords(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"keywords": ["foo", "bar", "baz"],
},
)
def test_compute_metadata_pkginfo_license(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"license": "MIT",
},
)
def test_gemspec_base(self):
raw_content = b"""
Gem::Specification.new do |s|
s.name = 'example'
s.version = '0.1.0'
s.licenses = ['MIT']
s.summary = "This is an example!"
s.description = "Much longer explanation of the example!"
s.authors = ["Ruby Coder"]
s.email = 'rubycoder@example.com'
s.files = ["lib/example.rb"]
s.homepage = 'https://rubygems.org/gems/example'
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(
result.pop("description"),
["This is an example!", "Much longer explanation of the example!"],
)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder"}],
"name": "example",
"license": "https://spdx.org/licenses/MIT",
"codeRepository": "https://rubygems.org/gems/example",
"email": "rubycoder@example.com",
"version": "0.1.0",
},
)
def test_gemspec_two_author_fields(self):
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1"]
s.author = "Ruby Coder2"
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(
result.pop("author"),
[
{"type": "Person", "name": "Ruby Coder1"},
{"type": "Person", "name": "Ruby Coder2"},
],
)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_gemspec_invalid_author(self):
raw_content = b"""
Gem::Specification.new do |s|
s.author = ["Ruby Coder"]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
raw_content = b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder1"}],
},
)
def test_gemspec_alternative_header(self):
raw_content = b"""
require './lib/version'
Gem::Specification.new { |s|
s.name = 'rb-system-with-aliases'
s.summary = 'execute system commands with aliases'
}
"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "rb-system-with-aliases",
"description": "execute system commands with aliases",
},
)
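# The following property-based tests feed randomly generated documents to
# the mappings and only check that translate() does not crash on them.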
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(NpmMapping.mapping)))
def test_npm_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.npm_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=CODEMETA_TERMS))
def test_codemeta_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.codemeta_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
xml_document_strategy(
keys=list(MavenMapping.mapping),
root="project",
xmlns="http://maven.apache.org/POM/4.0.0",
)
)
def test_maven_adversarial(self, doc):
self.maven_mapping.translate(doc)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
strategies.dictionaries(
# keys
strategies.one_of(
strategies.text(), *map(strategies.just, GemspecMapping.mapping)
),
# values
strategies.recursive(
strategies.characters(),
lambda children: strategies.lists(children, min_size=1),
),
)
)
def test_gemspec_adversarial(self, doc):
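# render the random dict as a synthetic gemspec and check the mapping
# survives it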
parts = [b"Gem::Specification.new do |s|\n"]
for (k, v) in doc.items():
parts.append(" s.{} = {}\n".format(k, repr(v)).encode())
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
rev = REVISION
assert rev.directory == DIRECTORY2.id
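# pre-populate the content-level metadata that the revision-level
# indexer is expected to aggregate for this revision's root directory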
metadata_indexer.idx_storage.content_metadata_add(
[
ContentMetadataRow(
id=DIRECTORY2.entries[0].target,
indexer_configuration_id=tool["id"],
metadata=YARN_PARSER_METADATA,
)
]
)
metadata_indexer.run([rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
)
expected_results = [
RevisionIntrinsicMetadataRow(
id=rev.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
]
for result in results:
del result.tool["id"]
# then
self.assertEqual(results, expected_results)
def test_revision_metadata_indexer_single_root_dir(self):
metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory, that is the only directory at the root
# of the revision
rev = REVISION
assert rev.directory == DIRECTORY2.id
directory = Directory(
entries=(
DirectoryEntry(
name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384,
),
),
)
assert directory.id is not None
metadata_indexer.storage.directory_add([directory])
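# build a variant of the revision whose root directory is the wrapping
# directory added above (the old id is dropped since it no longer matches)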
new_rev_dict = {**rev.to_dict(), "directory": directory.id}
new_rev_dict.pop("id")
new_rev = Revision.from_dict(new_rev_dict)
metadata_indexer.storage.revision_add([new_rev])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
metadata_indexer.idx_storage.content_metadata_add(
[
ContentMetadataRow(
id=DIRECTORY2.entries[0].target,
indexer_configuration_id=tool["id"],
metadata=YARN_PARSER_METADATA,
)
]
)
metadata_indexer.run([new_rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
)
expected_results = [
RevisionIntrinsicMetadataRow(
id=new_rev.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
]
for result in results:
del result.tool["id"]
# then
self.assertEqual(results, expected_results)
diff --git a/tox.ini b/tox.ini
index 56898f0..3afda02 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,40 +1,78 @@
[tox]
envlist=black,flake8,mypy,py3
[testenv]
extras =
testing
deps =
pytest-cov
swh-scheduler[testing] >= 0.5.0
swh-storage[testing] >= 0.10.0
dev: pdbpp
commands =
pytest --doctest-modules \
!slow: --hypothesis-profile=fast \
slow: --hypothesis-profile=slow \
{envsitepackagesdir}/swh/indexer \
--cov={envsitepackagesdir}/swh/indexer \
--cov-branch {posargs}
[testenv:black]
skip_install = true
deps =
black==19.10b0
commands =
{envpython} -m black --check swh
[testenv:flake8]
skip_install = true
deps =
flake8
commands =
{envpython} -m flake8
[testenv:mypy]
extras =
testing
deps =
mypy
commands =
mypy swh
+
+# build documentation outside swh-environment using the current
+# git HEAD of swh-docs; this runs on CI for each diff to prevent
+# breaking the doc build
+[testenv:sphinx]
+whitelist_externals = make
+usedevelop = true
+extras =
+ testing
+deps =
+ # fetch and install swh-docs in develop mode
+ -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs
+
+setenv =
+ SWH_PACKAGE_DOC_TOX_BUILD = 1
+ # turn warnings into errors
+ SPHINXOPTS = -W
+commands =
+ make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs
+
+
+# build documentation only inside swh-environment, using the local
+# state of the swh-docs package
+[testenv:sphinx-dev]
+whitelist_externals = make
+usedevelop = true
+extras =
+ testing
+deps =
+ # install swh-docs in develop mode
+ -e ../swh-docs
+
+setenv =
+ SWH_PACKAGE_DOC_TOX_BUILD = 1
+ # turn warnings into errors
+ SPHINXOPTS = -W
+commands =
+ make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs