diff --git a/swh/indexer/data/maven/CITATION b/swh/indexer/data/maven/CITATION new file mode 100644 --- /dev/null +++ b/swh/indexer/data/maven/CITATION @@ -0,0 +1 @@ +https://maven.apache.org/ref/3.6.0/maven-model/maven.html (visited on 2019-02-12) diff --git a/swh/indexer/data/maven/LICENSE b/swh/indexer/data/maven/LICENSE new file mode 100644 --- /dev/null +++ b/swh/indexer/data/maven/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/swh/indexer/data/maven/maven-4.0.0.xsd b/swh/indexer/data/maven/maven-4.0.0.xsd new file mode 100644 --- /dev/null +++ b/swh/indexer/data/maven/maven-4.0.0.xsd @@ -0,0 +1,2484 @@ + + + + + + + + + 3.0.0+ + + + The <code>&lt;project&gt;</code> element is the root of the descriptor. + The following table lists all of the possible child elements. + + + + + + + 3.0.0+ + + + The <code>&lt;project&gt;</code> element is the root of the descriptor. + The following table lists all of the possible child elements. + + + + + + + 4.0.0+ + Declares to which version of project descriptor this POM conforms. + + + + + 4.0.0+ + The location of the parent project, if one exists. Values from the parent + project will be the default for this project if they are left unspecified. The location + is given as a group ID, artifact ID and version. + + + + + 3.0.0+ + + + A universally unique identifier for a project. It is normal to + use a fully-qualified package name to distinguish it from other + projects with a similar name (eg. <code>org.apache.maven</code>). + + + + + + + 3.0.0+ + The identifier for this artifact that is unique within the group given by the + group ID. An artifact is something that is either produced or used by a project. + Examples of artifacts produced by Maven for a project include: JARs, source and binary + distributions, and WARs. 
+ + + + + 4.0.0+ + The current version of the artifact produced by this project. + + + + + 4.0.0+ + + + The type of artifact this project produces, for example <code>jar</code> + <code>war</code> + <code>ear</code> + <code>pom</code>. + Plugins can create their own packaging, and + therefore their own packaging types, + so this list does not contain all possible types. + + + + + + + 3.0.0+ + The full name of the project. + + + + + 3.0.0+ + A detailed description of the project, used by Maven whenever it needs to + describe the project, such as on the web site. While this element can be specified as + CDATA to enable the use of HTML tags within the description, it is discouraged to allow + plain text representation. If you need to modify the index page of the generated web + site, you are able to specify your own instead of adjusting this text. + + + + + 3.0.0+ + + + The URL to the project's homepage. + <br /><b>Default value is</b>: parent value [+ path adjustment] + artifactId + + + + + + + 3.0.0+ + The year of the project's inception, specified with 4 digits. This value is + used when generating copyright notices as well as being informational. + + + + + 3.0.0+ + This element describes various attributes of the organization to which the + project belongs. These attributes are utilized when documentation is created (for + copyright notices and links). + + + + + 3.0.0+ + + + This element describes all of the licenses for this project. + Each license is described by a <code>license</code> element, which + is then described by additional elements. + Projects should only list the license(s) that applies to the project + and not the licenses that apply to dependencies. + If multiple licenses are listed, it is assumed that the user can select + any of them, not that they must accept all. + + + + + + + + + + + + 3.0.0+ + Describes the committers of a project. + + + + + + + + + + 3.0.0+ + Describes the contributors to a project that are not yet committers. 
+ + + + + + + + + + 3.0.0+ + Contains information about a project's mailing lists. + + + + + + + + + + 4.0.0+ + Describes the prerequisites in the build environment for this project. + + + + + 4.0.0+ + The modules (sometimes called subprojects) to build as a part of this + project. Each module listed is a relative path to the directory containing the module. + To be consistent with the way default urls are calculated from parent, it is recommended + to have module names match artifact ids. + + + + + + + + + + 4.0.0+ + Specification for the SCM used by the project, such as CVS, Subversion, etc. + + + + + 4.0.0+ + The project's issue management system information. + + + + + 4.0.0+ + The project's continuous integration information. + + + + + 4.0.0+ + Distribution information for a project that enables deployment of the site + and artifacts to remote web servers and repositories respectively. + + + + + 4.0.0+ + + + Properties that can be used throughout the POM as a substitution, and + are used as filters in resources if enabled. + The format is <code>&lt;name&gt;value&lt;/name&gt;</code>. + + + + + + + + + + + + 4.0.0+ + Default dependency information for projects that inherit from this one. The + dependencies in this section are not immediately resolved. Instead, when a POM derived + from this one declares a dependency described by a matching groupId and artifactId, the + version and other values from this section are used for that dependency if they were not + already specified. + + + + + 3.0.0+ + + + This element describes all of the dependencies associated with a + project. + These dependencies are used to construct a classpath for your + project during the build process. They are automatically downloaded from the + repositories defined in this project. + See <a href="http://maven.apache.org/guides/introduction/introduction-to-dependency-mechanism.html">the + dependency mechanism</a> for more information. 
+ + + + + + + + + + + + 4.0.0+ + The lists of the remote repositories for discovering dependencies and + extensions. + + + + + + + + + + 4.0.0+ + The lists of the remote repositories for discovering plugins for builds and + reports. + + + + + + + + + + 3.0.0+ + Information required to build the project. + + + + + 4.0.0+ + + + <b>Deprecated</b>. Now ignored by Maven. + + + + + + + + + + + + 4.0.0+ + + + This element includes the specification of report plugins to use + to generate the reports on the Maven-generated site. + These reports will be run when a user executes <code>mvn site</code>. + All of the reports will be included in the navigation bar for browsing. + + + + + + + 4.0.0+ + A listing of project-local build profiles which will modify the build process + when activated. + + + + + + + + + + + + 4.0.0+ + + + The <code>&lt;parent&gt;</code> element contains information required to locate the parent project from which + this project will inherit from. + <strong>Note:</strong> The children of this element are not interpolated and must be given as literal values. + + + + + + + 4.0.0+ + The group id of the parent project to inherit from. + + + + + 4.0.0+ + The artifact id of the parent project to inherit from. + + + + + 4.0.0+ + The version of the parent project to inherit. + + + + + 4.0.0+ + + + The relative path of the parent <code>pom.xml</code> file within the check out. + If not specified, it defaults to <code>../pom.xml</code>. + Maven looks for the parent POM first in this location on + the filesystem, then the local repository, and lastly in the remote repo. + <code>relativePath</code> allows you to select a different location, + for example when your structure is flat, or deeper without an intermediate parent POM. + However, the group ID, artifact ID and version are still required, + and must match the file in the location given or it will revert to the repository for the POM. 
+ This feature is only for enhancing the development in a local checkout of that project. + Set the value to an empty string in case you want to disable the feature and always resolve + the parent POM from the repositories. + + + + + + + + + 3.0.0+ + Specifies the organization that produces this project. + + + + + 3.0.0+ + The full name of the organization. + + + + + 3.0.0+ + The URL to the organization's home page. + + + + + + + 4.0.0+ + This element describes all that pertains to distribution for a project. It is + primarily used for deployment of artifacts and the site produced by the build. + + + + + 4.0.0+ + Information needed to deploy the artifacts generated by the project to a + remote repository. + + + + + 4.0.0+ + + + Where to deploy snapshots of artifacts to. If not given, it defaults to the + <code>repository</code> element. + + + + + + + 4.0.0+ + Information needed for deploying the web site of the project. + + + + + 4.0.0+ + + + The URL of the project's download page. If not given users will be + referred to the homepage given by <code>url</code>. + This is given to assist in locating artifacts that are not in the repository due to + licensing restrictions. + + + + + + + 4.0.0+ + Relocation information of the artifact if it has been moved to a new group ID + and/or artifact ID. + + + + + 4.0.0+ + + + Gives the status of this artifact in the remote repository. + This must not be set in your local project, as it is updated by + tools placing it in the repository. Valid values are: <code>none</code> (default), + <code>converted</code> (repository manager converted this from a Maven 1 POM), + <code>partner</code> + (directly synced from a partner Maven 2 repository), <code>deployed</code> (was deployed from a Maven 2 + instance), <code>verified</code> (has been hand verified as correct and final). + + + + + + + + + 4.0.0+ + Describes where an artifact has moved to. If any of the values are omitted, it is + assumed to be the same as it was before. 
+ + + + + 4.0.0+ + The group ID the artifact has moved to. + + + + + 4.0.0+ + The new artifact ID of the artifact. + + + + + 4.0.0+ + The new version of the artifact. + + + + + 4.0.0+ + An additional message to show the user about the move, such as the reason. + + + + + + + 4.0.0+ + Contains the information needed for deploying websites. + + + + + 4.0.0+ + + + A unique identifier for a deployment location. This is used to match the + site to configuration in the <code>settings.xml</code> file, for example. + + + + + + + 4.0.0+ + Human readable name of the deployment location. + + + + + 4.0.0+ + + + The url of the location where website is deployed, in the form <code>protocol://hostname/path</code>. + <br /><b>Default value is</b>: parent value [+ path adjustment] + artifactId + + + + + + + + + 4.0.0+ + Repository contains the information needed for deploying to the remote + repository. + + + + + 4.0.0+ + Whether to assign snapshots a unique version comprised of the timestamp and + build number, or to use the same version each time + + + + + 4.0.0+ + How to handle downloading of releases from this repository. + + + + + 4.0.0+ + How to handle downloading of snapshots from this repository. + + + + + 4.0.0+ + + + A unique identifier for a repository. This is used to match the repository + to configuration in the <code>settings.xml</code> file, for example. Furthermore, the identifier is + used during POM inheritance and profile injection to detect repositories that should be merged. + + + + + + + 4.0.0+ + Human readable name of the repository. + + + + + 4.0.0+ + + + The url of the repository, in the form <code>protocol://hostname/path</code>. + + + + + + + 4.0.0+ + + + The type of layout this repository uses for locating and storing artifacts - + can be <code>legacy</code> or <code>default</code>. + + + + + + + + + 4.0.0+ + Download policy. + + + + + 4.0.0+ + + + Whether to use this repository for downloading this type of artifact. 
Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>true</code>. + + + + + + + 4.0.0+ + + + The frequency for downloading updates - can be + <code>always,</code> + <code>daily</code> + (default), + <code>interval:XXX</code> + (in minutes) or + <code>never</code> + (only if it doesn't exist locally). + + + + + + + 4.0.0+ + + + What to do when verification of an artifact checksum fails. Valid values are + <code>ignore</code> + , + <code>fail</code> + or + <code>warn</code> + (the default). + + + + + + + + + 4.0.0+ + Describes the prerequisites a project can have. + + + + + 4.0.0+ + + For a plugin project, the minimum version of Maven required to use + the resulting plugin.<br /> + For specifying the minimum version of Maven required to build a + project, this element is <b>deprecated</b>. Use the Maven Enforcer + Plugin's <a href="https://maven.apache.org/enforcer/enforcer-rules/requireMavenVersion.html"><code>requireMavenVersion</code></a> + rule instead. + + + + + + + + + 3.0.0+ + Description of a person who has contributed to the project, but who does not have + commit privileges. Usually, these contributions come in the form of patches submitted. + + + + + 3.0.0+ + The full name of the contributor. + + + + + 3.0.0+ + The email address of the contributor. + + + + + 3.0.0+ + The URL for the homepage of the contributor. + + + + + 3.0.0+ + The organization to which the contributor belongs. + + + + + 3.0.0+ + The URL of the organization. + + + + + 3.0.0+ + + + The roles the contributor plays in the project. Each role is described by a + <code>role</code> element, the body of which is a role name. This can also be used to + describe the contribution. + + + + + + + + + + + + 3.0.0+ + + + The timezone the contributor is in. 
Typically, this is a number in the range + <a href="http://en.wikipedia.org/wiki/UTC%E2%88%9212:00">-12</a> to <a href="http://en.wikipedia.org/wiki/UTC%2B14:00">+14</a> + or a valid time zone id like "America/Montreal" (UTC-05:00) or "Europe/Paris" (UTC+01:00). + + + + + + + 3.0.0+ + Properties about the contributor, such as an instant messenger handle. + + + + + + + + + + + + 4.0.0+ + + + The <code>&lt;scm&gt;</code> element contains information required for the SCM + (Source Control Management) of the project. + + + + + + + 4.0.0+ + + + The source control management system URL + that describes the repository and how to connect to the + repository. For more information, see the + <a href="http://maven.apache.org/scm/scm-url-format.html">URL format</a> + and <a href="http://maven.apache.org/scm/scms-overview.html">list of supported SCMs</a>. + This connection is read-only. + <br /><b>Default value is</b>: parent value [+ path adjustment] + artifactId + + + + + + + 4.0.0+ + + + Just like <code>connection</code>, but for developers, i.e. this scm connection + will not be read only. + <br /><b>Default value is</b>: parent value [+ path adjustment] + artifactId + + + + + + + 4.0.0+ + The tag of current code. By default, it's set to HEAD during development. + + + + + 4.0.0+ + + + The URL to the project's browsable SCM repository, such as ViewVC or Fisheye. + <br /><b>Default value is</b>: parent value [+ path adjustment] + artifactId + + + + + + + + + 4.0.0+ + A repository contains the information needed for establishing connections with + remote repository. + + + + + 4.0.0+ + How to handle downloading of releases from this repository. + + + + + 4.0.0+ + How to handle downloading of snapshots from this repository. + + + + + 4.0.0+ + + + A unique identifier for a repository. This is used to match the repository + to configuration in the <code>settings.xml</code> file, for example. 
Furthermore, the identifier is + used during POM inheritance and profile injection to detect repositories that should be merged. + + + + + + + 4.0.0+ + Human readable name of the repository. + + + + + 4.0.0+ + + + The url of the repository, in the form <code>protocol://hostname/path</code>. + + + + + + + 4.0.0+ + + + The type of layout this repository uses for locating and storing artifacts - + can be <code>legacy</code> or <code>default</code>. + + + + + + + + + 4.0.0+ + Information about the issue tracking (or bug tracking) system used to manage this + project. + + + + + 4.0.0+ + The name of the issue management system, e.g. Bugzilla + + + + + 4.0.0+ + URL for the issue management system used by the project. + + + + + + + 4.0.0+ + + + The <code>&lt;CiManagement&gt;</code> element contains information required for the + continuous integration system of the project. + + + + + + + 4.0.0+ + + + The name of the continuous integration system, e.g. <code>continuum</code>. + + + + + + + 4.0.0+ + URL for the continuous integration system used by the project if it has a web + interface. + + + + + 4.0.0+ + Configuration for notifying developers/users when a build is unsuccessful, + including user information and notification mode. + + + + + + + + + + + + 4.0.0+ + Configures one method for notifying users/developers when a build breaks. + + + + + 4.0.0+ + The mechanism used to deliver notifications. + + + + + 4.0.0+ + Whether to send notifications on error. + + + + + 4.0.0+ + Whether to send notifications on failure. + + + + + 4.0.0+ + Whether to send notifications on success. + + + + + 4.0.0+ + Whether to send notifications on warning. + + + + + 4.0.0+ + + + <b>Deprecated</b>. Where to send the notification to - eg email address. + + + + + + + 0.0.0+ + Extended configuration specific to this notifier goes here. + + + + + + + + + + + + 4.0.0+ + Modifications to the build process which is activated based on environmental + parameters or command line arguments. 
+ + + + + 4.0.0+ + The identifier of this build profile. This is used for command line + activation, and identifies profiles to be merged. + + + + + + 4.0.0+ + The conditional logic which will automatically trigger the inclusion of this + profile. + + + + + 4.0.0+ + Information required to build the project. + + + + + 4.0.0+ + The modules (sometimes called subprojects) to build as a part of this + project. Each module listed is a relative path to the directory containing the module. + To be consistent with the way default urls are calculated from parent, it is recommended + to have module names match artifact ids. + + + + + + + + + + 4.0.0+ + Distribution information for a project that enables deployment of the site + and artifacts to remote web servers and repositories respectively. + + + + + 4.0.0+ + + + Properties that can be used throughout the POM as a substitution, and + are used as filters in resources if enabled. + The format is <code>&lt;name&gt;value&lt;/name&gt;</code>. + + + + + + + + + + + + 4.0.0+ + Default dependency information for projects that inherit from this one. The + dependencies in this section are not immediately resolved. Instead, when a POM derived + from this one declares a dependency described by a matching groupId and artifactId, the + version and other values from this section are used for that dependency if they were not + already specified. + + + + + 3.0.0+ + + + This element describes all of the dependencies associated with a + project. + These dependencies are used to construct a classpath for your + project during the build process. They are automatically downloaded from the + repositories defined in this project. + See <a href="http://maven.apache.org/guides/introduction/introduction-to-dependency-mechanism.html">the + dependency mechanism</a> for more information. + + + + + + + + + + + + 4.0.0+ + The lists of the remote repositories for discovering dependencies and + extensions. 
+ + + + + + + + + + 4.0.0+ + The lists of the remote repositories for discovering plugins for builds and + reports. + + + + + + + + + + 4.0.0+ + + + <b>Deprecated</b>. Now ignored by Maven. + + + + + + + + + + + + 4.0.0+ + + + This element includes the specification of report plugins to use + to generate the reports on the Maven-generated site. + These reports will be run when a user executes <code>mvn site</code>. + All of the reports will be included in the navigation bar for browsing. + + + + + + + + + 3.0.0+ + Generic information for a build. + + + + + 3.0.0+ + The default goal (or phase in Maven 2) to execute when none is specified for + the project. Note that in case of a multi-module build, only the default goal of the top-level + project is relevant, i.e. the default goals of child modules are ignored. Since Maven 3, + multiple goals/phases can be separated by whitespace. + + + + + 3.0.0+ + + This element describes all of the classpath resources such as properties + files associated with a project. These resources are often included in the final + package. + The default value is <code>src/main/resources</code>. + + + + + + + + + + + 4.0.0+ + + This element describes all of the classpath resources such as properties + files associated with a project's unit tests. + The default value is <code>src/test/resources</code>. + + + + + + + + + + + 4.0.0+ + + The directory where all files generated by the build are placed. + The default value is <code>target</code>. + + + + + + 4.0.0+ + + + The filename (excluding the extension, and with no path information) that + the produced artifact will be called. + The default value is <code>${artifactId}-${version}</code>. + + + + + + + 4.0.0+ + The list of filter properties files that are used when filtering is enabled. + + + + + + + + + + 4.0.0+ + Default plugin information to be made available for reference by projects + derived from this one. 
This plugin configuration will not be resolved or bound to the + lifecycle unless referenced. Any local configuration for a given plugin will override + the plugin's entire definition here. + + + + + 4.0.0+ + The list of plugins to use. + + + + + + + + + + + + 4.0.0+ + + + The <code>&lt;plugin&gt;</code> element contains information required for a plugin. + + + + + + + 4.0.0+ + The group ID of the plugin in the repository. + + + + + 4.0.0+ + The artifact ID of the plugin in the repository. + + + + + 4.0.0+ + The version (or valid range of versions) of the plugin to be used. + + + + + 4.0.0+ + + + Whether to load Maven extensions (such as packaging and type handlers) from + this plugin. For performance reasons, this should only be enabled when necessary. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>false</code>. + + + + + + + 4.0.0+ + Multiple specifications of a set of goals to execute during the build + lifecycle, each having (possibly) a different configuration. + + + + + + + + + + 4.0.0+ + Additional dependencies that this project needs to introduce to the plugin's + classloader. + + + + + + + + + + 4.0.0+ + + + <b>Deprecated</b>. Unused by Maven. + + + + + + + + + + + + 4.0.0+ + + + Whether any configuration should be propagated to child POMs. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>true</code>. 
+ + + + + + + 0.0.0+ + + + <p>The configuration as DOM object.</p> + <p>By default, every element content is trimmed, but starting with Maven 3.1.0, you can add + <code>xml:space="preserve"</code> to elements you want to preserve whitespace.</p> + <p>You can control how child POMs inherit configuration from parent POMs by adding <code>combine.children</code> + or <code>combine.self</code> attributes to the children of the configuration element:</p> + <ul> + <li><code>combine.children</code>: available values are <code>merge</code> (default) and <code>append</code>,</li> + <li><code>combine.self</code>: available values are <code>merge</code> (default) and <code>override</code>.</li> + </ul> + <p>See <a href="http://maven.apache.org/pom.html#Plugins">POM Reference documentation</a> and + <a href="http://plexus.codehaus.org/plexus-utils/apidocs/org/codehaus/plexus/util/xml/Xpp3DomUtils.html">Xpp3DomUtils</a> + for more information.</p> + + + + + + + + + + + + + + 3.0.0+ + + + The <code>&lt;dependency&gt;</code> element contains information about a dependency + of the project. + + + + + + + 3.0.0+ + + + The project group that produced the dependency, e.g. + <code>org.apache.maven</code>. + + + + + + + 3.0.0+ + + + The unique id for an artifact produced by the project group, e.g. + <code>maven-artifact</code>. + + + + + + + 3.0.0+ + + + The version of the dependency, e.g. <code>3.2.1</code>. In Maven 2, this can also be + specified as a range of versions. + + + + + + + 4.0.0+ + + + The type of dependency. While it + usually represents the extension on the filename of the dependency, + that is not always the case. A type can be mapped to a different + extension and a classifier. + The type often corresponds to the packaging used, though this is also + not always the case. 
+ Some examples are <code>jar</code>, <code>war</code>, <code>ejb-client</code> + and <code>test-jar</code>: see <a href="../maven-core/artifact-handlers.html">default + artifact handlers</a> for a list. + New types can be defined by plugins that set + <code>extensions</code> to <code>true</code>, so this is not a complete list. + + + + + + + 4.0.0+ + + + The classifier of the dependency. It is appended to + the filename after the version. This allows: + <ul> + <li>refering to attached artifact, for example <code>sources</code> and <code>javadoc</code>: + see <a href="../maven-core/artifact-handlers.html">default artifact handlers</a> for a list,</li> + <li>distinguishing two artifacts + that belong to the same POM but were built differently. + For example, <code>jdk14</code> and <code>jdk15</code>.</li> + </ul> + + + + + + + 4.0.0+ + + + The scope of the dependency - <code>compile</code>, <code>runtime</code>, + <code>test</code>, <code>system</code>, and <code>provided</code>. Used to + calculate the various classpaths used for compilation, testing, and so on. + It also assists in determining which artifacts to include in a distribution of + this project. For more information, see + <a href="http://maven.apache.org/guides/introduction/introduction-to-dependency-mechanism.html">the + dependency mechanism</a>. + + + + + + + 4.0.0+ + + + FOR SYSTEM SCOPE ONLY. Note that use of this property is <b>discouraged</b> + and may be replaced in later versions. This specifies the path on the filesystem + for this dependency. + Requires an absolute path for the value, not relative. + Use a property that gives the machine specific absolute path, + e.g. <code>${java.home}</code>. + + + + + + + 4.0.0+ + Lists a set of artifacts that should be excluded from this dependency's + artifact list when it comes to calculating transitive dependencies. + + + + + + + + + + 4.0.0+ + + + Indicates the dependency is optional for use of this library. 
While the + version of the dependency will be taken into account for dependency calculation if the + library is used elsewhere, it will not be passed on transitively. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>false</code>. + + + + + + + + + 4.0.0+ + + + The <code>&lt;exclusion&gt;</code> element contains informations required to exclude + an artifact to the project. + + + + + + + 4.0.0+ + The artifact ID of the project to exclude. + + + + + 4.0.0+ + The group ID of the project to exclude. + + + + + + + 4.0.0+ + + + The <code>&lt;execution&gt;</code> element contains informations required for the + execution of a plugin. + + + + + + + 4.0.0+ + The identifier of this execution for labelling the goals during the build, + and for matching executions to merge during inheritance and profile injection. + + + + + 4.0.0+ + The build lifecycle phase to bind the goals in this execution to. If omitted, + the goals will be bound to the default phase specified by the plugin. + + + + + 4.0.0+ + The goals to execute with the given configuration. + + + + + + + + + + 4.0.0+ + + + Whether any configuration should be propagated to child POMs. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>true</code>. 
+ + + + + + + 0.0.0+ + + + <p>The configuration as DOM object.</p> + <p>By default, every element content is trimmed, but starting with Maven 3.1.0, you can add + <code>xml:space="preserve"</code> to elements you want to preserve whitespace.</p> + <p>You can control how child POMs inherit configuration from parent POMs by adding <code>combine.children</code> + or <code>combine.self</code> attributes to the children of the configuration element:</p> + <ul> + <li><code>combine.children</code>: available values are <code>merge</code> (default) and <code>append</code>,</li> + <li><code>combine.self</code>: available values are <code>merge</code> (default) and <code>override</code>.</li> + </ul> + <p>See <a href="http://maven.apache.org/pom.html#Plugins">POM Reference documentation</a> and + <a href="http://plexus.codehaus.org/plexus-utils/apidocs/org/codehaus/plexus/util/xml/Xpp3DomUtils.html">Xpp3DomUtils</a> + for more information.</p> + + + + + + + + + + + + + + 3.0.0+ + This element describes all of the classpath resources associated with a project + or unit tests. + + + + + 3.0.0+ + + + Describe the resource target path. The path is relative to the target/classes + directory (i.e. <code>${project.build.outputDirectory}</code>). + For example, if you want that resource to appear in a specific package + (<code>org.apache.maven.messages</code>), you must specify this + element with this value: <code>org/apache/maven/messages</code>. + This is not required if you simply put the resources in that directory + structure at the source, however. + + + + + + + 3.0.0+ + + + Whether resources are filtered to replace tokens with parameterised values or not. + The values are taken from the <code>properties</code> element and from the + properties in the files listed in the <code>filters</code> element. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>false</code>. 
+ + + + + + + 3.0.0+ + Describe the directory where the resources are stored. The path is relative + to the POM. + + + + + 3.0.0+ + + + A list of patterns to include, e.g. <code>**&#47;*.xml</code>. + + + + + + + + + + + + 3.0.0+ + + + A list of patterns to exclude, e.g. <code>**&#47;*.xml</code> + + + + + + + + + + + + + + 4.0.0+ + Section for management of default plugin information for use in a group of POMs. + + + + + + 4.0.0+ + The list of plugins to use. + + + + + + + + + + + + 4.0.0+ + Section for management of reports and their configuration. + + + + + 4.0.0+ + + + If true, then the default reports are not included in the site generation. + This includes the reports in the "Project Info" menu. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>false</code>. + + + + + + + 4.0.0+ + + + Where to store all of the generated reports. The default is + <code>${project.build.directory}/site</code>. + + + + + + + 4.0.0+ + The reporting plugins to use and their configuration. + + + + + + + + + + + + 4.0.0+ + + + The <code>&lt;plugin&gt;</code> element contains informations required for a report plugin. + + + + + + + 4.0.0+ + The group ID of the reporting plugin in the repository. + + + + + 4.0.0+ + The artifact ID of the reporting plugin in the repository. + + + + + 4.0.0+ + The version of the reporting plugin to be used. + + + + + 4.0.0+ + + + Multiple specifications of a set of reports, each having (possibly) different + configuration. This is the reporting parallel to an <code>execution</code> in the build. + + + + + + + + + + + + 4.0.0+ + + + Whether any configuration should be propagated to child POMs. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>true</code>. 
+ + + + + + + 0.0.0+ + + + <p>The configuration as DOM object.</p> + <p>By default, every element content is trimmed, but starting with Maven 3.1.0, you can add + <code>xml:space="preserve"</code> to elements you want to preserve whitespace.</p> + <p>You can control how child POMs inherit configuration from parent POMs by adding <code>combine.children</code> + or <code>combine.self</code> attributes to the children of the configuration element:</p> + <ul> + <li><code>combine.children</code>: available values are <code>merge</code> (default) and <code>append</code>,</li> + <li><code>combine.self</code>: available values are <code>merge</code> (default) and <code>override</code>.</li> + </ul> + <p>See <a href="http://maven.apache.org/pom.html#Plugins">POM Reference documentation</a> and + <a href="http://plexus.codehaus.org/plexus-utils/apidocs/org/codehaus/plexus/util/xml/Xpp3DomUtils.html">Xpp3DomUtils</a> + for more information.</p> + + + + + + + + + + + + + + 4.0.0+ + Represents a set of reports and configuration to be used to generate them. + + + + + 0.0.0+ + The unique id for this report set, to be used during POM inheritance and profile injection + for merging of report sets. + + + + + + 4.0.0+ + The list of reports from this plugin which should be generated from this set. + + + + + + + + + + 4.0.0+ + + + Whether any configuration should be propagated to child POMs. Note: While the type + of this field is <code>String</code> for technical reasons, the semantic type is actually + <code>Boolean</code>. Default value is <code>true</code>. 
+ + + + + + + 0.0.0+ + + + <p>The configuration as DOM object.</p> + <p>By default, every element content is trimmed, but starting with Maven 3.1.0, you can add + <code>xml:space="preserve"</code> to elements you want to preserve whitespace.</p> + <p>You can control how child POMs inherit configuration from parent POMs by adding <code>combine.children</code> + or <code>combine.self</code> attributes to the children of the configuration element:</p> + <ul> + <li><code>combine.children</code>: available values are <code>merge</code> (default) and <code>append</code>,</li> + <li><code>combine.self</code>: available values are <code>merge</code> (default) and <code>override</code>.</li> + </ul> + <p>See <a href="http://maven.apache.org/pom.html#Plugins">POM Reference documentation</a> and + <a href="http://plexus.codehaus.org/plexus-utils/apidocs/org/codehaus/plexus/util/xml/Xpp3DomUtils.html">Xpp3DomUtils</a> + for more information.</p> + + + + + + + + + + + + + + 4.0.0+ + The conditions within the build runtime environment which will trigger the + automatic inclusion of the build profile. Multiple conditions can be defined, which must + be all satisfied to activate the profile. + + + + + + 4.0.0+ + If set to true, this profile will be active unless another profile in this + pom is activated using the command line -P option or by one of that profile's + activators. + + + + + 4.0.0+ + + + Specifies that this profile will be activated when a matching JDK is detected. + For example, <code>1.4</code> only activates on JDKs versioned 1.4, + while <code>!1.4</code> matches any JDK that is not version 1.4. Ranges are supported too: + <code>[1.5,)</code> activates when the JDK is 1.5 minimum. + + + + + + + 4.0.0+ + Specifies that this profile will be activated when matching operating system + attributes are detected. + + + + + 4.0.0+ + Specifies that this profile will be activated when this system property is + specified. 
+ + + + + 4.0.0+ + Specifies that this profile will be activated based on existence of a file. + + + + + + + 4.0.0+ + This is the property specification used to activate a profile. If the value field + is empty, then the existence of the named property will activate the profile, otherwise it + does a case-sensitive match against the property value as well. + + + + + 4.0.0+ + The name of the property to be used to activate a profile. + + + + + 4.0.0+ + The value of the property required to activate a profile. + + + + + + + 4.0.0+ + This is an activator which will detect an operating system's attributes in order + to activate its profile. + + + + + 4.0.0+ + + + The name of the operating system to be used to activate the profile. This must be an exact match + of the <code>${os.name}</code> Java property, such as <code>Windows XP</code>. + + + + + + + 4.0.0+ + + + The general family of the OS to be used to activate the profile, such as + <code>windows</code> or <code>unix</code>. + + + + + + + 4.0.0+ + The architecture of the operating system to be used to activate the + profile. + + + + + 4.0.0+ + The version of the operating system to be used to activate the + profile. + + + + + + + 4.0.0+ + This is the file specification used to activate the profile. The <code>missing</code> value + is the location of a file that needs to exist, and if it doesn't, the profile will be + activated. On the other hand, <code>exists</code> will test for the existence of the file and if it is + there, the profile will be activated.<br/> + Variable interpolation for these file specifications is limited to <code>${basedir}</code>, + System properties and request properties. + + + + + 4.0.0+ + The name of the file that must be missing to activate the + profile. + + + + + 4.0.0+ + The name of the file that must exist to activate the profile. + + + + + + + 4.0.0+ + Section for management of default dependency information for use in a group of + POMs. 
+ + + + + 4.0.0+ + The dependencies specified here are not used until they are referenced in a + POM within the group. This allows the specification of a "standard" version for a + particular dependency. + + + + + + + + + + + + 3.0.0+ + + + The <code>&lt;build&gt;</code> element contains informations required to build the project. + Default values are defined in Super POM. + + + + + + + 3.0.0+ + + This element specifies a directory containing the source of the project. The + generated build system will compile the sources from this directory when the project is + built. The path given is relative to the project descriptor. + The default value is <code>src/main/java</code>. + + + + + + 4.0.0+ + + This element specifies a directory containing the script sources of the + project. This directory is meant to be different from the sourceDirectory, in that its + contents will be copied to the output directory in most cases (since scripts are + interpreted rather than compiled). + The default value is <code>src/main/scripts</code>. + + + + + + 4.0.0+ + + This element specifies a directory containing the unit test source of the + project. The generated build system will compile these directories when the project is + being tested. The path given is relative to the project descriptor. + The default value is <code>src/test/java</code>. + + + + + + 4.0.0+ + + The directory where compiled application classes are placed. + The default value is <code>target/classes</code>. + + + + + + 4.0.0+ + + The directory where compiled test classes are placed. + The default value is <code>target/test-classes</code>. + + + + + + 4.0.0+ + A set of build extensions to use from this project. + + + + + + + + + + 3.0.0+ + The default goal (or phase in Maven 2) to execute when none is specified for + the project. Note that in case of a multi-module build, only the default goal of the top-level + project is relevant, i.e. the default goals of child modules are ignored. 
Since Maven 3, + multiple goals/phases can be separated by whitespace. + + + + + 3.0.0+ + + This element describes all of the classpath resources such as properties + files associated with a project. These resources are often included in the final + package. + The default value is <code>src/main/resources</code>. + + + + + + + + + + + 4.0.0+ + + This element describes all of the classpath resources such as properties + files associated with a project's unit tests. + The default value is <code>src/test/resources</code>. + + + + + + + + + + + 4.0.0+ + + The directory where all files generated by the build are placed. + The default value is <code>target</code>. + + + + + + 4.0.0+ + + + The filename (excluding the extension, and with no path information) that + the produced artifact will be called. + The default value is <code>${artifactId}-${version}</code>. + + + + + + + 4.0.0+ + The list of filter properties files that are used when filtering is enabled. + + + + + + + + + + 4.0.0+ + Default plugin information to be made available for reference by projects + derived from this one. This plugin configuration will not be resolved or bound to the + lifecycle unless referenced. Any local configuration for a given plugin will override + the plugin's entire definition here. + + + + + 4.0.0+ + The list of plugins to use. + + + + + + + + + + + + 4.0.0+ + Describes a build extension to utilise. + + + + + 4.0.0+ + The group ID of the extension's artifact. + + + + + 4.0.0+ + The artifact ID of the extension. + + + + + 4.0.0+ + The version of the extension. + + + + + + + 3.0.0+ + Describes the licenses for this project. This is used to generate the license + page of the project's web site, as well as being taken into consideration in other reporting + and validation. The licenses listed for the project are that of the project itself, and not + of dependencies. + + + + + 3.0.0+ + The full legal name of the license. + + + + + 3.0.0+ + The official url for the license text. 
+ + + + + 3.0.0+ + + + The primary method by which this project may be distributed. + <dl> + <dt>repo</dt> + <dd>may be downloaded from the Maven repository</dd> + <dt>manual</dt> + <dd>user must manually download and install the dependency.</dd> + </dl> + + + + + + + 3.0.0+ + Addendum information pertaining to this license. + + + + + + + 3.0.0+ + This element describes all of the mailing lists associated with a project. The + auto-generated site references this information. + + + + + 3.0.0+ + + + The name of the mailing list. + + + + + + + 3.0.0+ + + + The email address or link that can be used to subscribe to + the mailing list. If this is an email address, a + <code>mailto:</code> link will automatically be created + when the documentation is created. + + + + + + + 3.0.0+ + + + The email address or link that can be used to unsubscribe to + the mailing list. If this is an email address, a + <code>mailto:</code> link will automatically be created + when the documentation is created. + + + + + + + 3.0.0+ + + + The email address or link that can be used to post to + the mailing list. If this is an email address, a + <code>mailto:</code> link will automatically be created + when the documentation is created. + + + + + + + 3.0.0+ + The link to a URL where you can browse the mailing list archive. + + + + + 3.0.0+ + The link to alternate URLs where you can browse the list archive. + + + + + + + + + + + + 3.0.0+ + Information about one of the committers on this project. + + + + + 3.0.0+ + The unique ID of the developer in the SCM. + + + + + 3.0.0+ + The full name of the contributor. + + + + + 3.0.0+ + The email address of the contributor. + + + + + 3.0.0+ + The URL for the homepage of the contributor. + + + + + 3.0.0+ + The organization to which the contributor belongs. + + + + + 3.0.0+ + The URL of the organization. + + + + + 3.0.0+ + + + The roles the contributor plays in the project. 
Each role is described by a + <code>role</code> element, the body of which is a role name. This can also be used to + describe the contribution. + + + + + + + + + + + + 3.0.0+ + + + The timezone the contributor is in. Typically, this is a number in the range + <a href="http://en.wikipedia.org/wiki/UTC%E2%88%9212:00">-12</a> to <a href="http://en.wikipedia.org/wiki/UTC%2B14:00">+14</a> + or a valid time zone id like "America/Montreal" (UTC-05:00) or "Europe/Paris" (UTC+01:00). + + + + + + + 3.0.0+ + Properties about the contributor, such as an instant messenger handle. + + + + + + + + + + \ No newline at end of file diff --git a/swh/indexer/exc.py b/swh/indexer/exc.py new file mode 100644 --- /dev/null +++ b/swh/indexer/exc.py @@ -0,0 +1,2 @@ +class UnknownNamespace(Exception): + pass diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -11,14 +11,15 @@ import logging import itertools import email.parser -import xml.parsers.expat import email.policy import click -import xmltodict +import defusedxml.ElementTree +import xmlschema from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from swh.indexer.codemeta import compact, expand +from swh.indexer.exc import UnknownNamespace MAPPINGS = {} @@ -137,6 +138,11 @@ """A translation dict to map dict keys into a canonical name.""" pass + def _normalize_key(self, key): + """Normalizes a key of the input dictionary so it can be mapped to + a method name.""" + return key.replace('-', '_') + def _translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object @@ -154,8 +160,11 @@ for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key - translation_method = getattr( - self, 'translate_' + k.replace('-', '_'), None) + try: + translation_method = getattr( + self, 'translate_' + self._normalize_key(k), None) + 
except UnknownNamespace: + continue if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: @@ -165,7 +174,7 @@ # if there is a normalization method, use it on the value normalization_method = getattr( - self, 'normalize_' + k.replace('-', '_'), None) + self, 'normalize_' + self._normalize_key(k), None) if normalization_method: v = normalization_method(v) elif k in self.string_fields and isinstance(v, str): @@ -384,6 +393,10 @@ return None +POM_XMLNS = 'http://maven.apache.org/POM/4.0.0' +POM_PREFIX = '{' + POM_XMLNS + '}' + + @register_mapping class MavenMapping(DictMapping, SingleFileMapping): """ @@ -391,13 +404,26 @@ """ name = 'maven' filename = b'pom.xml' - mapping = CROSSWALK_TABLE['Java (Maven)'] - string_fields = ['name', 'version', 'description', 'email'] + raw_mapping = CROSSWALK_TABLE['Java (Maven)'] + mapping = {POM_PREFIX + pom_name: codemeta_name + for (pom_name, codemeta_name) + in CROSSWALK_TABLE['Java (Maven)'].items()} + string_fields = [POM_PREFIX + k + for k in ['name', 'version', 'description', 'email']] + schema = xmlschema.XMLSchema(os.path.join( + os.path.dirname(__file__), 'data', 'maven', 'maven-4.0.0.xsd'), + defuse='always') + + def _normalize_key(self, key): + if not key.startswith(POM_PREFIX): + raise UnknownNamespace(key) + key = key[len(POM_PREFIX):] # strip the prefix + return super()._normalize_key(key) def translate(self, content): try: - d = xmltodict.parse(content).get('project') or {} - except xml.parsers.expat.ExpatError: + tree = defusedxml.ElementTree.fromstring(content) + except defusedxml.ElementTree.ParseError: self.log.warning('Error parsing XML from %s', self.log_suffix) return None except UnicodeDecodeError: @@ -408,35 +434,40 @@ self.log.warning('Error detecting XML encoding from %s', self.log_suffix) return None + + d = self.schema.to_dict(tree, validation='skip') + d = d or {} # it may be None if the document is empty but for the root metadata = self._translate_dict(d, 
normalize=False) metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) return self.normalize_translation(metadata) - _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} + _default_repository = { + POM_PREFIX + 'url': 'https://repo.maven.apache.org/maven2/'} def parse_repositories(self, d): """https://maven.apache.org/pom.html#Repositories - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... codehausSnapshots - ... Codehaus Snapshots - ... http://snapshots.maven.codehaus.org/maven2 - ... default - ... - ... + >>> tree = defusedxml.ElementTree.fromstring(''' + ... + ... + ... + ... codehausSnapshots + ... Codehaus Snapshots + ... http://snapshots.maven.codehaus.org/maven2 + ... default + ... + ... + ... ... ''') + >>> d = MavenMapping.schema.to_dict(tree) >>> MavenMapping().parse_repositories(d) """ - repositories = d.get('repositories') + repositories = d.get(POM_PREFIX + 'repositories') if not repositories: results = [self.parse_repository(d, self._default_repository)] elif isinstance(repositories, dict): - repositories = repositories.get('repository') or [] + repositories = repositories.get(POM_PREFIX + 'repository') or [] if not isinstance(repositories, list): repositories = [repositories] results = [self.parse_repository(d, repo) @@ -450,9 +481,9 @@ return if repo.get('layout', 'default') != 'default': return # TODO ? 
- url = repo.get('url') - group_id = d.get('groupId') - artifact_id = d.get('artifactId') + url = repo.get(POM_PREFIX + 'url') + group_id = d.get(POM_PREFIX + 'groupId') + artifact_id = d.get(POM_PREFIX + 'artifactId') if (isinstance(url, str) and isinstance(group_id, str) and isinstance(artifact_id, str)): repo = os.path.join(url, *group_id.split('.'), artifact_id) @@ -470,23 +501,27 @@ def parse_licenses(self, d): """https://maven.apache.org/pom.html#Licenses - >>> import xmltodict >>> import json - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... + >>> tree = defusedxml.ElementTree.fromstring(''' + ... + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... ... ''') - >>> print(json.dumps(d, indent=4)) + >>> d = MavenMapping.schema.to_dict(tree) + >>> print(json.dumps(d, indent=4, sort_keys=True)) { - "licenses": { - "license": { - "name": "Apache License, Version 2.0", - "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" - } + "{http://maven.apache.org/POM/4.0.0}licenses": { + "{http://maven.apache.org/POM/4.0.0}license": [ + { + "{http://maven.apache.org/POM/4.0.0}name": "Apache License, Version 2.0", + "{http://maven.apache.org/POM/4.0.0}url": "https://www.apache.org/licenses/LICENSE-2.0.txt" + } + ] } } >>> MavenMapping().parse_licenses(d) @@ -494,37 +529,39 @@ or, if there are more than one license: - >>> import xmltodict >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... MIT License - ... https://opensource.org/licenses/MIT - ... - ... + >>> tree = defusedxml.ElementTree.fromstring(''' + ... + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... MIT License + ... https://opensource.org/licenses/MIT + ... + ... 
+ ... ... ''') + >>> d = MavenMapping.schema.to_dict(tree) >>> pprint(MavenMapping().parse_licenses(d)) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, {'@id': 'https://opensource.org/licenses/MIT'}] - """ + """ # noqa: E501 - licenses = d.get('licenses') + licenses = d.get(POM_PREFIX + 'licenses') if not isinstance(licenses, dict): return - licenses = licenses.get('license') + licenses = licenses.get(POM_PREFIX + 'license') if isinstance(licenses, dict): licenses = [licenses] elif not isinstance(licenses, list): return - return [{"@id": license['url']} + return [{"@id": license[POM_PREFIX + 'url']} for license in licenses if isinstance(license, dict) - and isinstance(license.get('url'), str)] or None + and isinstance(license.get(POM_PREFIX + 'url'), str)] or None _normalize_pkginfo_key = str.lower diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -572,7 +572,7 @@ def test_compute_metadata_maven(self): raw_content = b""" - + Maven Default Project 4.0.0 com.mycompany.app @@ -612,7 +612,7 @@ def test_compute_metadata_maven_empty(self): raw_content = b""" - + """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { @@ -622,7 +622,7 @@ def test_compute_metadata_maven_almost_empty(self): raw_content = b""" - + """ result = self.maven_mapping.translate(raw_content) @@ -631,13 +631,40 @@ 'type': 'SoftwareSourceCode', }) + def test_compute_metadata_maven_unknown_project_ns(self): + raw_content = b""" + + Maven Default Project + 1.2.3 + """ + result = self.maven_mapping.translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + }) + + def test_compute_metadata_maven_unknown_element_ns(self): + raw_content = b""" + + Maven Default Project + 1.2.3 + """ + result = self.maven_mapping.translate(raw_content) + self.assertEqual(result, { 
+ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'version': '1.2.3', + }) + def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' 'Error parsing XML from foo') raw_content = b""" - """ + """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) @@ -658,7 +685,7 @@ 'Error detecting XML encoding from foo') raw_content = b""" - + """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: @@ -667,7 +694,7 @@ self.assertEqual(result, None) raw_content = b""" - + """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: @@ -691,7 +718,7 @@ def test_compute_metadata_maven_minimal(self): raw_content = b""" - + Maven Default Project 4.0.0 com.mycompany.app @@ -711,7 +738,7 @@ def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" - + Maven Default Project 4.0.0 com.mycompany.app @@ -732,7 +759,7 @@ }) raw_content = b""" - + Maven Default Project 4.0.0 com.mycompany.app @@ -750,7 +777,7 @@ }) raw_content = b""" - + 4.0.0 com.mycompany.app @@ -768,7 +795,7 @@ }) raw_content = b""" - + Maven Default Project 4.0.0 com.mycompany.app @@ -789,7 +816,7 @@ }) raw_content = b""" - + 1.2.3 """ @@ -802,7 +829,7 @@ def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" - + Maven Default Project 4.0.0 com.mycompany.app @@ -826,7 +853,7 @@ def test_compute_metadata_maven_multiple(self): '''Tests when there are multiple code repos and licenses.''' raw_content = b""" - + Maven Default Project 4.0.0 com.mycompany.app