diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -40,8 +40,5 @@ [mypy-urllib3.util.*] ignore_missing_imports = True -[mypy-xmltodict.*] -ignore_missing_imports = True - [mypy-dulwich.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,5 @@ beautifulsoup4 launchpadlib tenacity >= 6.2 -xmltodict lxml dulwich diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -10,9 +10,10 @@ from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin +from bs4 import BeautifulSoup +import lxml import requests from tenacity.before_sleep import before_sleep_log -import xmltodict from swh.core.github.utils import GitHubSession from swh.lister.utils import throttling_retry @@ -252,16 +253,18 @@ for pom in out_pom: try: response = self.page_request(pom, {}) - project = xmltodict.parse(response.content) - project_d = project.get("project", {}) - scm_d = project_d.get("scm") - if scm_d is not None: - connection = scm_d.get("connection") + parsed_pom = BeautifulSoup(response.content, "xml") + project = parsed_pom.find("project") + if project is None: + continue + scm = project.find("scm") + if scm is not None: + connection = scm.find("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], - "url": connection, + "url": connection.text, } logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) yield artifact_metadata_d @@ -274,8 +277,8 @@ "POM info page could not be fetched, skipping project '%s'", pom, ) - except xmltodict.expat.ExpatError as error: - logger.info("Could not parse POM %s XML: %s. Next.", pom, error) + except lxml.etree.Error as error: + logger.info("Could not parse POM %s XML: %s.", pom, error) def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: """Retrieve scm origin out of the page information. Only called when type of the diff --git a/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom b/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom @@ -0,0 +1,769 @@ + + + + org.sonatype.oss + oss-parent + 7 + + 4.0.0 + com.alibaba.citrus + citrus-parent + pom + Citrus Parent Project + 3.0.7 + Another Java-based WEB Framework + http://www.openwebx.org/ + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + scm:git:https://github.com/webx/citrus + scm:git:git@github.com:webx/citrus.git + http://github.com/webx/citrus + + + + baobao + Michael Zhou + yizhi@taobao.com + + architect + developer + + + http://i54.tinypic.com/2jewmjr.jpg + + -6 + + + + 1.5 + GBK + 3.0.7 + 2.5.6.SEC03 + 1.0 + 1.0 + 6.1.22 + ${java.encoding} + + + + spring3 + + 3.0.6.RELEASE + + + + + dist/webx + dist/test + common/expr + common/logconfig + common/util + common/springext + common/generictype + common/asm + common/hessian + common/codegen + test/util + test/webx + service/base + service/dataresolver + service/form + service/resource + service/upload + service/requestcontext + service/pipeline + service/configuration + service/mappingrule + service/moduleloader + service/pull + service/template + service/jsp + service/velocity + service/freemarker + service/uribroker + service/mail + webx/framework + webx/turbine + webx/dev + + + + org.slf4j + slf4j-api + + + org.slf4j + jcl-over-slf4j + + + + + + + + + ${project.groupId} + citrus-webx-all + ${webx-version} + + + ${project.groupId} + citrus-test-all + ${webx-version} + test + + + ${project.groupId} + citrus-common-logconfig + ${webx-version} + + + ${project.groupId} + citrus-common-expr + ${webx-version} + + + ${project.groupId} + citrus-common-util + ${webx-version} + + + ${project.groupId} + citrus-common-springext + ${webx-version} + + + ${project.groupId} + citrus-common-generictype + ${webx-version} + + + ${project.groupId} + citrus-common-asm + ${webx-version} + + + ${project.groupId} + citrus-common-hessian + ${webx-version} + + + ${project.groupId} + citrus-common-codegen + ${webx-version} + + + ${project.groupId} + citrus-test-util + ${webx-version} + test + + + ${project.groupId} + citrus-test-webx + ${webx-version} + test + + + ${project.groupId} + citrus-service-base + ${webx-version} + + + ${project.groupId} + citrus-service-dataresolver + ${webx-version} + + + ${project.groupId} + citrus-service-form + ${webx-version} + + + ${project.groupId} + citrus-service-resource + ${webx-version} + + + ${project.groupId} + citrus-service-upload + ${webx-version} + + + ${project.groupId} + citrus-service-requestcontext + ${webx-version} + + + ${project.groupId} + citrus-service-pipeline + ${webx-version} + + + ${project.groupId} + citrus-service-configuration + ${webx-version} + + + ${project.groupId} + citrus-service-mappingrule + ${webx-version} + + + ${project.groupId} + citrus-service-moduleloader + ${webx-version} + + + ${project.groupId} + citrus-service-pull + ${webx-version} + + + ${project.groupId} + citrus-service-template + ${webx-version} + + + ${project.groupId} + citrus-service-jsp + ${webx-version} + + + ${project.groupId} + citrus-service-velocity + ${webx-version} + + + ${project.groupId} + citrus-service-freemarker + ${webx-version} + + + ${project.groupId} + citrus-service-uribroker + ${webx-version} + + + ${project.groupId} + citrus-service-mail + ${webx-version} + + + ${project.groupId} + citrus-webx-framework + ${webx-version} + + + ${project.groupId} + citrus-webx-turbine + ${webx-version} + + + ${project.groupId} + citrus-webx-dev + ${webx-version} + + + + + + org.slf4j + slf4j-api + 1.6.1 + + + + org.slf4j + jcl-over-slf4j + 1.6.1 + + + + commons-logging + commons-logging + 1.1.1 + provided + + + + ch.qos.logback + logback-classic + 0.9.24 + runtime + + + + org.slf4j + slf4j-log4j12 + 1.6.1 + runtime + + + log4j + log4j + 1.2.16 + runtime + + + + + + junit + junit + 4.8.2 + test + + + org.hamcrest + hamcrest-library + 1.1 + test + + + httpunit + httpunit + 1.7 + test + + + jtidy + jtidy + + + + + rhino + js + 1.7R1 + test + + + nekohtml + nekohtml + 1.9.6 + test + + + xerces + xercesImpl + 2.9.1 + test + + + xml-apis + xml-apis + + + + + xalan + xalan + 2.7.1 + test + + + xml-apis + xml-apis + + + + + org.easymock + easymockclassextension + 3.0 + test + + + org.apache.tomcat + jasper + 6.0.33 + test + + + org.jvnet.mock-javamail + mock-javamail + 1.7 + test + + + oro + oro + 2.0.8 + + + + + + ecs + ecs + 1.4.2 + + + org.apache.commons + commons-jexl + 2.0.1 + + + org.apache.velocity + velocity + 1.6.4 + + + org.freemarker + freemarker + 2.3.16 + + + commons-fileupload + commons-fileupload + 1.2.1 + + + commons-io + commons-io + 1.4 + + + commons-codec + commons-codec + 1.3 + + + org.codehaus.groovy + groovy-all + 1.6.3 + runtime + + + org.apache.ant + ant + + + org.apache.ant + ant-launcher + + + jline + jline + + + + + dom4j + dom4j + 1.6.1 + + + xml-apis + xml-apis + + + + + cglib + cglib-nodep + 2.2 + + + javax.servlet + servlet-api + 2.5 + provided + + + javax.mail + mail + 1.4.1 + provided + + + javax.activation + activation + 1.1 + provided + + + janino + janino + 2.5.10 + test + + + xml-apis + xml-apis + 1.3.04 + + + + + + org.springframework + spring-core + ${spring-version} + + + org.springframework + spring-beans + ${spring-version} + + + org.springframework + spring-aop + ${spring-version} + + + org.springframework + spring-context + ${spring-version} + + + org.springframework + spring-context-support + ${spring-version} + + + org.springframework + spring-tx + ${spring-version} + + + org.springframework + spring-jdbc + ${spring-version} + + + org.springframework + spring-orm + ${spring-version} + + + org.springframework + spring-web + ${spring-version} + + + org.springframework + spring-webmvc + ${spring-version} + + + org.springframework + spring-test + ${spring-version} + test + + + + + + + maven-deploy-plugin + false + + false + + + + maven-compiler-plugin + + ${java.version} + ${java.version} + + + + maven-jar-plugin + + + + true + true + + + + + + maven-antrun-plugin + + + compile + + + + + + + + + + + run + + + + + + maven-surefire-plugin + + + **/*Tests.java + + -Xmx256m + + + + maven-eclipse-plugin + + true + + org.eclipse.jdt.launching.JRE_CONTAINER + + + + + maven-source-plugin + + + attach-sources + + jar-no-fork + + + + + + org.mortbay.jetty + maven-jetty-plugin + + citrus + 9999 + + + productionMode + false + + + + + + com.alibaba.citrus.tool + maven-springext-plugin + + + maven-gpg-plugin + + + sign-artifacts + verify + + sign + + + + + + + + + maven-antrun-plugin + 1.6 + + + maven-compiler-plugin + + 2.3.2 + + + maven-jar-plugin + 2.3.2 + + + maven-deploy-plugin + 2.7 + + true + + + + maven-eclipse-plugin + 2.8 + + + maven-shade-plugin + 1.4 + + + maven-source-plugin + 2.1.2 + + + maven-javadoc-plugin + 2.8 + + + maven-surefire-plugin + 2.10 + + + org.mortbay.jetty + maven-jetty-plugin + ${jetty-version} + + + com.alibaba.citrus.tool + maven-springext-plugin + ${springext-plugin-version} + + + maven-gpg-plugin + 1.4 + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.maven.plugins + maven-antrun-plugin + [1.0,) + + run + + + + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + [1.0,) + + enforce + + + + + + + + + + + + + + \ No newline at end of file diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -30,6 +30,10 @@ GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}" LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,) +USER_REPO2 = "webx/citrus" +GIT_REPO_URL2_HTTPS = f"https://github.com/{USER_REPO2}" +GIT_REPO_URL2_API = f"https://api.github.com/repos/{USER_REPO2}" + LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( @@ -91,12 +95,18 @@ return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes() +@pytest.fixture +def maven_pom_multi_byte_encoding(datadir) -> bytes: + return Path(datadir, "https_maven.org", "citrus-parent-3.0.7.pom").read_bytes() + + @pytest.fixture def requests_mock(requests_mock): """If github api calls for the configured scm repository, returns its canonical url.""" for url_api, url_html in [ (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS), (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS), + (GIT_REPO_URL2_API, GIT_REPO_URL2_HTTPS), ]: requests_mock.get( url_api, @@ -351,3 +361,19 @@ # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 + + +def test_maven_list_pom_multi_byte_encoding( + swh_scheduler, requests_mock, maven_pom_multi_byte_encoding +): + """should parse POM file with multi-byte encoding.""" + + # replace pom file with a multi-byte encoding one + requests_mock.get(URL_POM_1, content=maven_pom_multi_byte_encoding) + + lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) + + lister.run() + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins) == 3