Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/maven/lister.py
# Copyright (C) 2021-2022 The Software Heritage developers | # Copyright (C) 2021-2022 The Software Heritage developers | |||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | |||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | |||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | |||||||||
from dataclasses import asdict, dataclass | from dataclasses import asdict, dataclass | |||||||||
from datetime import datetime, timezone | from datetime import datetime, timezone | |||||||||
import logging | import logging | |||||||||
import re | import re | |||||||||
from typing import Any, Dict, Iterator, Optional | from typing import Any, Dict, Iterator, Optional | |||||||||
from urllib.parse import urljoin | from urllib.parse import urljoin | |||||||||
from bs4 import BeautifulSoup | ||||||||||
import lxml | ||||||||||
import requests | import requests | |||||||||
from tenacity.before_sleep import before_sleep_log | from tenacity.before_sleep import before_sleep_log | |||||||||
import xmltodict | ||||||||||
from swh.core.github.utils import GitHubSession | from swh.core.github.utils import GitHubSession | |||||||||
from swh.lister.utils import throttling_retry | from swh.lister.utils import throttling_retry | |||||||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | |||||||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | |||||||||
from .. import USER_AGENT | from .. import USER_AGENT | |||||||||
from ..pattern import CredentialsType, Lister | from ..pattern import CredentialsType, Lister | |||||||||
▲ Show 20 Lines • Show All 223 Lines • ▼ Show 20 Lines | def get_pages(self) -> Iterator[RepoPage]: | |||||||||
logger.info("Found %s poms.", len(out_pom)) | logger.info("Found %s poms.", len(out_pom)) | |||||||||
# Now fetch pom files and scan them for scm info. | # Now fetch pom files and scan them for scm info. | |||||||||
logger.info("Fetching poms..") | logger.info("Fetching poms..") | |||||||||
for pom in out_pom: | for pom in out_pom: | |||||||||
try: | try: | |||||||||
response = self.page_request(pom, {}) | response = self.page_request(pom, {}) | |||||||||
project = xmltodict.parse(response.content) | parsed_pom = BeautifulSoup(response.content, "xml") | |||||||||
project_d = project.get("project", {}) | project = parsed_pom.find("project") | |||||||||
scm_d = project_d.get("scm") | if project is None: | |||||||||
if scm_d is not None: | continue | |||||||||
connection = scm_d.get("connection") | scm = project.find("scm") | |||||||||
if scm is not None: | ||||||||||
connection = scm.find("connection") | ||||||||||
if connection is not None: | if connection is not None: | |||||||||
artifact_metadata_d = { | artifact_metadata_d = { | |||||||||
"type": "scm", | "type": "scm", | |||||||||
"doc": out_pom[pom], | "doc": out_pom[pom], | |||||||||
"url": connection, | "url": connection.text, | |||||||||
} | } | |||||||||
logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) | logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) | |||||||||
yield artifact_metadata_d | yield artifact_metadata_d | |||||||||
else: | else: | |||||||||
logger.debug("No scm.connection in pom %s", pom) | logger.debug("No scm.connection in pom %s", pom) | |||||||||
else: | else: | |||||||||
logger.debug("No scm in pom %s", pom) | logger.debug("No scm in pom %s", pom) | |||||||||
except requests.HTTPError: | except requests.HTTPError: | |||||||||
logger.warning( | logger.warning( | |||||||||
"POM info page could not be fetched, skipping project '%s'", | "POM info page could not be fetched, skipping project '%s'", | |||||||||
pom, | pom, | |||||||||
) | ) | |||||||||
except xmltodict.expat.ExpatError as error: | except lxml.etree.Error as error: | |||||||||
logger.info("Could not parse POM %s XML: %s. Next.", pom, error) | logger.info("Could not parse POM %s XML: %s.", pom, error) | |||||||||
vlorentz (Unsubmitted, Not Done) · Inline Actions
vlorentz: | ||||||||||
def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: | def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: | |||||||||
"""Retrieve scm origin out of the page information. Only called when type of the | """Retrieve scm origin out of the page information. Only called when type of the | |||||||||
page is scm. | page is scm. | |||||||||
Try and detect an scm/vcs repository. Note that official format is in the form: | Try and detect an scm/vcs repository. Note that official format is in the form: | |||||||||
scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put | scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put | |||||||||
the repo url (without the "scm:type"), so we have to check against the content | the repo url (without the "scm:type"), so we have to check against the content | |||||||||
▲ Show 20 Lines • Show All 139 Lines • Show Last 20 Lines |