Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/maven/lister.py
# Copyright (C) 2021-2022 The Software Heritage developers | # Copyright (C) 2021-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import asdict, dataclass | from dataclasses import asdict, dataclass | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import logging | import logging | ||||
import re | import re | ||||
from typing import Any, Dict, Iterator, Optional | from typing import Any, Dict, Iterator, Optional | ||||
from urllib.parse import urljoin | from urllib.parse import urljoin | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
import lxml | import lxml | ||||
import requests | import requests | ||||
from tenacity.before_sleep import before_sleep_log | |||||
from swh.core.github.utils import GitHubSession | from swh.core.github.utils import GitHubSession | ||||
from swh.lister.utils import http_retry | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | from .. import USER_AGENT | ||||
from ..pattern import CredentialsType, Lister | from ..pattern import CredentialsType, Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
credentials=credentials, | credentials=credentials, | ||||
url=url, | url=url, | ||||
instance=instance, | instance=instance, | ||||
) | ) | ||||
self.session = requests.Session() | self.session.headers.update({"Accept": "application/json"}) | ||||
self.session.headers.update( | |||||
{ | |||||
"Accept": "application/json", | |||||
"User-Agent": USER_AGENT, | |||||
} | |||||
) | |||||
self.jar_origins: Dict[str, ListedOrigin] = {} | self.jar_origins: Dict[str, ListedOrigin] = {} | ||||
self.github_session = GitHubSession( | self.github_session = GitHubSession( | ||||
credentials=self.credentials, user_agent=USER_AGENT | credentials=self.credentials, user_agent=USER_AGENT | ||||
) | ) | ||||
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
    """Rebuild a ``MavenListerState`` from its dictionary form.

    Every key of ``d`` is forwarded as a keyword argument to the
    ``MavenListerState`` constructor.
    """
    restored_state = MavenListerState(**d)
    return restored_state
def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
    """Serialize the lister state into a plain dictionary.

    The conversion is delegated to ``dataclasses.asdict``, so the result
    maps each field name of ``state`` to its value.
    """
    serialized: Dict[str, Any] = asdict(state)
    return serialized
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
    """Fetch ``url`` through the lister's HTTP session and return the response.

    Args:
        url: address to query with a GET request.
        params: query-string parameters passed along with the request.

    Returns:
        The response object obtained from ``self.session``.

    Raises:
        requests.HTTPError: when ``raise_for_status`` rejects the response
            (4xx/5xx status codes).

    The method is decorated with ``http_retry``, configured to log at
    WARNING level before each sleep between attempts.
    """
    logger.info("Fetching URL %s with params %s", url, params)
    reply = self.session.get(url, params=params)

    # Anything other than a plain 200 is worth a trace, including the body,
    # even when it is not an error status.
    unexpected_status = reply.status_code != 200
    if unexpected_status:
        logger.warning(
            "Unexpected HTTP status code %s on %s: %s",
            reply.status_code,
            reply.url,
            reply.content,
        )

    # Called unconditionally: it only raises for error statuses.
    reply.raise_for_status()
    return reply
def get_pages(self) -> Iterator[RepoPage]: | def get_pages(self) -> Iterator[RepoPage]: | ||||
"""Retrieve and parse exported maven indexes to | """Retrieve and parse exported maven indexes to | ||||
identify all pom files and src archives. | identify all pom files and src archives. | ||||
""" | """ | ||||
# Example of returned RepoPage's: | # Example of returned RepoPage's: | ||||
# [ | # [ | ||||
# { | # { | ||||
Show All 10 Lines | def get_pages(self) -> Iterator[RepoPage]: | ||||
# "project": "openengsb-framework", | # "project": "openengsb-framework", | ||||
# }, | # }, | ||||
# ... | # ... | ||||
# ] | # ] | ||||
# Download the main text index file. | # Download the main text index file. | ||||
logger.info("Downloading computed index from %s.", self.INDEX_URL) | logger.info("Downloading computed index from %s.", self.INDEX_URL) | ||||
assert self.INDEX_URL is not None | assert self.INDEX_URL is not None | ||||
response = requests.get(self.INDEX_URL, stream=True) | try: | ||||
if response.status_code != 200: | response = self.http_request(self.INDEX_URL, stream=True) | ||||
except requests.HTTPError: | |||||
logger.error("Index %s not found, stopping", self.INDEX_URL) | logger.error("Index %s not found, stopping", self.INDEX_URL) | ||||
response.raise_for_status() | raise | ||||
# Prepare regexes to parse index exports. | # Prepare regexes to parse index exports. | ||||
# Parse doc id. | # Parse doc id. | ||||
# Example line: "doc 13" | # Example line: "doc 13" | ||||
re_doc = re.compile(r"^doc (?P<doc>\d+)$") | re_doc = re.compile(r"^doc (?P<doc>\d+)$") | ||||
# Parse gid, aid, version, classifier, extension. | # Parse gid, aid, version, classifier, extension. | ||||
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines | def get_pages(self) -> Iterator[RepoPage]: | ||||
yield artifact_metadata_d | yield artifact_metadata_d | ||||
url_src = None | url_src = None | ||||
logger.info("Found %s poms.", len(out_pom)) | logger.info("Found %s poms.", len(out_pom)) | ||||
# Now fetch pom files and scan them for scm info. | # Now fetch pom files and scan them for scm info. | ||||
logger.info("Fetching poms..") | logger.info("Fetching poms..") | ||||
for pom in out_pom: | for pom_url in out_pom: | ||||
try: | try: | ||||
response = self.page_request(pom, {}) | response = self.http_request(pom_url) | ||||
parsed_pom = BeautifulSoup(response.content, "xml") | parsed_pom = BeautifulSoup(response.content, "xml") | ||||
project = parsed_pom.find("project") | project = parsed_pom.find("project") | ||||
if project is None: | if project is None: | ||||
continue | continue | ||||
scm = project.find("scm") | scm = project.find("scm") | ||||
if scm is not None: | if scm is not None: | ||||
connection = scm.find("connection") | connection = scm.find("connection") | ||||
if connection is not None: | if connection is not None: | ||||
artifact_metadata_d = { | artifact_metadata_d = { | ||||
"type": "scm", | "type": "scm", | ||||
"doc": out_pom[pom], | "doc": out_pom[pom_url], | ||||
"url": connection.text, | "url": connection.text, | ||||
} | } | ||||
logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) | logger.debug( | ||||
"* Yielding pom %s: %s", pom_url, artifact_metadata_d | |||||
) | |||||
yield artifact_metadata_d | yield artifact_metadata_d | ||||
else: | else: | ||||
logger.debug("No scm.connection in pom %s", pom) | logger.debug("No scm.connection in pom %s", pom_url) | ||||
else: | else: | ||||
logger.debug("No scm in pom %s", pom) | logger.debug("No scm in pom %s", pom_url) | ||||
except requests.HTTPError: | except requests.HTTPError: | ||||
logger.warning( | logger.warning( | ||||
"POM info page could not be fetched, skipping project '%s'", | "POM info page could not be fetched, skipping project '%s'", | ||||
pom, | pom_url, | ||||
) | ) | ||||
except lxml.etree.Error as error: | except lxml.etree.Error as error: | ||||
logger.info("Could not parse POM %s XML: %s.", pom, error) | logger.info("Could not parse POM %s XML: %s.", pom_url, error) | ||||
def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: | def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: | ||||
"""Retrieve scm origin out of the page information. Only called when type of the | """Retrieve scm origin out of the page information. Only called when type of the | ||||
page is scm. | page is scm. | ||||
Try and detect an scm/vcs repository. Note that official format is in the form: | Try and detect an scm/vcs repository. Note that official format is in the form: | ||||
scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put | scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put | ||||
the repo url (without the "scm:type"), so we have to check against the content | the repo url (without the "scm:type"), so we have to check against the content | ||||
▲ Show 20 Lines • Show All 139 Lines • Show Last 20 Lines |