Differential D8520 Diff 30745 swh/lister/maven/lister.py

Changeset View

Standalone View

swh/lister/maven/lister.py

# Copyright (C) 2021-2022 The Software Heritage developers		# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

from dataclasses import asdict, dataclass		from dataclasses import asdict, dataclass
from datetime import datetime, timezone		from datetime import datetime, timezone
import logging		import logging
import re		import re
from typing import Any, Dict, Iterator, Optional		from typing import Any, Dict, Iterator, Optional
from urllib.parse import urljoin		from urllib.parse import urljoin

from bs4 import BeautifulSoup		from bs4 import BeautifulSoup
import lxml		import lxml
import requests		import requests
from tenacity.before_sleep import before_sleep_log

from swh.core.github.utils import GitHubSession		from swh.core.github.utils import GitHubSession
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface		from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin		from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT		from .. import USER_AGENT
from ..pattern import CredentialsType, Lister		from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)		logger = logging.getLogger(__name__)

▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	):

super().__init__(		super().__init__(
scheduler=scheduler,		scheduler=scheduler,
credentials=credentials,		credentials=credentials,
url=url,		url=url,
instance=instance,		instance=instance,
)		)

self.session = requests.Session()		self.session.headers.update({"Accept": "application/json"})
self.session.headers.update(
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
}
)

self.jar_origins: Dict[str, ListedOrigin] = {}		self.jar_origins: Dict[str, ListedOrigin] = {}
self.github_session = GitHubSession(		self.github_session = GitHubSession(
credentials=self.credentials, user_agent=USER_AGENT		credentials=self.credentials, user_agent=USER_AGENT
)		)

def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:		def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
return MavenListerState(**d)		return MavenListerState(**d)

def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:		def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
return asdict(state)		return asdict(state)

@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:

logger.info("Fetching URL %s with params %s", url, params)

response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()

return response

def get_pages(self) -> Iterator[RepoPage]:		def get_pages(self) -> Iterator[RepoPage]:
"""Retrieve and parse exported maven indexes to		"""Retrieve and parse exported maven indexes to
identify all pom files and src archives.		identify all pom files and src archives.
"""		"""

# Example of returned RepoPage's:		# Example of returned RepoPage's:
# [		# [
# {		# {
Show All 10 Lines	def get_pages(self) -> Iterator[RepoPage]:
# "project": "openengsb-framework",		# "project": "openengsb-framework",
# },		# },
# ...		# ...
# ]		# ]

# Download the main text index file.		# Download the main text index file.
logger.info("Downloading computed index from %s.", self.INDEX_URL)		logger.info("Downloading computed index from %s.", self.INDEX_URL)
assert self.INDEX_URL is not None		assert self.INDEX_URL is not None
response = requests.get(self.INDEX_URL, stream=True)		try:
if response.status_code != 200:		response = self.http_request(self.INDEX_URL, stream=True)
		except requests.HTTPError:
logger.error("Index %s not found, stopping", self.INDEX_URL)		logger.error("Index %s not found, stopping", self.INDEX_URL)
response.raise_for_status()		raise

# Prepare regexes to parse index exports.		# Prepare regexes to parse index exports.

# Parse doc id.		# Parse doc id.
# Example line: "doc 13"		# Example line: "doc 13"
re_doc = re.compile(r"^doc (?P<doc>\d+)$")		re_doc = re.compile(r"^doc (?P<doc>\d+)$")

# Parse gid, aid, version, classifier, extension.		# Parse gid, aid, version, classifier, extension.
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	def get_pages(self) -> Iterator[RepoPage]:
yield artifact_metadata_d		yield artifact_metadata_d
url_src = None		url_src = None

logger.info("Found %s poms.", len(out_pom))		logger.info("Found %s poms.", len(out_pom))

# Now fetch pom files and scan them for scm info.		# Now fetch pom files and scan them for scm info.

logger.info("Fetching poms..")		logger.info("Fetching poms..")
for pom in out_pom:		for pom_url in out_pom:
try:		try:
response = self.page_request(pom, {})		response = self.http_request(pom_url)
parsed_pom = BeautifulSoup(response.content, "xml")		parsed_pom = BeautifulSoup(response.content, "xml")
project = parsed_pom.find("project")		project = parsed_pom.find("project")
if project is None:		if project is None:
continue		continue
scm = project.find("scm")		scm = project.find("scm")
if scm is not None:		if scm is not None:
connection = scm.find("connection")		connection = scm.find("connection")
if connection is not None:		if connection is not None:
artifact_metadata_d = {		artifact_metadata_d = {
"type": "scm",		"type": "scm",
"doc": out_pom[pom],		"doc": out_pom[pom_url],
"url": connection.text,		"url": connection.text,
}		}
logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)		logger.debug(
		"* Yielding pom %s: %s", pom_url, artifact_metadata_d
		)
yield artifact_metadata_d		yield artifact_metadata_d
else:		else:
logger.debug("No scm.connection in pom %s", pom)		logger.debug("No scm.connection in pom %s", pom_url)
else:		else:
logger.debug("No scm in pom %s", pom)		logger.debug("No scm in pom %s", pom_url)
except requests.HTTPError:		except requests.HTTPError:
logger.warning(		logger.warning(
"POM info page could not be fetched, skipping project '%s'",		"POM info page could not be fetched, skipping project '%s'",
pom,		pom_url,
)		)
except lxml.etree.Error as error:		except lxml.etree.Error as error:
logger.info("Could not parse POM %s XML: %s.", pom, error)		logger.info("Could not parse POM %s XML: %s.", pom_url, error)

def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:		def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
"""Retrieve scm origin out of the page information. Only called when type of the		"""Retrieve scm origin out of the page information. Only called when type of the
page is scm.		page is scm.

Try and detect an scm/vcs repository. Note that official format is in the form:		Try and detect an scm/vcs repository. Note that official format is in the form:
scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put		scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put
the repo url (without the "scm:type"), so we have to check against the content		the repo url (without the "scm:type"), so we have to check against the content
▲ Show 20 Lines • Show All 139 Lines • Show Last 20 Lines