Differential D8217 Diff 29657 swh/lister/maven/lister.py

Changeset View

Standalone View

View Options

swh/lister/maven/lister.py

# See the AUTHORS file at the top-level directory of this distribution

# License: GNU General Public License version 3, or any later version

# See top-level LICENSE file for more information

from dataclasses import asdict, dataclass

from datetime import datetime, timezone

import logging

import re

from typing import Any, Dict, Iterator, Optional

from urllib.parse import urljoin

from bs4 import BeautifulSoup

import lxml

import requests

from tenacity.before_sleep import before_sleep_log

import xmltodict

from swh.core.github.utils import GitHubSession

from swh.lister.utils import throttling_retry

from swh.scheduler.interface import SchedulerInterface

from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT

from ..pattern import CredentialsType, Lister

▲ Show 20 Lines • Show All 223 Lines • ▼ Show 20 Lines

def get_pages(self) -> Iterator[RepoPage]:

logger.info("Found %s poms.", len(out_pom))

# Now fetch pom files and scan them for scm info.

logger.info("Fetching poms..")

for pom in out_pom:

try:

response = self.page_request(pom, {})

project = xmltodict.parse(response.content)

parsed_pom = BeautifulSoup(response.content, "xml")

project_d = project.get("project", {})

project = parsed_pom.find("project")

scm_d = project_d.get("scm")

if project is None:

if scm_d is not None:

continue

connection = scm_d.get("connection")

scm = project.find("scm")

if scm is not None:

connection = scm.find("connection")

if connection is not None:

artifact_metadata_d = {

"type": "scm",

"doc": out_pom[pom],

"url": connection,

"url": connection.text,

}

logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)

yield artifact_metadata_d

else:

logger.debug("No scm.connection in pom %s", pom)

else:

logger.debug("No scm in pom %s", pom)

except requests.HTTPError:

logger.warning(

"POM info page could not be fetched, skipping project '%s'",

pom,

)

except xmltodict.expat.ExpatError as error:

except lxml.etree.Error as error:

logger.info("Could not parse POM %s XML: %s. Next.", pom, error)

logger.info("Could not parse POM %s XML: %s.", pom, error)

vlorentz (Unsubmitted)

Not Done

pom,

)

- except Exception as error:

+ except lxml.etree.Error as error:

logger.info("Could not parse POM %s XML: %s.", pom, error)

vlorentz:

def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:

"""Retrieve scm origin out of the page information. Only called when type of the

page is scm.

Try and detect an scm/vcs repository. Note that official format is in the form:

scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put

the repo url (without the "scm:type"), so we have to check against the content

▲ Show 20 Lines • Show All 139 Lines • Show Last 20 Lines