swh/lister/maven/lister.py
- This file was added.
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from dataclasses import asdict, dataclass
import logging
import re
from typing import Any, Dict, Iterator, Optional
from urllib.parse import urljoin

import requests
from tenacity.before_sleep import before_sleep_log
from urllib3.util import parse_url
import xmltodict

from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT
from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

RepoPage = Dict[str, Any]


@dataclass
class MavenListerState:
    """State of the MavenLister"""

    last_seen_doc: int = -1
    """Last doc ID ingested during an incremental pass
    """
    last_seen_pom: int = -1
    """Last doc ID related to a pom and ingested during
    an incremental pass
    """


class MavenLister(Lister[MavenListerState, RepoPage]):
    """List origins from a Maven repository.

    Maven Central provides artifacts for Java builds.
    It includes POM files and source archives, which we download to get
    the source code of artifacts and links to their scm repository.

    This lister yields origins of types: git/svn/hg or whatever the artifacts
    use as repository type, plus maven types for the maven loader (tgz, jar).
    """

    LISTER_NAME = "maven"

    def __init__(
        self,
douardda: Why is docker involved here? I see no other mention of docker anywhere in this diff. I don't understand what this "index_url" is and how it is supposed to be used.
borisbaldassari: It's been a long discussion, held back in June on IRC and in task T1724. In a nutshell, we need two tools to transform the maven indexes into something readable: maven-indexer-cli and clue. Rather than have a virtual machine (there is no way to run java code in python without one), it was requested that the tools be put in a docker container. At the end of August the docker image was ready, and olasd asked me (08-25) to put it on a separate server so the lister would simply have to query it on the network. So: index_url is the name (or IP address) of this local server that hosts the docker image.
borisbaldassari: More precisely, index_url is the name (or IP address) of this local server that hosts the docker image *and* the exported indexes to be downloaded.
douardda: OK, so please document all this somewhere (in this diff). The README file should give some high-level explanations, and this docstring should refer to that doc. Also, where is this index file format documented/specified? It should be somewhere.
borisbaldassari: Added a README.md (in this arc diff) and added a link to the readme in the f-string (yet to commit).
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        # credentials: really needed?
        incremental: bool = True,
    ):
        """Lister class for Maven repositories.

        Args:
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                See README.md in this directory for more information.
            instance: Name of maven instance. Defaults to url's network location
                if unset.
            incremental: bool, defaults to True. Defines if incremental listing
                is activated or not.
        """
        self.BASE_URL = url
        self.INDEX_URL = index_url
        self.incremental = incremental

        if instance is None:
            instance = parse_url(url).host

        super().__init__(
            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT,}
        )
    def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
        return MavenListerState(**d)

    def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[RepoPage]:
        """Retrieve and parse exported maven indexes to
        identify all pom files and src archives.
        """
        # Example of returned RepoPage's:
        # [
douardda: Why not use the context manager API of the `NamedTemporaryFile` here?
borisbaldassari: As far as I can remember, because I wanted to stream it in order to reduce the memory footprint: the download can be huge.
douardda: The context manager is unrelated to whether the file is loaded in RAM or not.
borisbaldassari: Done.
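For reference, a minimal sketch (not part of this diff) of the context-manager form discussed in the thread just above; the URL is a made-up placeholder for `self.INDEX_URL`. The download is still streamed chunk by chunk, so memory stays bounded, while the temporary file is closed and removed automatically. The later refactoring in this diff drops the temporary file entirely in favour of `iter_lines()`.

```
import requests
from tempfile import NamedTemporaryFile

index_url = "http://localhost/export.fld"  # illustrative stand-in for self.INDEX_URL

# Stream the (potentially huge) export into a temporary file; the context
# manager closes and deletes the file automatically when the block exits.
with NamedTemporaryFile(mode="w+b") as tmp:
    response = requests.get(index_url, stream=True)
    response.raise_for_status()
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        tmp.write(chunk)
    tmp.flush()
    tmp.seek(0)
    # ... read tmp line by line and apply the regexes here ...
```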
        #   {
        #     "type": "maven",
        #     "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
        #     "time": 1626109619335,
        #     "gid": "org.xwiki.platform",
        #     "aid": "xwiki-platform-wikistream-events-xwiki",
        #     "version": "5.4.2"
douardda: Please document these regexes a bit. Also please prefer named match groups when possible, `(?P<name>...)`, which helps to "self-document" regexes.
borisbaldassari: You're definitely right. Fixed, thanks.
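As a quick illustration of the named-group style requested above, a sketch (not part of the diff) that matches the documented example lines with the same shape of regexes as the `re_doc` / `re_val` expressions defined further down, and reads the fields through `groupdict()`:

```
import re

re_doc = re.compile(r"^doc (?P<doc>\d+)$")
re_val = re.compile(
    r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
    r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
)

m = re_doc.match("doc 13")
assert m is not None and int(m.group("doc")) == 13

m = re_val.match("    value al.aldi|sprova4j|0.1.0|sources|jar")
assert m is not None
# Named groups make the parsed fields self-documenting:
# {'gid': 'al.aldi', 'aid': 'sprova4j', 'version': '0.1.0',
#  'classifier': 'sources', 'ext': 'jar'}
print(m.groupdict())
```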
        #   },
        #   {
        #     "type": "scm",
        #     "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
        #     "project": "openengsb-framework",
        #   },
        #   ...
douardda: A python text file object is iterable, so one would prefer the form:

```
for line in file_txt:
    [...]
```

borisbaldassari: Yes, I'm shameful. The reason for this is that there is a second readline later on in the loop, and…
borisbaldassari: Fixed with refactoring (see below).
        # ]
douardda: `while line:` is enough here.
borisbaldassari: Fixed, thanks.
        # Download the main text index file.
        logger.info(f"Downloading text index from {self.INDEX_URL}.")
        assert self.INDEX_URL is not None
        response = requests.get(self.INDEX_URL, stream=True)
douardda: This could be handled by the regex itself.
borisbaldassari: Yes, it could. But it seems to me that readability is better this way. As you say. Want me to…
        response.raise_for_status()

        # Prepare regexes to parse index exports.

        # Parse doc id.
        # Example line: "doc 13"
        re_doc = re.compile(r"^doc (?P<doc>\d+)$")

        # Parse gid, aid, version, classifier, extension.
        # Example line: "    value al.aldi|sprova4j|0.1.0|sources|jar"
        re_val = re.compile(
            r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
            + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
        )

        # Parse last modification time.
douardda: Why use `urljoin` while hand-building the URL by concatenation of strings with '/'? I mean, `urljoin` does support multiple arguments, like: `urljoin(self.BASE_URL, path, aid, version, f"{aid}-{version}.{ext}")`.
borisbaldassari: Hum. It yields [1] when I try, and that's not what I've read [2]. Am I missing something? Note: I have fixed the ugliness of it by using f-strings, and it looks a lot better.
[1] "TypeError: urljoin() takes from 2 to 3 positional arguments but 6 were given"
douardda: No, you are right, my mistake, I was assuming urljoin has a decent API, which is not the case. Sorry.
borisbaldassari: I did assume that too at some point, rings a bell. Yeah.
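A small sketch of the point settled above (values are made up): `urljoin` only takes a base and one relative URL, so the path is assembled first with an f-string, the way the diff does further down:

```
from urllib.parse import urljoin

base_url = "https://repo1.maven.org/maven2/"  # example value for self.BASE_URL
gid, aid, version, ext = "al.aldi", "sprova4j", "0.1.0", "pom"  # made-up artifact

# urljoin() only accepts a base and a single URL, so the relative path is
# built first, then resolved against the base.
path = "/".join(gid.split("."))
url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}"
url_pom = urljoin(base_url, url_path)
# -> https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom
print(url_pom)
```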
        # Example line: "    value jar|1626109619335|14316|2|2|0|jar"
        re_time = re.compile(
            r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
douardda: `and ext in ("zip", "jar")`
douardda: BTW, what about an upper case `ext` value?
borisbaldassari: ext in (a, b) => Far more elegant, of course. Fixed, thanks. Uppercase => Yes, good point. As a matter of fact there are no uppercase extensions on maven central (just checked), but I'm not sure why (part of the maven convention, maybe?) and that can surely happen.
+ r"\|([^|]+)\|([^|]+)$" | ||||||||||||||||||
) | ||||||||||||||||||
        # Read file line by line and process it
        out_pom: Dict = {}
        jar_src: Dict = {}
        doc_id: int = 0
        jar_src["doc"] = None
        url_src = None

        iterator = response.iter_lines(chunk_size=1024)
        for line_bytes in iterator:
            # Read the index text export and get URLs and SCMs.
            line = line_bytes.decode(errors="ignore")
            m_doc = re_doc.match(line)
            if m_doc is not None:
                doc_id = int(m_doc.group("doc"))
                if (
                    self.incremental
                    and self.state
douardda: `out_src[url_src] = {"g": gid, "a": aid, "v": version}` — not sure `out_src` and `out_pom` really need to be defaultdict actually.
borisbaldassari: For uniqueness of entries (some entries tend to appear a few times). Is there a way to do it better with Python?
douardda: I don't see how using defaultdict is related to this uniqueness question. What does it bring…
borisbaldassari: OK, I get it. That's probably an old Perl habit to explicitly have hashes of hashes. Fixed to…
                    and self.state.last_seen_doc
                    and self.state.last_seen_doc >= doc_id
                ):
                    # jar_src["doc"] contains the id of the current document, whatever
                    # its type (scm or jar).
                    jar_src["doc"] = None
                else:
                    jar_src["doc"] = doc_id
            else:
                # If incremental mode, we don't record any line that is
douardda: Now that I read this, why not do the processing on the fly? Why bother storing the file on disk and then reading it back line by line to run a bunch of regexes? requests does provide a nice API for this.
borisbaldassari: Short answer: because I didn't know of iter_*lines*. Oh god. :-) Thanks!!
borisbaldassari: That's exactly why we need peers: to get out of our own train of thought. OK, fixed as you proposed. The file is now parsed as it is downloaded, and that solved a few other points. When the design is broken, everything looks weird, right? Thanks a lot for the feedback!
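A minimal sketch of the streaming approach agreed on here (the URL is an illustrative stand-in for `self.INDEX_URL`): requests streams the body and `iter_lines()` yields it line by line, so nothing has to be written to disk first.

```
import requests

index_url = "http://localhost/export.fld"  # illustrative stand-in for self.INDEX_URL

response = requests.get(index_url, stream=True)
response.raise_for_status()

# iter_lines() walks the streamed body lazily, one line at a time, so the
# whole export never has to sit in memory or on disk.
for line_bytes in response.iter_lines(chunk_size=1024):
    line = line_bytes.decode(errors="ignore")
    if line.startswith("doc "):
        print(line)
```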
                # before our last recorded doc id.
                if self.incremental and jar_src["doc"] is None:
                    continue
                m_val = re_val.match(line)
                if m_val is not None:
douardda: No need for the `.keys()` here, iterating on a dict is iterating on its keys: `for src, val in out_src.items(): ...`. BTW, if you use proper keys in `out_src[*]` (i.e. "time" instead of "t" and so on) you can just use it as is here: `yield {"type": "jar", "url": src, **val}`. But more…
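To make the suggestion concrete, a tiny sketch (made-up values, not part of the diff) of how dict unpacking merges the stored fields straight into the yielded page once the stored keys match the page fields:

```
# Made-up entry whose keys match the fields of the yielded page.
src = "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar"
val = {"gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", "time": 1626109619335}

# With "proper" keys, ** unpacking copies the stored fields into the page,
# no per-field assignment needed.
page = {"type": "maven", "url": src, **val}
assert page["aid"] == "sprova4j" and page["time"] == 1626109619335
```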
                    (gid, aid, version, classifier, ext) = m_val.groups()
                    ext = ext.strip()
                    path = "/".join(gid.split("."))
                    if classifier == "NA" and ext.lower() == "pom":
                        # If incremental mode, we don't record any line that is
                        # before our last recorded doc id.
                        if (
                            self.incremental
                            and self.state
                            and self.state.last_seen_pom
                            and self.state.last_seen_pom >= doc_id
                        ):
                            continue
vlorentz: That case is missing from the example in the comment above.
borisbaldassari: That's a very good point. This kind of metadata is useful for the jar loader, but not so much for the other types of scm loaders (scm_type, which could be about anything). Do we want to keep them?
                        url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}"
                        url_pom = urljoin(self.BASE_URL, url_path)
                        out_pom[url_pom] = doc_id
                    elif (
                        classifier.lower() == "sources" or ("src" in classifier)
douardda: I'm always nervous when I see a `bytes.decode()` called on some content coming from The World™. Is there any chance of getting some encoding error here?
borisbaldassari: Thanks for spotting that, it needs some consideration. Theoretically, no: we're decoding the content of a file downloaded from a local server (transport errors should be ok), which is output by clue asynchronously (i.e. the file is not served if the process fails), so corruption or weird content is unlikely, but… it's still clearly out of our control. So yes, we never know and you're definitely right. OTOH we can't go on without that data, and I have the feeling we should rather fail (throw an exception about decoding and end execution) than pass silently (adding errors='ignore' to decode could do that). Would you like to try & catch, and then throw a specific error? What would you recommend?
=> Added errors='ignore' so the list will simply be empty.
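A small sketch (made-up bytes, not part of the diff) of the trade-off discussed here: a strict decode raises on the first bad byte, while `errors="ignore"` drops it and lets the listing continue with whatever is readable.

```
data = b"doc 13\xff"  # made-up index line containing an invalid UTF-8 byte

try:
    data.decode()  # strict by default: raises on the bad byte
except UnicodeDecodeError as exc:
    print(f"strict decode failed: {exc}")

# errors="ignore" silently drops the offending byte instead of aborting the
# whole listing; the corrupted line simply yields less (or nothing).
assert data.decode(errors="ignore") == "doc 13"
```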
                    ) and ext.lower() in ("zip", "jar"):
                        url_path = (
                            f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}"
                        )
                        url_src = urljoin(self.BASE_URL, url_path)
                        jar_src["gid"] = gid
                        jar_src["aid"] = aid
                        jar_src["version"] = version
                else:
                    m_time = re_time.match(line)
                    if m_time is not None and url_src is not None:
                        time = m_time.group("mtime")
                        jar_src["time"] = int(time)
                        logger.debug(f"* Yielding jar {url_src}.")
                        yield {
douardda: Use `items()` on the dict:

```
for src, project in out_pom.items():
    yield {"type": "scm", "url": src, "project": project}
```

BTW, why build the dict to yield its values just after building it? Why not yield values directly from the `for pom in out_pom` loop?
borisbaldassari: Very good point, moved it to the `for pom in out_pom` loop. Thanks!
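A short sketch of the suggestion above; `iter_scm_pages` is a made-up helper, not part of the diff, that only shows yielding pages directly while walking the dict with `items()`:

```
from typing import Any, Dict, Iterator


def iter_scm_pages(out_pom: Dict[str, int]) -> Iterator[Dict[str, Any]]:
    # Walk the dict once and yield pages directly, instead of building a
    # second structure and iterating over it afterwards.
    for pom_url, doc_id in out_pom.items():
        # A real implementation would fetch and parse the pom here.
        yield {"type": "scm", "doc": doc_id, "url": pom_url}


pages = list(iter_scm_pages({"https://repo1.maven.org/maven2/a/b/1.0/b-1.0.pom": 42}))
assert pages[0]["doc"] == 42
```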
"type": "maven", | ||||||||||||||||||
"url": url_src, | ||||||||||||||||||
**jar_src, | ||||||||||||||||||
} | ||||||||||||||||||
url_src = None | ||||||||||||||||||
logger.info(f"Found {len(out_pom)} poms.") | ||||||||||||||||||
# Now fetch pom files and scan them for scm info. | ||||||||||||||||||
logger.info("Fetching poms..") | ||||||||||||||||||
for pom in out_pom: | ||||||||||||||||||
text = self.page_request(pom, {}) | ||||||||||||||||||
try: | ||||||||||||||||||
project = xmltodict.parse(text.content.decode()) | ||||||||||||||||||
if "scm" in project["project"]: | ||||||||||||||||||
if "connection" in project["project"]["scm"]: | ||||||||||||||||||
scm = project["project"]["scm"]["connection"] | ||||||||||||||||||
gid = project["project"]["groupId"] | ||||||||||||||||||
aid = project["project"]["artifactId"] | ||||||||||||||||||
douardda: No need to compile the regex if it's used only once. Just use the `match` function directly: `m_scm = re.match(r"^scm:([^:]+):(.*)$", page["url"])`. Also please prefer named group matching (https://docs.python.org/3/library/re.html#index-17).
borisbaldassari: Right, fixed, thank you! :-)
                        yield {
                            "type": "scm",
                            "doc": out_pom[pom],
                            "url": scm,
                            "project": f"{gid}.{aid}",
                        }
douardda: What's the comment for?
borisbaldassari: Removed.
                    else:
                        logger.debug(f"No scm.connection in pom {pom}")
                else:
                    logger.debug(f"No scm in pom {pom}")
            except xmltodict.expat.ExpatError as error:
                logger.info(f"Could not parse POM {pom} XML: {error}. Next.")

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a page of Maven repositories into a list of ListedOrigins.
        """
douardda: Do you really need a regex to check for a '.git' at the end? I mean, `page["url"].endswith(".git")` should do the trick here.
        assert self.lister_obj.id is not None
        if page["type"] == "scm":
            # If origin is a scm url: detect scm type and yield.
            # Note that the official format is:
            # scm:git:git://github.com/openengsb/openengsb-framework.git
            # but many, many projects directly put the repo url, so we have to
            # detect the content to match it properly.
douardda: Why the commented line?
            m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
            if m_scm is not None:
                scm_type = m_scm.group("type")
                scm_url = m_scm.group("url")
                origin = ListedOrigin(
                    lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type,
                )
                yield origin
            else:
                if page["url"].endswith(".git"):
                    origin = ListedOrigin(
                        lister_id=self.lister_obj.id, url=page["url"], visit_type="git",
                    )
                    yield origin
        else:
            # Origin is a source archive:
            origin = ListedOrigin(
                lister_id=self.lister_obj.id,
                url=page["url"],
                visit_type=page["type"],
                extra_loader_arguments={
                    "artifacts": [
                        {
                            "time": page["time"],
                            "gid": page["gid"],
                            "aid": page["aid"],
                            "version": page["version"],
                        }
                    ]
                },
            )
            yield origin
    def commit_page(self, page: RepoPage) -> None:
        """Update currently stored state using the latest listed doc.

        Note: this is a noop for full listing mode
        """
        if self.incremental and self.state:
            # We need to differentiate the two state counters according
            # to the type of origin.
            if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc:
                self.state.last_seen_doc = page["doc"]
            elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom:
                self.state.last_seen_doc = page["doc"]
                self.state.last_seen_pom = page["doc"]

    def finalize(self) -> None:
        """Finalize the lister state, set updated if any progress has been made.

        Note: this is a noop for full listing mode
        """
        if self.incremental and self.state:
            last_seen_doc = self.state.last_seen_doc
            last_seen_pom = self.state.last_seen_pom

            scheduler_state = self.get_state_from_scheduler()

            if last_seen_doc and last_seen_pom:
                if (scheduler_state.last_seen_doc < last_seen_doc) or (
                    scheduler_state.last_seen_pom < last_seen_pom
                ):
                    self.updated = True