Changeset View
Standalone View
swh/loader/package/maven/loader.py
- This file was added.
# Copyright (C) 2021 The Software Heritage developers | ||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||||||
# See top-level LICENSE file for more information | ||||||||||||||||
from datetime import datetime, timezone | ||||||||||||||||
import hashlib | ||||||||||||||||
import json | ||||||||||||||||
import logging | ||||||||||||||||
from os import path | ||||||||||||||||
import string | ||||||||||||||||
from typing import Any, Dict, Iterator, Mapping, Optional, OrderedDict, Sequence, Tuple | ||||||||||||||||
from urllib.parse import urlparse | ||||||||||||||||
import attr | ||||||||||||||||
from swh.loader.package.loader import ( | ||||||||||||||||
BasePackageInfo, | ||||||||||||||||
PackageLoader, | ||||||||||||||||
PartialExtID, | ||||||||||||||||
RawExtrinsicMetadataCore, | ||||||||||||||||
) | ||||||||||||||||
from swh.loader.package.utils import release_name | ||||||||||||||||
from swh.model.model import ( | ||||||||||||||||
MetadataAuthority, | ||||||||||||||||
MetadataAuthorityType, | ||||||||||||||||
ObjectType, | ||||||||||||||||
Person, | ||||||||||||||||
Release, | ||||||||||||||||
Sha1Git, | ||||||||||||||||
TimestampWithTimezone, | ||||||||||||||||
) | ||||||||||||||||
from swh.storage.interface import StorageInterface | ||||||||||||||||
logger = logging.getLogger(__name__) | ||||||||||||||||
SWH_PERSON = Person( | ||||||||||||||||
name=b"Software Heritage", | ||||||||||||||||
fullname=b"Software Heritage", | ||||||||||||||||
email=b"robot@softwareheritage.org", | ||||||||||||||||
) | ||||||||||||||||
ardumont: According to your last version, this is now a iso8601 string.
You could even make that a… | ||||||||||||||||
Done Inline ActionsYes, now it's a string. Updated as suggested to use datetime. borisbaldassari: Yes, now it's a string. Updated as suggested to use datetime. | ||||||||||||||||
Done Inline Actions
It's not a string anymore vlorentz: It's not a string anymore | ||||||||||||||||
@attr.s | ||||||||||||||||
class MavenPackageInfo(BasePackageInfo): | ||||||||||||||||
time = attr.ib(type=str) | ||||||||||||||||
"""Timestamp of the last update of jar file on the server""" | ||||||||||||||||
vlorentzUnsubmitted Done Inline ActionsCould you use structured data (eg. int or preferably datetime) instead of a string? And the documentation in the constructor below should be updated to match, either way vlorentz: Could you use structured data (eg. int or preferably datetime) instead of a string?
If it… | ||||||||||||||||
borisbaldassariAuthorUnsubmitted Done Inline ActionsI think I can make it an int. borisbaldassari: I think I can make it an int.
Will try and update the doc as requested on Thursday. | ||||||||||||||||
# raw_info = attr.ib(type=Dict[str, Any]) | ||||||||||||||||
gid = attr.ib(type=str) | ||||||||||||||||
"""Group ID of the maven artifact""" | ||||||||||||||||
aid = attr.ib(type=str) | ||||||||||||||||
"""Artifact ID of the maven artifact""" | ||||||||||||||||
version = attr.ib(type=str) | ||||||||||||||||
"""Version of the maven artifact""" | ||||||||||||||||
# default format for maven artifacts | ||||||||||||||||
Done Inline ActionsIt doesn't look like it can ever be a datetime. vlorentz: It doesn't look like it can ever be a datetime. | ||||||||||||||||
MANIFEST_FORMAT = string.Template("$gid $aid $version $url $timestamp") | ||||||||||||||||
Done Inline ActionsWhen vlorentz: When | ||||||||||||||||
Done Inline ActionsAs per the Lucene docs, it is the last update time of the file. Fixed. borisbaldassari: As per the Lucene docs, it is the last update time of the file. Fixed. | ||||||||||||||||
Done Inline Actionsthis doesn't seem to be used vlorentz: this doesn't seem to be used | ||||||||||||||||
Done Inline ActionsInteresting. It was initially copied from deposit (which i used as a template to start with). Removed it. borisbaldassari: Interesting. It was initially copied from deposit (which i used as a template to start with). | ||||||||||||||||
def extid(self, manifest_format: Optional[string.Template] = None) -> PartialExtID: | ||||||||||||||||
"""Returns a unique intrinsic identifier of this package info | ||||||||||||||||
``manifest_format`` allows overriding the class' default MANIFEST_FORMAT""" | ||||||||||||||||
manifest_format = manifest_format or self.MANIFEST_FORMAT | ||||||||||||||||
manifest = manifest_format.substitute( | ||||||||||||||||
{ | ||||||||||||||||
"gid": self.gid, | ||||||||||||||||
"aid": self.aid, | ||||||||||||||||
"version": self.version, | ||||||||||||||||
"url": self.url, | ||||||||||||||||
"timestamp": self.time, | ||||||||||||||||
} | ||||||||||||||||
) | ||||||||||||||||
return ("maven-jar", hashlib.sha256(manifest.encode()).digest()) | ||||||||||||||||
@classmethod | ||||||||||||||||
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "MavenPackageInfo": | ||||||||||||||||
url = a_metadata["url"] | ||||||||||||||||
filename = a_metadata.get("filename") | ||||||||||||||||
gid = a_metadata["gid"] | ||||||||||||||||
aid = a_metadata["aid"] | ||||||||||||||||
Done Inline ActionsCan you pass this URL from the lister instead of re-computing it here? Or are we sure this URL will always be right? vlorentz: Can you pass this URL from the lister instead of re-computing it here? Or are we sure this URL… | ||||||||||||||||
Done Inline ActionsNo, we don't have it at the lister level either. And we are not sure the URL exists. This is what I had written in another comment -- but I have the feeling that some of my answers are lost, which does not help. We have no reliable way to know the POM; this is a heuristic that should work most of the time. borisbaldassari: No, we don't have it at the lister level either. And we are not sure the URL exists.
This is… | ||||||||||||||||
version = a_metadata["version"] | ||||||||||||||||
Done Inline Actions@vlorentz Do we allow such side-effects [1] to occur this early in the package loading process? I guess it's fine since it's called at the get_package_info time... [1] any form of interactions with the outside world (db, remote service, file, whatever can raise any form of issues...) ardumont: @vlorentz Do we allow such side-effects [1] to occur this early in the package loading process? | ||||||||||||||||
Done Inline ActionsIt's not too early; but this is outside any exception handling, which means a crash here (and they *will* happen since it's a network request) won't update the visit status, so it must be done somewhere else, eg. by specializing build_extrinsic_directory_metadata. vlorentz: It's not too early; but this is outside any exception handling, which means a crash here (and… | ||||||||||||||||
return cls( | ||||||||||||||||
url=url, | ||||||||||||||||
filename=filename if filename else path.split(url)[-1], | ||||||||||||||||
ardumontUnsubmitted Done Inline Actions
ardumont: | ||||||||||||||||
borisbaldassariAuthorUnsubmitted Done Inline Actionsneat! borisbaldassari: neat! | ||||||||||||||||
time=a_metadata["time"], | ||||||||||||||||
Done Inline Actionsare there any packages without a POM? vlorentz: are there any packages without a POM? | ||||||||||||||||
Done Inline ActionsAs stated above, yes. This is what I explained in another, now invisible answer. Let me look for it. Ah, actually it has simply moved, it's right below: So we can choose between having the XML pom (complete, but not reliable) or having the JSON (incomplete, but always provided). borisbaldassari: As stated above, yes. This is what I explained in another, now invisible answer. Let me look… | ||||||||||||||||
gid=gid, | ||||||||||||||||
aid=aid, | ||||||||||||||||
version=version, | ||||||||||||||||
directory_extrinsic_metadata=[ | ||||||||||||||||
RawExtrinsicMetadataCore( | ||||||||||||||||
format="maven-pom-json", metadata=json.dumps(a_metadata).encode(), | ||||||||||||||||
) | ||||||||||||||||
], | ||||||||||||||||
) | ||||||||||||||||
Done Inline Actions
ardumont: | ||||||||||||||||
class MavenLoader(PackageLoader[MavenPackageInfo]): | ||||||||||||||||
"""Load source code jar origin's artifact files into swh archive | ||||||||||||||||
Done Inline Actions*thumbs up* ardumont: *thumbs up* | ||||||||||||||||
""" | ||||||||||||||||
visit_type = "maven" | ||||||||||||||||
Done Inline ActionsCould you write all the metadata here? ideally the original POM file, instead of converting to JSON vlorentz: Could you write all the metadata here? ideally the original POM file, instead of converting to… | ||||||||||||||||
Done Inline ActionsWe don't have the original POM file around when loading the jar, and I don't see a reliable way to get it. We might use the (gId, aId, version) coordinates to try to download a pom and set it as metadata, but I'm not sure how often it would work. What do you prefer? I've set the a_metadata full info instead of the extract for now. borisbaldassari: We don't have the original POM file around when loading the jar, and I don't see a reliable way… | ||||||||||||||||
def __init__( | ||||||||||||||||
self, | ||||||||||||||||
storage: StorageInterface, | ||||||||||||||||
url: str, | ||||||||||||||||
artifacts: Sequence[Dict[str, Any]], | ||||||||||||||||
extid_manifest_format: Optional[str] = None, | ||||||||||||||||
max_content_size: Optional[int] = None, | ||||||||||||||||
): | ||||||||||||||||
f"""Loader constructor. | ||||||||||||||||
For now, this is the lister's task output. | ||||||||||||||||
There is one, and only one, artefact (jar or zip) per version, as guaranteed by | ||||||||||||||||
the Maven coordinates system. | ||||||||||||||||
Done Inline Actionsis this still true with the latest version of the lister? ardumont: is this still true with the latest version of the lister? | ||||||||||||||||
Done Inline ActionsSee next comment, yes. We introduced last_update as a datetime, but the time parameter passed to the loader is an int. And it stil has the same meaning, i.e. last modification time on the server (as defined in the lucene index at least). Do you think it's not? borisbaldassari: See next comment, yes.
We introduced last_update as a datetime, but the time parameter passed… | ||||||||||||||||
Args: | ||||||||||||||||
url: Origin url | ||||||||||||||||
artifacts: List of single artifact information with keys: | ||||||||||||||||
- **time**: the timestamp of the jar file as an int | ||||||||||||||||
ardumontUnsubmitted Done Inline Actionsis this still true (a timestamp?) regarding the latest version of the lister? ardumont: is this still true (a timestamp?) regarding the latest version of the lister? | ||||||||||||||||
borisbaldassariAuthorUnsubmitted Done Inline ActionsYes, it still is a timestamp. We changed the type to int, but the content is basically the same. borisbaldassari: Yes, it still is a timestamp. We changed the type to int, but the content is basically the same. | ||||||||||||||||
ardumontUnsubmitted Done Inline Actionsping? ardumont: ping? | ||||||||||||||||
- **url**: the artifact url to retrieve filename | ||||||||||||||||
- **filename**: optionally, the file's name | ||||||||||||||||
- **gid**: artifact's groupId | ||||||||||||||||
- **aid**: artifact's artifactId | ||||||||||||||||
- **version**: artifact's version | ||||||||||||||||
extid_manifest_format: template string used to format a manifest, | ||||||||||||||||
which is hashed to get the extid of a package. | ||||||||||||||||
Defaults to {MavenPackageInfo.MANIFEST_FORMAT!r} | ||||||||||||||||
""" | ||||||||||||||||
super().__init__(storage=storage, url=url, max_content_size=max_content_size) | ||||||||||||||||
self.artifacts = artifacts # assume order is enforced in the lister | ||||||||||||||||
self.version_artifact: OrderedDict[str, Dict[str, Any]] | ||||||||||||||||
self.version_artifact = OrderedDict( | ||||||||||||||||
{str(jar.get("version")): jar for jar in artifacts if jar.get("version")} | ||||||||||||||||
) | ||||||||||||||||
Done Inline Actions
Internal state of version to the associated jar (assuming on jar per version). ardumont: Internal state of version to the associated jar (assuming on jar per version).
(you may want to… | ||||||||||||||||
Done Inline Actions
Assumption is correct.
Done. borisbaldassari: > Internal state of version to the associated jar (assuming on jar per version).
Assumption is… | ||||||||||||||||
def get_versions(self) -> Sequence[str]: | ||||||||||||||||
return list(self.version_artifact.keys()) | ||||||||||||||||
def get_default_version(self) -> str: | ||||||||||||||||
# Default version is the last item | ||||||||||||||||
return self.artifacts[-1]["version"] | ||||||||||||||||
Done Inline ActionsWhat is this for? vlorentz: What is this for? | ||||||||||||||||
Done Inline ActionsHum. Interesting, thanks for pointing that out. I thought it was somehow used by the loading mechanism to register its state, but on a second look it's probably just a carryover from the example I took in the archive loader. Removed. borisbaldassari: Hum. Interesting, thanks for pointing that out.
I thought it was somehow used by the loading… | ||||||||||||||||
Done Inline Actions
to stay consistent with the other suggestion. ardumont: to stay consistent with the other suggestion. | ||||||||||||||||
def get_metadata_authority(self): | ||||||||||||||||
p_url = urlparse(self.url) | ||||||||||||||||
return MetadataAuthority( | ||||||||||||||||
Done Inline Actions
Assuming my early assumption about 1 artifact per version is correct, mention it before in the constructor maybe. ardumont: Assuming my early assumption about 1 artifact per version is correct, mention it before in the… | ||||||||||||||||
type=MetadataAuthorityType.FORGE, | ||||||||||||||||
url=f"{p_url.scheme}://{p_url.netloc}/", | ||||||||||||||||
metadata={}, | ||||||||||||||||
) | ||||||||||||||||
Done Inline ActionsYou can remove this, it's not used vlorentz: You can remove this, it's not used | ||||||||||||||||
def get_package_info(self, version: str) -> Iterator[Tuple[str, MavenPackageInfo]]: | ||||||||||||||||
a_metadata = self.version_artifact[version] | ||||||||||||||||
yield release_name(a_metadata["version"]), MavenPackageInfo.from_metadata( | ||||||||||||||||
a_metadata | ||||||||||||||||
) | ||||||||||||||||
Done Inline ActionsIs the version ever missing from the input? Can we be sure there is at most one package with a missing version per (gid, aid)? vlorentz: Is the version ever missing from the input? Can we be sure there is at most one package with a… | ||||||||||||||||
Done Inline ActionsNo, the lister is expected to always provide the version (alongside the gid and aid). If you're wondering why having this loop, it's because it's more elegant than artefact[0] and it feels more safe. But I can change it. (Note that there is only one artefact in the artifact list, as identified by the url.) borisbaldassari: No, the lister is expected to //always// provide the version (alongside the gid and aid). If… | ||||||||||||||||
Done Inline ActionsI don't think this is addressed, there is still this code: self.version_artifact = OrderedDict( {str(jar.get("version")): jar for jar in artifacts if jar.get("version")} ) vlorentz: I don't think this is addressed, there is still this code:
```
self.version_artifact =… | ||||||||||||||||
Done Inline ActionsI don't know. I'm not sure to understand what you mean or expect. What I know:
To answer your initial comment, when the loader is invoked there is indeed at most one package (whatever the gid, aid, version). Is there a specific case/situation you have in mind? borisbaldassari: I don't know. I'm not sure to understand what you mean or expect.
What I know:
* This loader… | ||||||||||||||||
Done Inline Actionsping? vlorentz: ping? | ||||||||||||||||
def build_release( | ||||||||||||||||
self, p_info: MavenPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||||||||||||||
) -> Optional[Release]: | ||||||||||||||||
time = p_info.time | ||||||||||||||||
Done Inline Actions
That smells funny to me. You will always load the metadata from the first artifact here. You need to have an inverted dict from version to artifact initialized before this call. ardumont: That smells funny to me. You will always load the metadata from the first artifact here.
You… | ||||||||||||||||
Done Inline ActionsThanks for the whole proposed change. Yes, I agree it's much more clean this way. borisbaldassari: Thanks for the whole proposed change. Yes, I agree it's much more clean this way. | ||||||||||||||||
Done Inline Actionspush that next to the parsing step? ardumont: push that next to the parsing step?
that may even need to be part of the lister? | ||||||||||||||||
Done Inline ActionsMoved that next to the parsing step. I don't think it should be put in the lister, as what is yielded by the lister is already a UTC time. This line simply sets the loader local var as a UTC time to be interpreted correctly. borisbaldassari: Moved that next to the parsing step.
I don't think it should be put in the lister, as what is… | ||||||||||||||||
msg = f"Synthetic release for archive at {p_info.url}\n".encode("utf-8") | ||||||||||||||||
if isinstance(time, datetime): | ||||||||||||||||
parsed_time = time | ||||||||||||||||
vlorentzUnsubmitted Done Inline Actionsdead code according to the type of p_info.time vlorentz: dead code according to the type of `p_info.time` | ||||||||||||||||
else: # assume it's a timestamp (in milliseconds) | ||||||||||||||||
raw_time = int(time) | ||||||||||||||||
parsed_time = datetime.fromtimestamp(raw_time / 1e3) | ||||||||||||||||
parsed_time = parsed_time.astimezone(tz=timezone.utc) | ||||||||||||||||
ardumontUnsubmitted Done Inline ActionsI believe that it is now the else conditional which is the dead code [1] [1] regarding the latest version of the lister (which does the timestamp conversion and parsing). ardumont: I believe that it is now the else conditional which is the dead code [1]
[1] regarding the… | ||||||||||||||||
borisbaldassariAuthorUnsubmitted Done Inline ActionsHum. No, I don't think so. The last_update parameter has been added as a datetime for the scheduler, but the time parameter passed to the loader is still an int. Am I missing something? borisbaldassari: Hum. No, I don't think so. The last_update parameter has been added as a datetime for the… | ||||||||||||||||
normalized_time = TimestampWithTimezone.from_datetime(parsed_time) | ||||||||||||||||
return Release( | ||||||||||||||||
name=p_info.version.encode(), | ||||||||||||||||
message=msg, | ||||||||||||||||
date=normalized_time, | ||||||||||||||||
author=SWH_PERSON, | ||||||||||||||||
target=directory, | ||||||||||||||||
target_type=ObjectType.DIRECTORY, | ||||||||||||||||
synthetic=True, | ||||||||||||||||
) | ||||||||||||||||
def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: | ||||||||||||||||
last_snapshot = self.last_snapshot() | ||||||||||||||||
return last_snapshot.to_dict()["branches"] if last_snapshot else {} | ||||||||||||||||
Done Inline Actions
vlorentz: | ||||||||||||||||
Done Inline ActionsThis should be the empty Person, as documented vlorentz: This should be the empty Person, as documented | ||||||||||||||||
Done Inline ActionsI've updated the documentation to use "SWH robot", for consistency with the deposit and archive loaders (which I've used intensively as examples). borisbaldassari: I've updated the documentation to use "SWH robot", for consistency with the deposit and archive… | ||||||||||||||||
Done Inline Actionsping? vlorentz: ping? | ||||||||||||||||
Done Inline ActionsAs mentioned I've updated the package-loader-specifications.rst file to be consistent with other loaders. It was a mistake from me to create the maven entry with an empty person at first. borisbaldassari: As mentioned I've updated the package-loader-specifications.rst file to be consistent with… |
According to your last version, this is now a iso8601 string.
You could even make that a datetime now providing you adapt the from_metadata call to actually parse the date string as a datetime.
And then slightly adapt the loader to "just use" that time (instead of parsing it)