Changeset View
Standalone View
swh/loader/package/maven/loader.py
- This file was added.
# Copyright (C) 2021 The Software Heritage developers | ||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||||||
# See top-level LICENSE file for more information | ||||||||||||||||
from datetime import datetime, timezone | ||||||||||||||||
import hashlib | ||||||||||||||||
import json | ||||||||||||||||
import logging | ||||||||||||||||
from os import path | ||||||||||||||||
import string | ||||||||||||||||
from typing import ( | ||||||||||||||||
Any, | ||||||||||||||||
Dict, | ||||||||||||||||
Iterator, | ||||||||||||||||
List, | ||||||||||||||||
Mapping, | ||||||||||||||||
Optional, | ||||||||||||||||
OrderedDict, | ||||||||||||||||
Sequence, | ||||||||||||||||
Tuple, | ||||||||||||||||
) | ||||||||||||||||
from urllib.parse import urlparse | ||||||||||||||||
import attr | ||||||||||||||||
import iso8601 | ||||||||||||||||
import requests | ||||||||||||||||
from swh.loader.package.loader import ( | ||||||||||||||||
BasePackageInfo, | ||||||||||||||||
PackageLoader, | ||||||||||||||||
PartialExtID, | ||||||||||||||||
RawExtrinsicMetadataCore, | ||||||||||||||||
) | ||||||||||||||||
from swh.loader.package.utils import EMPTY_AUTHOR, release_name | ||||||||||||||||
from swh.model.model import ( | ||||||||||||||||
MetadataAuthority, | ||||||||||||||||
MetadataAuthorityType, | ||||||||||||||||
ObjectType, | ||||||||||||||||
RawExtrinsicMetadata, | ||||||||||||||||
ardumont: According to your last version, this is now a iso8601 string.
You could even make that a… | ||||||||||||||||
Done Inline ActionsYes, now it's a string. Updated as suggested to use datetime. borisbaldassari: Yes, now it's a string. Updated as suggested to use datetime. | ||||||||||||||||
Release, | ||||||||||||||||
Sha1Git, | ||||||||||||||||
Done Inline Actions
It's not a string anymore vlorentz: It's not a string anymore | ||||||||||||||||
TimestampWithTimezone, | ||||||||||||||||
) | ||||||||||||||||
from swh.storage.interface import StorageInterface | ||||||||||||||||
Done Inline ActionsCould you use structured data (eg. int or preferably datetime) instead of a string? And the documentation in the constructor below should be updated to match, either way vlorentz: Could you use structured data (eg. int or preferably datetime) instead of a string?
If it… | ||||||||||||||||
Done Inline ActionsI think I can make it an int. borisbaldassari: I think I can make it an int.
Will try and update the doc as requested on Thursday. | ||||||||||||||||
logger = logging.getLogger(__name__) | ||||||||||||||||
@attr.s | ||||||||||||||||
class MavenPackageInfo(BasePackageInfo): | ||||||||||||||||
time = attr.ib(type=datetime) | ||||||||||||||||
"""Timestamp of the last update of jar file on the server.""" | ||||||||||||||||
gid = attr.ib(type=str) | ||||||||||||||||
"""Group ID of the maven artifact""" | ||||||||||||||||
Done Inline ActionsIt doesn't look like it can ever be a datetime. vlorentz: It doesn't look like it can ever be a datetime. | ||||||||||||||||
aid = attr.ib(type=str) | ||||||||||||||||
Done Inline ActionsWhen vlorentz: When | ||||||||||||||||
Done Inline ActionsAs per the Lucene docs, it is the last update time of the file. Fixed. borisbaldassari: As per the Lucene docs, it is the last update time of the file. Fixed. | ||||||||||||||||
"""Artifact ID of the maven artifact""" | ||||||||||||||||
Done Inline Actionsthis doesn't seem to be used vlorentz: this doesn't seem to be used | ||||||||||||||||
Done Inline ActionsInteresting. It was initially copied from deposit (which i used as a template to start with). Removed it. borisbaldassari: Interesting. It was initially copied from deposit (which i used as a template to start with). | ||||||||||||||||
version = attr.ib(type=str) | ||||||||||||||||
"""Version of the maven artifact""" | ||||||||||||||||
# default format for maven artifacts | ||||||||||||||||
MANIFEST_FORMAT = string.Template("$gid $aid $version $url $time") | ||||||||||||||||
def extid(self, manifest_format: Optional[string.Template] = None) -> PartialExtID: | ||||||||||||||||
"""Returns a unique intrinsic identifier of this package info | ||||||||||||||||
``manifest_format`` allows overriding the class' default MANIFEST_FORMAT""" | ||||||||||||||||
manifest_format = manifest_format or self.MANIFEST_FORMAT | ||||||||||||||||
manifest = manifest_format.substitute( | ||||||||||||||||
{ | ||||||||||||||||
"gid": self.gid, | ||||||||||||||||
"aid": self.aid, | ||||||||||||||||
"version": self.version, | ||||||||||||||||
"url": self.url, | ||||||||||||||||
"time": str(self.time), | ||||||||||||||||
} | ||||||||||||||||
) | ||||||||||||||||
return ("maven-jar", hashlib.sha256(manifest.encode()).digest()) | ||||||||||||||||
Done Inline ActionsCan you pass this URL from the lister instead of re-computing it here? Or are we sure this URL will always be right? vlorentz: Can you pass this URL from the lister instead of re-computing it here? Or are we sure this URL… | ||||||||||||||||
Done Inline ActionsNo, we don't have it at the lister level either. And we are not sure the URL exists. This is what I had written in another comment -- but I have the feeling that some of my answers are lost, which does not help. We have no reliable way to know the POM; this is a heuristic that should work most of the time. borisbaldassari: No, we don't have it at the lister level either. And we are not sure the URL exists.
This is… | ||||||||||||||||
@classmethod | ||||||||||||||||
Done Inline Actions@vlorentz Do we allow such side-effects [1] to occur this early in the package loading process? I guess it's fine since it's called at the get_package_info time... [1] any form of interactions with the outside world (db, remote service, file, whatever can raise any form of issues...) ardumont: @vlorentz Do we allow such side-effects [1] to occur this early in the package loading process? | ||||||||||||||||
Done Inline ActionsIt's not too early; but this is outside any exception handling, which means a crash here (and they *will* happen since it's a network request) won't update the visit status, so it must be done somewhere else, eg. by specializing build_extrinsic_directory_metadata. vlorentz: It's not too early; but this is outside any exception handling, which means a crash here (and… | ||||||||||||||||
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "MavenPackageInfo": | ||||||||||||||||
url = a_metadata["url"] | ||||||||||||||||
filename = a_metadata.get("filename") | ||||||||||||||||
Done Inline Actions
ardumont: | ||||||||||||||||
Done Inline Actionsneat! borisbaldassari: neat! | ||||||||||||||||
time = iso8601.parse_date(a_metadata["time"]) | ||||||||||||||||
Done Inline Actionsare there any packages without a POM? vlorentz: are there any packages without a POM? | ||||||||||||||||
Done Inline ActionsAs stated above, yes. This is what I explained in another, now invisible answer. Let me look for it. Ah, actually it has simply moved, it's right below: So we can choose between having the XML pom (complete, but not reliable) or having the JSON (incomplete, but always provided). borisbaldassari: As stated above, yes. This is what I explained in another, now invisible answer. Let me look… | ||||||||||||||||
time = time.astimezone(tz=timezone.utc) | ||||||||||||||||
gid = a_metadata["gid"] | ||||||||||||||||
aid = a_metadata["aid"] | ||||||||||||||||
version = a_metadata["version"] | ||||||||||||||||
return cls( | ||||||||||||||||
url=url, | ||||||||||||||||
filename=filename or path.split(url)[-1], | ||||||||||||||||
time=time, | ||||||||||||||||
gid=gid, | ||||||||||||||||
aid=aid, | ||||||||||||||||
Done Inline Actions
ardumont: | ||||||||||||||||
version=version, | ||||||||||||||||
directory_extrinsic_metadata=[ | ||||||||||||||||
RawExtrinsicMetadataCore( | ||||||||||||||||
Done Inline Actions*thumbs up* ardumont: *thumbs up* | ||||||||||||||||
format="maven-json", metadata=json.dumps(a_metadata).encode(), | ||||||||||||||||
), | ||||||||||||||||
], | ||||||||||||||||
) | ||||||||||||||||
Done Inline ActionsCould you write all the metadata here? ideally the original POM file, instead of converting to JSON vlorentz: Could you write all the metadata here? ideally the original POM file, instead of converting to… | ||||||||||||||||
Done Inline ActionsWe don't have the original POM file around when loading the jar, and I don't see a reliable way to get it. We might use the (gId, aId, version) coordinates to try to download a pom and set it as metadata, but I'm not sure how often it would work. What do you prefer? I've set the a_metadata full info instead of the extract for now. borisbaldassari: We don't have the original POM file around when loading the jar, and I don't see a reliable way… | ||||||||||||||||
class MavenLoader(PackageLoader[MavenPackageInfo]): | ||||||||||||||||
"""Load source code jar origin's artifact files into swh archive | ||||||||||||||||
""" | ||||||||||||||||
visit_type = "maven" | ||||||||||||||||
def __init__( | ||||||||||||||||
self, | ||||||||||||||||
storage: StorageInterface, | ||||||||||||||||
url: str, | ||||||||||||||||
artifacts: Sequence[Dict[str, Any]], | ||||||||||||||||
extid_manifest_format: Optional[str] = None, | ||||||||||||||||
Done Inline Actionsis this still true with the latest version of the lister? ardumont: is this still true with the latest version of the lister? | ||||||||||||||||
Done Inline ActionsSee next comment, yes. We introduced last_update as a datetime, but the time parameter passed to the loader is an int. And it stil has the same meaning, i.e. last modification time on the server (as defined in the lucene index at least). Do you think it's not? borisbaldassari: See next comment, yes.
We introduced last_update as a datetime, but the time parameter passed… | ||||||||||||||||
max_content_size: Optional[int] = None, | ||||||||||||||||
): | ||||||||||||||||
f"""Loader constructor. | ||||||||||||||||
For now, this is the lister's task output. | ||||||||||||||||
Done Inline Actionsis this still true (a timestamp?) regarding the latest version of the lister? ardumont: is this still true (a timestamp?) regarding the latest version of the lister? | ||||||||||||||||
Done Inline ActionsYes, it still is a timestamp. We changed the type to int, but the content is basically the same. borisbaldassari: Yes, it still is a timestamp. We changed the type to int, but the content is basically the same. | ||||||||||||||||
Done Inline Actionsping? ardumont: ping? | ||||||||||||||||
There is one, and only one, artefact (jar or zip) per version, as guaranteed by | ||||||||||||||||
the Maven coordinates system. | ||||||||||||||||
Args: | ||||||||||||||||
url: Origin url | ||||||||||||||||
artifacts: List of single artifact information with keys: | ||||||||||||||||
- **time**: the time of the last update of jar file on the server | ||||||||||||||||
as an iso8601 date string | ||||||||||||||||
- **url**: the artifact url to retrieve filename | ||||||||||||||||
- **filename**: optionally, the file's name | ||||||||||||||||
- **gid**: artifact's groupId | ||||||||||||||||
- **aid**: artifact's artifactId | ||||||||||||||||
- **version**: artifact's version | ||||||||||||||||
extid_manifest_format: template string used to format a manifest, | ||||||||||||||||
which is hashed to get the extid of a package. | ||||||||||||||||
Defaults to {MavenPackageInfo.MANIFEST_FORMAT!r} | ||||||||||||||||
Done Inline Actions
Internal state of version to the associated jar (assuming on jar per version). ardumont: Internal state of version to the associated jar (assuming on jar per version).
(you may want to… | ||||||||||||||||
Done Inline Actions
Assumption is correct.
Done. borisbaldassari: > Internal state of version to the associated jar (assuming on jar per version).
Assumption is… | ||||||||||||||||
""" | ||||||||||||||||
super().__init__(storage=storage, url=url, max_content_size=max_content_size) | ||||||||||||||||
self.artifacts = artifacts # assume order is enforced in the lister | ||||||||||||||||
self.version_artifact: OrderedDict[str, Dict[str, Any]] | ||||||||||||||||
self.version_artifact = OrderedDict( | ||||||||||||||||
Done Inline ActionsWhat is this for? vlorentz: What is this for? | ||||||||||||||||
Done Inline ActionsHum. Interesting, thanks for pointing that out. I thought it was somehow used by the loading mechanism to register its state, but on a second look it's probably just a carryover from the example I took in the archive loader. Removed. borisbaldassari: Hum. Interesting, thanks for pointing that out.
I thought it was somehow used by the loading… | ||||||||||||||||
{str(jar["version"]): jar for jar in artifacts if jar["version"]} | ||||||||||||||||
Done Inline Actions
to stay consistent with the other suggestion. ardumont: to stay consistent with the other suggestion. | ||||||||||||||||
) | ||||||||||||||||
def get_versions(self) -> Sequence[str]: | ||||||||||||||||
Done Inline Actions
Assuming my early assumption about 1 artifact per version is correct, mention it before in the constructor maybe. ardumont: Assuming my early assumption about 1 artifact per version is correct, mention it before in the… | ||||||||||||||||
return list(self.version_artifact.keys()) | ||||||||||||||||
def get_default_version(self) -> str: | ||||||||||||||||
# Default version is the last item | ||||||||||||||||
return self.artifacts[-1]["version"] | ||||||||||||||||
Done Inline ActionsYou can remove this, it's not used vlorentz: You can remove this, it's not used | ||||||||||||||||
def get_metadata_authority(self): | ||||||||||||||||
p_url = urlparse(self.url) | ||||||||||||||||
return MetadataAuthority( | ||||||||||||||||
type=MetadataAuthorityType.FORGE, | ||||||||||||||||
Done Inline ActionsIs the version ever missing from the input? Can we be sure there is at most one package with a missing version per (gid, aid)? vlorentz: Is the version ever missing from the input? Can we be sure there is at most one package with a… | ||||||||||||||||
Done Inline ActionsNo, the lister is expected to always provide the version (alongside the gid and aid). If you're wondering why having this loop, it's because it's more elegant than artefact[0] and it feels more safe. But I can change it. (Note that there is only one artefact in the artifact list, as identified by the url.) borisbaldassari: No, the lister is expected to //always// provide the version (alongside the gid and aid). If… | ||||||||||||||||
Done Inline ActionsI don't think this is addressed, there is still this code: self.version_artifact = OrderedDict( {str(jar.get("version")): jar for jar in artifacts if jar.get("version")} ) vlorentz: I don't think this is addressed, there is still this code:
```
self.version_artifact =… | ||||||||||||||||
Done Inline ActionsI don't know. I'm not sure to understand what you mean or expect. What I know:
To answer your initial comment, when the loader is invoked there is indeed at most one package (whatever the gid, aid, version). Is there a specific case/situation you have in mind? borisbaldassari: I don't know. I'm not sure to understand what you mean or expect.
What I know:
* This loader… | ||||||||||||||||
Done Inline Actionsping? vlorentz: ping? | ||||||||||||||||
url=f"{p_url.scheme}://{p_url.netloc}/", | ||||||||||||||||
metadata={}, | ||||||||||||||||
) | ||||||||||||||||
def build_extrinsic_directory_metadata( | ||||||||||||||||
Done Inline Actions
That smells funny to me. You will always load the metadata from the first artifact here. You need to have an inverted dict from version to artifact initialized before this call. ardumont: That smells funny to me. You will always load the metadata from the first artifact here.
You… | ||||||||||||||||
Done Inline ActionsThanks for the whole proposed change. Yes, I agree it's much more clean this way. borisbaldassari: Thanks for the whole proposed change. Yes, I agree it's much more clean this way. | ||||||||||||||||
Done Inline Actionspush that next to the parsing step? ardumont: push that next to the parsing step?
that may even need to be part of the lister? | ||||||||||||||||
Done Inline ActionsMoved that next to the parsing step. I don't think it should be put in the lister, as what is yielded by the lister is already a UTC time. This line simply sets the loader local var as a UTC time to be interpreted correctly. borisbaldassari: Moved that next to the parsing step.
I don't think it should be put in the lister, as what is… | ||||||||||||||||
self, p_info: MavenPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, | ||||||||||||||||
) -> List[RawExtrinsicMetadata]: | ||||||||||||||||
if not p_info.directory_extrinsic_metadata: | ||||||||||||||||
Done Inline Actionsdead code according to the type of p_info.time vlorentz: dead code according to the type of `p_info.time` | ||||||||||||||||
# If this package loader doesn't write metadata, no need to require | ||||||||||||||||
# an implementation for get_metadata_authority. | ||||||||||||||||
return [] | ||||||||||||||||
Done Inline ActionsI believe that it is now the else conditional which is the dead code [1] [1] regarding the latest version of the lister (which does the timestamp conversion and parsing). ardumont: I believe that it is now the else conditional which is the dead code [1]
[1] regarding the… | ||||||||||||||||
Done Inline ActionsHum. No, I don't think so. The last_update parameter has been added as a datetime for the scheduler, but the time parameter passed to the loader is still an int. Am I missing something? borisbaldassari: Hum. No, I don't think so. The last_update parameter has been added as a datetime for the… | ||||||||||||||||
# Get artifacts | ||||||||||||||||
dir_ext_metadata = p_info.directory_extrinsic_metadata[0] | ||||||||||||||||
a_metadata = json.loads(dir_ext_metadata.metadata) | ||||||||||||||||
aid = a_metadata["aid"] | ||||||||||||||||
version = a_metadata["version"] | ||||||||||||||||
# Rebuild POM URL. | ||||||||||||||||
pom_url = path.dirname(p_info.url) | ||||||||||||||||
pom_url = f"{pom_url}/{aid}-{version}.pom" | ||||||||||||||||
r = requests.get(pom_url, allow_redirects=True) | ||||||||||||||||
if r.status_code == 200: | ||||||||||||||||
metadata_pom = r.content | ||||||||||||||||
else: | ||||||||||||||||
metadata_pom = b"" | ||||||||||||||||
return super().build_extrinsic_directory_metadata( | ||||||||||||||||
attr.evolve( | ||||||||||||||||
p_info, | ||||||||||||||||
directory_extrinsic_metadata=[ | ||||||||||||||||
RawExtrinsicMetadataCore( | ||||||||||||||||
format="maven-pom", metadata=metadata_pom, | ||||||||||||||||
Done Inline Actions
vlorentz: | ||||||||||||||||
), | ||||||||||||||||
dir_ext_metadata, | ||||||||||||||||
], | ||||||||||||||||
), | ||||||||||||||||
release_id=release_id, | ||||||||||||||||
directory_id=directory_id, | ||||||||||||||||
) | ||||||||||||||||
Done Inline ActionsThis should be the empty Person, as documented vlorentz: This should be the empty Person, as documented | ||||||||||||||||
Done Inline ActionsI've updated the documentation to use "SWH robot", for consistency with the deposit and archive loaders (which I've used intensively as examples). borisbaldassari: I've updated the documentation to use "SWH robot", for consistency with the deposit and archive… | ||||||||||||||||
Done Inline Actionsping? vlorentz: ping? | ||||||||||||||||
Done Inline ActionsAs mentioned I've updated the package-loader-specifications.rst file to be consistent with other loaders. It was a mistake from me to create the maven entry with an empty person at first. borisbaldassari: As mentioned I've updated the package-loader-specifications.rst file to be consistent with… | ||||||||||||||||
def get_package_info(self, version: str) -> Iterator[Tuple[str, MavenPackageInfo]]: | ||||||||||||||||
a_metadata = self.version_artifact[version] | ||||||||||||||||
yield release_name(a_metadata["version"]), MavenPackageInfo.from_metadata( | ||||||||||||||||
a_metadata | ||||||||||||||||
) | ||||||||||||||||
def build_release( | ||||||||||||||||
self, p_info: MavenPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||||||||||||||
) -> Optional[Release]: | ||||||||||||||||
msg = f"Synthetic release for archive at {p_info.url}\n".encode("utf-8") | ||||||||||||||||
# time is an iso8601 date | ||||||||||||||||
normalized_time = TimestampWithTimezone.from_datetime(p_info.time) | ||||||||||||||||
return Release( | ||||||||||||||||
name=p_info.version.encode(), | ||||||||||||||||
message=msg, | ||||||||||||||||
date=normalized_time, | ||||||||||||||||
author=EMPTY_AUTHOR, | ||||||||||||||||
target=directory, | ||||||||||||||||
target_type=ObjectType.DIRECTORY, | ||||||||||||||||
synthetic=True, | ||||||||||||||||
) | ||||||||||||||||
def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: | ||||||||||||||||
last_snapshot = self.last_snapshot() | ||||||||||||||||
return last_snapshot.to_dict()["branches"] if last_snapshot else {} |
According to your last version, this is now a iso8601 string.
You could even make that a datetime now providing you adapt the from_metadata call to actually parse the date string as a datetime.
And then slightly adapt the loader to "just use" that time (instead of parsing it)