Changeset View
Standalone View
swh/loader/package/crates/loader.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from distutils.version import StrictVersion | from datetime import datetime | ||||
import json | import json | ||||
from pathlib import Path | from pathlib import Path | ||||
import string | |||||
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple | from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple | ||||
from urllib.parse import urlparse | from urllib.parse import urlparse | ||||
import attr | import attr | ||||
from packaging.version import parse as parse_version | |||||
import toml | import toml | ||||
from typing_extensions import TypedDict | |||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import ( | ||||
from swh.loader.package.utils import cached_method, get_url_body, release_name | BasePackageInfo, | ||||
from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone | PackageLoader, | ||||
RawExtrinsicMetadataCore, | |||||
) | |||||
from swh.loader.package.utils import EMPTY_AUTHOR, release_name | |||||
from swh.model.model import ( | |||||
MetadataAuthority, | |||||
MetadataAuthorityType, | |||||
ObjectType, | |||||
Person, | |||||
Release, | |||||
Sha1Git, | |||||
TimestampWithTimezone, | |||||
) | |||||
from swh.storage.interface import StorageInterface | from swh.storage.interface import StorageInterface | ||||
class ExtrinsicPackageMetadata(TypedDict): | |||||
"""Data structure for package extrinsic metadata pulled from http api endpoint. | |||||
We set only the keys we need according to what is available when querying | |||||
https://crates.io/api/v1/crates/<name>, where `name` is the name of the crate | |||||
package (see JSON response example at https://crates.io/api/v1/crates/hg-core). | |||||
Usage example: | |||||
.. code-block:: python | |||||
e_metadata = ExtrinsicPackageMetadata(**self.info()) | |||||
""" # noqa | |||||
categories: List[Dict[Any, Any]] | |||||
"""Related categories""" | |||||
crate: Dict[Any, Any] | |||||
"""Crate project information""" | |||||
keywords: List[Any] | |||||
"""Keywords""" | |||||
versions: List[Dict[Any, Any]] | |||||
"""A list of released versions for a crate""" | |||||
class ExtrinsicVersionPackageMetadata(TypedDict): | |||||
"""Data structure for specific package version extrinsic metadata, pulled | |||||
from http api endpoint. | |||||
Similar to `ExtrinsicPackageMetadata` in its usage, but we flatten the data | |||||
related to a specific version. | |||||
""" | |||||
crate: str | |||||
"""The package name""" | |||||
crate_size: int | |||||
"""The package size""" | |||||
created_at: str | |||||
"""First released at""" | |||||
downloads: str | |||||
"""Number of downloads""" | |||||
license: str | |||||
"""Package license""" | |||||
num: str | |||||
"""Package version""" | |||||
published_by: Dict[Any, Any] | |||||
"""Publishers information""" | |||||
updated_at: str | |||||
"""Last update""" | |||||
yanked: bool | |||||
"""Is that version yanked? (yanked means release-level deprecation)""" | |||||
class IntrinsicPackageMetadata(TypedDict): | |||||
"""Data structure for specific package version intrinsic metadata. | |||||
Data is extracted from the crate package's .toml file. Then the data of the | |||||
'package' entry is flattened. | |||||
Cargo.toml file content example: | |||||
.. code-block:: toml | |||||
[package] | |||||
name = "hg-core" | |||||
version = "0.0.1" | |||||
authors = ["Georges Racinet <georges.racinet@octobus.net>"] | |||||
description = "Mercurial pure Rust core library, with no assumption on | |||||
Python bindings (FFI)" | |||||
homepage = "https://mercurial-scm.org" | |||||
license = "GPL-2.0-or-later" | |||||
repository = "https://www.mercurial-scm.org/repo/hg" | |||||
[lib] | |||||
name = "hg" | |||||
[dev-dependencies.rand] | |||||
version = "~0.6" | |||||
[dev-dependencies.rand_pcg] | |||||
version = "~0.1" | |||||
:param toml: toml object | |||||
""" | |||||
name: str | |||||
"""The package name""" | |||||
version: str | |||||
"""Package version""" | |||||
authors: List[str] | |||||
"""Authors""" | |||||
description: str | |||||
"""Package and release description""" | |||||
homepage: str | |||||
"""Homepage of the project""" | |||||
license: str | |||||
"""Package license""" | |||||
repository: str | |||||
"""Source code repository""" | |||||
@attr.s | @attr.s | ||||
class CratesPackageInfo(BasePackageInfo): | class CratesPackageInfo(BasePackageInfo): | ||||
name = attr.ib(type=str) | name = attr.ib(type=str) | ||||
"""Name of the package""" | """Name of the package""" | ||||
version = attr.ib(type=str) | version = attr.ib(type=str) | ||||
"""Current version""" | """Current version""" | ||||
e_metadata: Dict[str, Any] = attr.ib(factory=ExtrinsicPackageMetadata) | sha256 = attr.ib(type=str) | ||||
"""Extrinsic package metadata, common to all versions""" | """Extid as sha256""" | ||||
e_metadata_version: Dict[str, Any] = attr.ib( | last_update = attr.ib(type=datetime) | ||||
factory=ExtrinsicVersionPackageMetadata | """Last update as release date""" | ||||
) | |||||
"""Extrinsic package metadata specific to a version""" | |||||
i_metadata: Dict[str, Any] = attr.ib(factory=IntrinsicPackageMetadata) | yanked = attr.ib(type=bool) | ||||
"""Intrinsic metadata of the current package version""" | """Whether the package is yanked or not""" | ||||
MANIFEST_FORMAT = string.Template( | |||||
"name $name\nshasum $sha256\nurl $url\nversion $version\nlast_update $last_update" | |||||
vlorentz: It's missing the date.
(All fields used to build release objects should be covered by this… | |||||
) | |||||
EXTID_TYPE = "crates-manifest-sha256" | |||||
EXTID_VERSION = 0 | |||||
Not Done Inline ActionsAre all attributes of CratesPackageInfo guaranteed to be generated deterministically from content covered by this hash? (See D8173 for details) vlorentz: Are all attributes of `CratesPackageInfo` guaranteed to be generated deterministically from… | |||||
Done Inline Actions No. There is also some extrinsic data retrieved from the API; I guess it is not related to the hash. I also don't see how I can ensure that CratesPackageInfo attributes are generated deterministically from content covered by this hash, because when CratesPackageInfo is instantiated we have not downloaded the archive yet. Can you please tell me what would be the best and quickest way to proceed? given by 'artifacts' from intrinsic metadata except extrinsic metadata that franckbret: No.
The CratesPackageInfo attributes are mainly generated depending on artifact data… | |||||
Done Inline Actions
Release objects match git tags; their date is the date of the upstream object (so the date it was published, here).
But you have a hash of that archive, right? Therefore, the extid should be hash(hash(tarball) + field1 + field2 + ...) where field1, field2, ... are all the fields not computed from the tarball but included in the Release object. So in theory, you would need to fetch this publication date from the API every time ; but I think we can make an exception here, because it would be rather costly to run this many API calls just for this bit of data (that is very unlikely to change) AND because excluding the revision date from the extid does not bring any risk of confusing Release objects between packages.
Switch to the "manifest" mechanism, and use something like this as manifest: "$name $version $sha256 $instance_url". I am including $instance_url so that two packages with the same name + version + tarball hash uploaded to different instances of Crates.io at different times do not end up with the same Release object (and they shouldn't because Release.date differs) vlorentz: > I'm still not sure of what that date is supposed to represents (the real date a release has… | |||||
Done Inline Actions Thanks for your answers, I'm going the manifest way and will ping you when I have something. franckbret: Thanks for your answers, I'm going the manifest way and will ping you when I have something. | |||||
Done Inline Actions Please use a different type name for the extid, so it does not conflict with other manifest formats, e.g. crates-manifest-sha256 to be consistent with the NPM loader. Actually, now that I think of it, it could use exactly the same manifest format as NPM: swh/loader/package/npm/loader.py: MANIFEST_FORMAT = string.Template( swh/loader/package/npm/loader.py- "date $date\nname $package_name\nshasum $shasum\nurl $url\nversion $version" swh/loader/package/npm/loader.py- ) (but keep a different type name; it would be confusing to call this npm-manifest-sha256) vlorentz: please use a different type name for the extid, so it does not conflict with other manifest… | |||||
def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: | def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: | ||||
"""Extract intrinsic metadata from Cargo.toml file at dir_path. | """Extract intrinsic metadata from Cargo.toml file at dir_path. | ||||
Each crate archive has a Cargo.toml at the root of the archive. | Each crate archive has a Cargo.toml at the root of the archive. | ||||
Args: | Args: | ||||
dir_path: A directory on disk where a Cargo.toml must be present | dir_path: A directory on disk where a Cargo.toml must be present | ||||
Returns: | Returns: | ||||
A dict mapping from toml parser | A dict mapping from toml parser | ||||
""" | """ | ||||
return toml.load(dir_path / "Cargo.toml") | return toml.load(dir_path / "Cargo.toml") | ||||
def extract_author(p_info: CratesPackageInfo) -> Person: | |||||
"""Extract package author from intrinsic metadata and return it as a | |||||
`Person` model. | |||||
Args: | |||||
p_info: CratesPackageInfo that should contains i_metadata entries | |||||
Returns: | |||||
Only one author (Person) of the package. Currently limited by internal detail | |||||
of the swh stack (see T3887). | |||||
""" | |||||
authors = p_info.i_metadata["authors"] | |||||
fullname = authors[0] # TODO: here we have a list of author, see T3887 | |||||
return Person.from_fullname(fullname.encode()) | |||||
def extract_description(p_info: CratesPackageInfo) -> str: | |||||
"""Extract package description from intrinsic metadata and return it as a | |||||
string. | |||||
Args: | |||||
p_info: CratesPackageInfo that should contains i_metadata and entries | |||||
Returns: | |||||
Package description from metadata. | |||||
""" | |||||
return p_info.i_metadata["description"] | |||||
class CratesLoader(PackageLoader[CratesPackageInfo]): | class CratesLoader(PackageLoader[CratesPackageInfo]): | ||||
"""Load Crates package origins into swh archive.""" | """Load Crates package origins into swh archive.""" | ||||
visit_type = "crates" | visit_type = "crates" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
url: str, | url: str, | ||||
artifacts: List[Dict[str, Any]], | artifacts: List[Dict[str, Any]], | ||||
crates_metadata: List[Dict[str, Any]], | |||||
**kwargs, | **kwargs, | ||||
Not Done Inline ActionsTo align with lister output: artifacts: Dict[str, Dict[str, Any]], crates_metadata: Dict[str, Dict[str, Any]], anlambert: To align with lister output:
```lang=python
artifacts: Dict[str, Dict[str, Any]]… | |||||
Not Done Inline ActionsIgnore this comment, I was not aware that we should use this format anlambert: Ignore this comment, I was not aware that we should use this [format](https://docs. | |||||
): | ): | ||||
"""Constructor | """Constructor | ||||
Args: | Args: | ||||
url: | url: | ||||
Origin url, (e.g. https://crates.io/api/v1/crates/<package_name>) | Origin url, (e.g. https://crates.io/api/v1/crates/<package_name>) | ||||
artifacts: | artifacts: | ||||
A list of dict listing all existing released versions for a | A list of dict listing all existing released versions for a | ||||
package (Usually set with crates lister `extra_loader_arguments`). | package (Usually set with crates lister `extra_loader_arguments`). | ||||
Each line is a dict that should have an `url` | Each line is a dict that should have an `url` | ||||
(where to download package specific version) and a `version` entry. | (where to download package specific version), a `version`, a | ||||
`filename` and a `checksums['sha256']` entry. | |||||
Example:: | Example:: | ||||
[ | [ | ||||
{ | { | ||||
"version": <version>, | "version": <version>, | ||||
"url": "https://static.crates.io/crates/<package_name>/<package_name>-<version>.crate", | "url": "https://static.crates.io/crates/<package_name>/<package_name>-<version>.crate", | ||||
"filename": "<package_name>-<version>.crate", | |||||
"checksums": { | |||||
"sha256": "<sha256>", | |||||
}, | |||||
} | } | ||||
] | ] | ||||
crates_metadata: | |||||
Same as previously but for Crates metadata. | |||||
For now it only has one boolean key `yanked`. | |||||
Example:: | |||||
[ | |||||
{ | |||||
"version": "<version>", | |||||
"yanked": <yanked>, | |||||
}, | |||||
] | |||||
""" # noqa | """ # noqa | ||||
super().__init__(storage=storage, url=url, **kwargs) | super().__init__(storage=storage, url=url, **kwargs) | ||||
self.url = url | self.url = url | ||||
self.artifacts: Dict[str, Dict] = { | self.artifacts: Dict[str, Dict] = { | ||||
artifact["version"]: artifact for artifact in artifacts | artifact["version"]: artifact for artifact in artifacts | ||||
} | } | ||||
self.crates_metadata: Dict[str, Dict] = { | |||||
@cached_method | data["version"]: data for data in crates_metadata | ||||
def _raw_info(self) -> bytes: | } | ||||
"""Get crate metadata (fetched from http api endpoint set as self.url) | |||||
Returns: | |||||
Content response as bytes. Content response is a json document. | |||||
""" | |||||
return get_url_body(self.url) | |||||
@cached_method | |||||
def info(self) -> Dict: | |||||
"""Parse http api json response and return the crate metadata information | |||||
as a Dict.""" | |||||
return json.loads(self._raw_info()) | |||||
def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | ||||
"""Get all released versions of a crate | """Get all released versions of a crate | ||||
Returns: | Returns: | ||||
A sequence of versions | A sequence of versions | ||||
Not Done Inline ActionsTo align with lister output: self.artifacts = artifacts self.crates_metadata = crates_metadata anlambert: To align with lister output:
```lang=python
self.artifacts = artifacts
self.crates_metadata =… | |||||
Not Done Inline ActionsIgnore this comment, I was not aware that we should use this format Use this instead to ensure all versions get loaded: self.artifacts: Dict[str, Dict] = { artifact["version"]: artifact for artifact in artifacts } self.crates_metadata: Dict[str, Dict] = { metadata["version"]: metadata for metadata in crates_metadata } anlambert: Ignore this comment, I was not aware that we should use this [format](https://docs. | |||||
Example:: | Example:: | ||||
["0.1.1", "0.10.2"] | ["0.1.1", "0.10.2"] | ||||
""" | """ | ||||
versions = list(self.artifacts.keys()) | versions = list(self.artifacts.keys()) | ||||
versions.sort(key=StrictVersion) | versions.sort(key=parse_version) | ||||
return versions | return versions | ||||
def get_default_version(self) -> str: | def get_default_version(self) -> str: | ||||
"""Get the newest release version of a crate | """Get the newest release version of a crate | ||||
Returns: | Returns: | ||||
A string representing a version | A string representing a version | ||||
Example:: | Example:: | ||||
"0.1.2" | "0.1.2" | ||||
""" | """ | ||||
return self.get_versions()[-1] | return self.get_versions()[-1] | ||||
def get_metadata_authority(self): | |||||
return MetadataAuthority( | |||||
type=MetadataAuthorityType.FORGE, | |||||
url="https://crates.io/", | |||||
) | |||||
def get_package_info(self, version: str) -> Iterator[Tuple[str, CratesPackageInfo]]: | def get_package_info(self, version: str) -> Iterator[Tuple[str, CratesPackageInfo]]: | ||||
"""Get release name and package information from version | """Get release name and package information from version | ||||
Args: | Args: | ||||
version: crate version (e.g: "0.1.0") | version: crate version (e.g: "0.1.0") | ||||
Returns: | Returns: | ||||
Iterator of tuple (release_name, p_info) | Iterator of tuple (release_name, p_info) | ||||
""" | """ | ||||
artifact = self.artifacts[version] | artifact = self.artifacts[version].copy() | ||||
filename = artifact["filename"] | filename = artifact["filename"] | ||||
assert artifact["checksums"]["sha256"] | |||||
sha256 = artifact["checksums"]["sha256"] | |||||
Done Inline Actionsthe first assertion is redundant; artifact["checksums"]["sha256"] would raise KeyError anyway vlorentz: the first assertion is redundant; `artifact["checksums"]["sha256"]` would raise `KeyError`… | |||||
package_name = urlparse(self.url).path.split("/")[-1] | package_name = urlparse(self.url).path.split("/")[-1] | ||||
url = artifact["url"] | url = artifact["url"] | ||||
# Get extrinsic metadata from http api | crate_metadata = self.crates_metadata[version].copy() | ||||
e_metadata = ExtrinsicPackageMetadata(**self.info()) # type: ignore[misc] | yanked = crate_metadata["yanked"] | ||||
last_update = datetime.fromisoformat(crate_metadata["last_update"]) | |||||
# Extract crate info for current version (One .crate file for a given version) | |||||
(crate_version,) = [ | # Remove "version" from artifact to follow "original-artifacts-json" extrinsic | ||||
crate for crate in e_metadata["versions"] if crate["num"] == version | # metadata format specifications | ||||
] | # See https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#extrinsic-metadata-formats # noqa: B950 | ||||
e_metadata_version = ExtrinsicVersionPackageMetadata( # type: ignore[misc] | del artifact["version"] | ||||
Done Inline Actionsyou can keep the version in crate metadata, there is no format specification for it. anlambert: you can keep the version in crate metadata, there is no format specification for it. | |||||
**crate_version | |||||
) | |||||
p_info = CratesPackageInfo( | p_info = CratesPackageInfo( | ||||
name=package_name, | name=package_name, | ||||
filename=filename, | filename=filename, | ||||
url=url, | url=url, | ||||
version=version, | version=version, | ||||
e_metadata=e_metadata, | sha256=sha256, | ||||
e_metadata_version=e_metadata_version, | checksums={"sha256": sha256}, | ||||
yanked=yanked, | |||||
last_update=last_update, | |||||
directory_extrinsic_metadata=[ | |||||
RawExtrinsicMetadataCore( | |||||
format="crates-package-json", | |||||
metadata=json.dumps([crate_metadata]).encode(), | |||||
Done Inline Actionssame comment as on the other diffs; original-artifacts-json should already be created by the base package loader vlorentz: same comment as on the other diffs; `original-artifacts-json` should already be created by the… | |||||
), | |||||
], | |||||
Done Inline ActionsUse checksums={"sha256": sha256} instead in order for the loader to check download integrity. anlambert: Use `checksums={"sha256": sha256}` instead in order for the loader to check download integrity. | |||||
Done Inline Actions@franckbret , ping for this important change to handle. anlambert: @franckbret , ping for this important change to handle. | |||||
) | ) | ||||
yield release_name(version, filename), p_info | yield release_name(version, filename), p_info | ||||
def build_release( | def build_release( | ||||
self, p_info: CratesPackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: CratesPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
) -> Optional[Release]: | ) -> Optional[Release]: | ||||
# Extract intrinsic metadata from dir_path/Cargo.toml | # Extract intrinsic metadata from dir_path/Cargo.toml | ||||
name = p_info.name | dir_path = Path(uncompressed_path, f"{p_info.name}-{p_info.version}") | ||||
version = p_info.version | i_metadata = extract_intrinsic_metadata(dir_path) | ||||
dir_path = Path(uncompressed_path, f"{name}-{version}") | |||||
i_metadata_raw = extract_intrinsic_metadata(dir_path) | author = EMPTY_AUTHOR | ||||
# Get only corresponding key of IntrinsicPackageMetadata | authors = i_metadata.get("package", {}).get("authors") | ||||
Done Inline Actionsany(authors) here too vlorentz: `any(authors)` here too | |||||
i_metadata_keys = [k for k in IntrinsicPackageMetadata.__annotations__.keys()] | if authors and isinstance(authors, list): | ||||
# We use data only from "package" entry | # TODO: here we have a list of author, see T3887 | ||||
i_metadata = { | author = Person.from_fullname(authors[0].encode()) | ||||
k: v for k, v in i_metadata_raw["package"].items() if k in i_metadata_keys | |||||
} | |||||
p_info.i_metadata = IntrinsicPackageMetadata(**i_metadata) # type: ignore[misc] | |||||
author = extract_author(p_info) | |||||
description = extract_description(p_info) | |||||
message = ( | message = ( | ||||
f"Synthetic release for Crate source package {p_info.name} " | f"Synthetic release for Crate source package {p_info.name} " | ||||
f"version {p_info.version}\n\n" | f"version {p_info.version}\n" | ||||
Done Inline ActionsBased on @vlorentz remarks on irc, we should remove the description in the release message as it is related to the crate package and not that particular release. anlambert: Based on @vlorentz remarks on irc, we should remove the description in the release message as… | |||||
f"{description}\n" | |||||
) | ) | ||||
# The only way to get a value for updated_at is through extrinsic metadata | |||||
updated_at = p_info.e_metadata_version.get("updated_at") | |||||
return Release( | return Release( | ||||
name=version.encode(), | name=p_info.version.encode(), | ||||
date=TimestampWithTimezone.from_datetime(p_info.last_update), | |||||
author=author, | author=author, | ||||
date=TimestampWithTimezone.from_iso8601(updated_at), | |||||
Not Done Inline Actionser, why did you remove the date? this is valuable information vlorentz: er, why did you remove the date? this is valuable information | |||||
Done Inline ActionsBecause I can not access it since there is no call to the http api anymore. franckbret: Because I can not access it since there is no call to the http api anymore.
What do you… | |||||
Done Inline Actions@vlorentz Considering that the crate file is a tar.gz I may stat the Cargo.toml to get a timestamp that should be quite accurate? franckbret: @vlorentz Considering that the crate file is a tar.gz I may stat the Cargo.toml to get a… | |||||
Done Inline Actions I've investigated, and it's not possible to get an accurate date by stat-ing Cargo.toml inside the archive: for a lot of old packages the file date is 01011970 (the Unix epoch). Do you have any other ideas, or do I have to go back to getting it from an HTTP API call? franckbret: I've investigated, and it's not possible to get an accurate date by stat-ing Cargo.toml inside… | |||||
Not Done Inline ActionsAPI call is fine. But actually, can't you get it from the lister? Would get_last_update_by_file match what the API returns? vlorentz: API call is fine.
But actually, can't you get it from the lister? Would… | |||||
Done Inline Actionsget_last_update_by_file returns the date of the last commit for a package, not for each version. Please note that the git repository we target is squashed every few months. See https://internals.rust-lang.org/t/cargos-crate-index-upcoming-squash-into-one-commit/8440 So I suspected that even when we get_last_update_by_file the results are not consistent. Example with hg-core that have only one version from 2019:
franck@debian-franck:/tmp/crates.io-index/hg/-c$ git log hg-core commit 9ceec3bd05e9d6ca5a70084cc0078a2f324b66af Author: bors <bors@rust-lang.org> Date: Wed Jul 6 02:31:28 2022 +0000 Collapse index into one commit Previous HEAD was 075e7a606882092af5c5bbe4124872745dc4611c, now on the `snapshot-2022-07-06` branch More information about this change can be found [online] and on [this issue]. [online]: https://internals.rust-lang.org/t/cargos-crate-index-upcoming-squash-into-one-commit/8440 [this issue]: https://github.com/rust-lang/crates-io-cargo-teams/issues/47 Now if I do the same on the archive repository https://github.com/rust-lang/crates.io-index-archive: franck@debian-franck:/tmp$ git ls-remote https://github.com/rust-lang/crates.io-index-archive | grep refs/heads/snapshot 9110daee6752e903379f3af955506d6116315273 refs/heads/snapshot-2018-09-26 e669e7256d9d00baea377e9f487c0d086ac78c2c refs/heads/snapshot-2019-10-17 f6bccfc6021a2088cb0e89652b9bfcd105c0c2a0 refs/heads/snapshot-2020-03-25 eb6c4f86a152ee407c7a466327c6a4cbbb92cd7a refs/heads/snapshot-2020-08-04 1b7e17acbb67d41e148ba6dbaf8975f412dc6207 refs/heads/snapshot-2020-11-20 a5dcd8438da2d8f99e3661a1956afbfb8f026fa0 refs/heads/snapshot-2021-05-05 4181c62812c70fafb2b56cbbd66c31056671b445 refs/heads/snapshot-2021-07-02 f954048ea7b374a6261fa751710b73981b292048 refs/heads/snapshot-2021-09-24 94b5429198de77c890839b962228b187f0c25468 refs/heads/snapshot-2021-12-21 ba5efd5ab04919dd77b8a7b8298327c3ce75457e refs/heads/snapshot-2022-03-02 075e7a606882092af5c5bbe4124872745dc4611c refs/heads/snapshot-2022-07-06 Now clone the latest branch (it can takes minutes) franck@debian-franck:/tmp$ git clone -b snapshot-2022-07-06 https://github.com/rust-lang/crates.io-index-archive franck@debian-franck:/tmp/crates.io-index-archive$ git log hg/-c/hg-core commit d511f68fa91e266ba7a20b5f37e7a4801423c289 Author: bors <bors@rust-lang.org> Date: Wed Mar 2 02:43:52 2022 +0000 Collapse index into one commit Previous HEAD was 
ba5efd5ab04919dd77b8a7b8298327c3ce75457e, now on the `snapshot-2022-03-02` branch More information about this change can be found [online] and on [this issue]. [online]: https://internals.rust-lang.org/t/cargos-crate-index-upcoming-squash-into-one-commit/8440 [this issue]: https://github.com/rust-lang/crates-io-cargo-teams/issues/47 Not better. Let's try with first snapshot franck@debian-franck:/tmp/crates.io-index-archive$ git checkout snapshot-2018-09-26 Updating files: 100% (76174/76174), done. Branch 'snapshot-2018-09-26' set up to track remote branch 'snapshot-2018-09-26' from 'origin'. Switched to a new branch 'snapshot-2018-09-26' franck@debian-franck:/tmp/crates.io-index-archive$ git log hg/-c/hg-core fatal: ambiguous argument 'hg/-c/hg-core': unknown revision or path not in the working tree. Use '--' to separate paths from revisions, like this: 'git <command> [<revision>...] -- [<file>...]' Obviously file doesn't exists because its 2018 franck@debian-franck:/tmp/crates.io-index-archive$ git checkout snapshot-2019-10-17 Updating files: 100% (17852/17852), done. Branch 'snapshot-2019-10-17' set up to track remote branch 'snapshot-2019-10-17' from 'origin'. Switched to a new branch 'snapshot-2019-10-17' franck@debian-franck:/tmp/crates.io-index-archive$ git log hg/-c/hg-core commit 57336a33dde6225e0cc201fe7c5715f0351702cb Author: bors <bors@rust-lang.org> Date: Tue Apr 16 18:48:16 2019 +0000 Updating crate `hg-core#0.0.1` Ok here is the first commit, and the date Tue Apr 16 18:48:16 2019 +0000 seems 5 seconds after the one from the api which was 2019-04-16T18:48:11.404457+00:00. I don't know if it is doable to rebuild a full linear git log from all those snapshot from crates.io-index-archives while excluding all of those squashed commits, but it could be a way to get accurate release date for each versions only using git at the lister level. 
Another way I explored is downloading the experimental db dump https://static.crates.io/db-dump.tar.gz which contains two interesting files, crates.csv which list all packages name with a unique id per line and versions.csv which lists all package versions and reference the previous crate_id. The database is dumped every 24 hours. Let's check date for hg-core: franck@debian-franck:~/Téléchargements/2022-08-08-020027/data$ cat crates.csv | grep hg-core 2019-04-16 18:48:11.404457,"Mercurial pure Rust core library, with no assumption on Python bindings (FFI)",,563,https://mercurial-scm.org,128438,,hg-core,,https://www.mercurial-scm.org/repo/hg,2019-04-16 18:48:11.404457 franck@debian-franck:~/Téléchargements/2022-08-08-020027/data$ cat versions.csv | grep 128438 128438,21344,2019-04-16 18:48:11.404457,563,{},145309,GPL-2.0-or-later,0.0.1,45544,2019-04-16 18:48:11.404457,f The date is corresponding and we grab also the package name and version, but we miss the cksum of each crates versions (the dump is not a real iso dump of the database as it excludes some table and or some columns). With that say, the options I see now are: If that problem of squashed commit date is not a problem at all on the lister side :
If we want accurate date for both the lister and the loader:
franckbret: get_last_update_by_file returns the date of the last commit for a package, not for each version. | |||||
Not Done Inline ActionsHmm ok, squashes are going to make it tricky. Using the DB dump is a nice idea, but it would add complexity on our side to manage this kind of large data dump and share it across workers. We would probably add a dedicated worker for this, but this also adds complexity. I think API access is fine. https://crates.io/policies#crawlers says they allow up to 1 request per second; which we are unlikely to hit anyway, given the time it takes to ingest a package. However, we would need some way to ensure we don't exceed it, and I don't see a way to do it without assigning a dedicated worker... I'd like @ardumont's input as he may have some insight; but sadly he is on vacation until the end of the month :/ vlorentz: Hmm ok, squashes are going to make it tricky.
Using the DB dump is a nice idea, but it would… | |||||
Done Inline Actions
When I talk about using the DB dump, I mean on the lister side, not in the loader. The DB dump is about 200 MB and the git repository is about 800 MB. Does the problem you mention also exist for the git repo? I will go back to finalizing arch and aur for now. franckbret: > Using the DB dump is a nice idea, but it would add complexity on our side to manage this kind… | |||||
Not Done Inline Actionsah yes, of course. I guess that would be fine then, as listers are already assigned to specific workers afaik vlorentz: ah yes, of course. I guess that would be fine then, as listers are already assigned to specific… | |||||
message=message.encode(), | message=message.encode(), | ||||
target_type=ObjectType.DIRECTORY, | target_type=ObjectType.DIRECTORY, | ||||
target=directory, | target=directory, | ||||
synthetic=True, | synthetic=True, | ||||
) | ) |
It's missing the date.
(All fields used to build release objects should be covered by this manifest)