swh/lister/crates/lister.py
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from dataclasses import asdict, dataclass
import datetime
import io
import json
import logging
from pathlib import Path
import shutil
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse

from dulwich import porcelain
from dulwich.patch import write_tree_diff
from dulwich.repo import Repo

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
CratesListerPage = List[Dict[str, Any]]


@dataclass
class CratesListerState:
    """Store lister state for incremental mode operations.

    'last_commit' represents a git commit hash
    """

    last_commit: str = ""


class CratesLister(Lister[CratesListerState, CratesListerPage]):
"""List origins from the "crates.io" forge. | """List origins from the "crates.io" forge. | |||||||||||||||
It basically fetches https://github.com/rust-lang/crates.io-index.git to a | It basically fetches https://github.com/rust-lang/crates.io-index.git to a | |||||||||||||||
temp directory and then walks through each file to get the crate's info. | temp directory and then walks through each file to get the crate's info on | |||||||||||||||
the first run. | ||||||||||||||||
In incremental mode, it relies on the same Git repository but instead of reading | ||||||||||||||||
each file of the repo, it get the differences through ``git log last_commit..HEAD``. | ||||||||||||||||
vlorentzUnsubmitted Done Inline Actions
vlorentz: | ||||||||||||||||
Resulting output string is parsed to build page entries. | ||||||||||||||||
""" | """ | |||||||||||||||

    # Part of the lister API, that identifies this lister
    LISTER_NAME = "crates"
    # (Optional) VCS type of the origins listed by this lister, if constant
    VISIT_TYPE = "crates"

    INSTANCE = "crates"
Show All 11 Lines
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=self.INDEX_REPOSITORY_URL,
            instance=self.INSTANCE,
        )

    def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState:
vlorentz: this is equivalent
if "last_commit" not in d: | ||||||||||||||||
d["last_commit"] = "" | ||||||||||||||||
return CratesListerState(**d) | ||||||||||||||||
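(The exact snippet vlorentz refers to is not shown in this view; presumably the guard above is redundant because the CratesListerState dataclass already defaults last_commit to "". A minimal sketch of that assumed equivalent method, for illustration only:)

    # def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState:
    #     # A missing "last_commit" key falls back to the dataclass default "".
    #     return CratesListerState(**d)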

    def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]:
        return asdict(state)

    def get_index_repository(self) -> None:
        """Get the crates.io-index repository up to date by cloning or
        pulling it."""
        if self.DESTINATION_PATH.exists():
            porcelain.pull(
                self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL
            )
        else:
            porcelain.clone(
                source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH
            )
vlorentz: Could you swap the `if` and `else` blocks? No big deal, but I prefer to avoid `not` in conditionals when there is an `else` clause.
franckbret: Sure

    def get_crates_index(self) -> List[Path]:
        """Build a sorted list of file paths excluding dotted directories and
        dotted files.

        Each file path corresponds to a crate that lists all available
        versions.
        """
        crates_index = sorted(
            path
            for path in self.DESTINATION_PATH.rglob("*")
            if not any(part.startswith(".") for part in path.parts)
            and path.is_file()
            and path != self.DESTINATION_PATH / "config.json"
        )

        return crates_index

    def get_last_commit_hash(self, repository_path: Path) -> str:
        """Returns the last commit hash of a git repository"""
        assert repository_path.exists()

        repo = Repo(str(repository_path))
        head = repo.head()
        last_commit = repo[head]

        return last_commit.id.decode()
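(A descriptive note added for clarity, based on dulwich's behaviour:)

    # Repo.head() returns the hex SHA of the commit HEAD points to, as bytes;
    # repo[head] resolves it to the Commit object, and last_commit.id.decode()
    # turns the hex SHA back into a str suitable for the lister state.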

    def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]:
        """Given a file path within a Git repository, returns the UTC datetime
        of its last commit.
        """
        repo = Repo(str(self.DESTINATION_PATH))
        # compute the path relative to the repository root, otherwise the
        # walker cannot match it
        relative_path = filepath.relative_to(self.DESTINATION_PATH)
        walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1)
        try:
            commit = next(iter(walker)).commit
        except StopIteration:
vlorentz: you need to sanitize or escape filepath
            logger.error(
                "Can not find %s related commits in repository %s"
                % (relative_path, repo)
            )
ardumont: let logging instructions do the formatting.
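(A minimal sketch of the lazy %-style form the comment suggests, so the message is only interpolated when the record is actually emitted:)

            logger.error(
                "Can not find %s related commits in repository %s",
                relative_path,
                repo,
            )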
            return None
        else:
            last_update = datetime.datetime.fromtimestamp(
                commit.author_time, datetime.timezone.utc
            )
            return last_update

    def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Transform package version definition dict to a suitable
        page entry dict
        """
        return dict(
            name=entry["name"],
            version=entry["vers"],
vlorentz: looks like dead code
checksum=entry["cksum"], | ||||||||||||||||
yanked=entry["yanked"], | ||||||||||||||||
crate_file=self.CRATE_FILE_URL_PATTERN.format( | ||||||||||||||||
crate=entry["name"], version=entry["vers"] | ||||||||||||||||
), | ||||||||||||||||
) | ||||||||||||||||
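(Illustration only, not part of the changeset; the input line is hypothetical and the resulting crate_file value depends on CRATE_FILE_URL_PATTERN, which sits in the folded lines above:)

    # A crates.io index file holds one JSON document per published version,
    # roughly:
    #   {"name": "rand", "vers": "0.8.5", "cksum": "<sha256>", "yanked": false, ...}
    # which page_entry_dict would map to:
    #   {"name": "rand", "version": "0.8.5", "checksum": "<sha256>",
    #    "yanked": False, "crate_file": "<CRATE_FILE_URL_PATTERN formatted>"}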

    def get_pages(self) -> Iterator[CratesListerPage]:
        """Yield pages of crate versions, sorted by crate name in ascending
        order.

        Each page is a list of crate versions with:
            - name: Name of the crate
            - version: Version
            - checksum: Checksum
            - crate_file: Url of the crate file
            - last_update: Date of the last commit of the corresponding index
              file
        """
        # Fetch crates.io index repository
        self.get_index_repository()

        if not self.state.last_commit:
            # First discovery
            # List all crate files from the index repository
            crates_index = self.get_crates_index()
        else:
            # Incremental case
            # Get new package versions by parsing a range of commits from the
            # index repository
            repo = Repo(str(self.DESTINATION_PATH))
            head = repo[repo.head()]
            last = repo[self.state.last_commit.encode()]

            outstream = io.BytesIO()
            write_tree_diff(outstream, repo.object_store, last.tree, head.tree)
            raw_diff = outstream.getvalue()

            crates_index = []
            for line in raw_diff.splitlines():
                if line.startswith(b"+++ b/"):
                    filepath = line.split(b"+++ b/", 1)[1]
                    crates_index.append(self.DESTINATION_PATH / filepath.decode())
            crates_index = sorted(crates_index)
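(A descriptive note added for clarity; the sample path is illustrative:)

            # write_tree_diff() produces unified-diff bytes, with headers such
            # as b"--- a/ra/nd/rand" / b"+++ b/ra/nd/rand", so collecting the
            # b"+++ b/" lines above yields the index files that changed between
            # last_commit and HEAD.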
logger.debug("Found %s crates in crates_index", len(crates_index)) | ||||||||||||||||
# Each line of a crate file is a json entry describing released versions | ||||||||||||||||
# for a package | ||||||||||||||||
for crate in crates_index: | for crate in crates_index: | |||||||||||||||
page = [] | page = [] | |||||||||||||||
vlorentz: just to be safe
            last_update = self.get_last_update_by_file(crate)
vlorentz: I don't think you actually need to decode paths here. Can you try this patch? Removing conversions avoids bugs if crates.io ever adds non-UTF8 filenames. (Don't bother adding a test for this)
with crate.open("rb") as current_file: | with crate.open("rb") as current_file: | |||||||||||||||
for line in current_file: | for line in current_file: | |||||||||||||||
data = json.loads(line) | data = json.loads(line) | |||||||||||||||
# pick only the data we need | entry = self.page_entry_dict(data) | |||||||||||||||
page.append( | entry["last_update"] = last_update | |||||||||||||||
dict( | page.append(entry) | |||||||||||||||
name=data["name"], | ||||||||||||||||
version=data["vers"], | ||||||||||||||||
checksum=data["cksum"], | ||||||||||||||||
crate_file=self.CRATE_FILE_URL_PATTERN.format( | ||||||||||||||||
crate=data["name"], version=data["vers"] | ||||||||||||||||
), | ||||||||||||||||
last_update=last_update, | ||||||||||||||||
) | ||||||||||||||||
) | ||||||||||||||||
            yield page

    def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
        """Iterate on all crate pages and yield ListedOrigin instances."""
        assert self.lister_obj.id is not None

        url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"])
        last_update = page[0]["last_update"]

        artifacts = []
        metadata = []

        for version in page:
            filename = urlparse(version["crate_file"]).path.split("/")[-1]
            # Build an artifact entry following original-artifacts-json specification
            # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950
            artifact = {
                "filename": f"{filename}",
                "checksums": {
                    "sha256": f"{version['checksum']}",
                },
                "url": version["crate_file"],
                "version": version["version"],
            }
            artifacts.append(artifact)
data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} | ||||||||||||||||
metadata.append(data) | ||||||||||||||||
ardumont: I'm not sure whether you actually want to wrap version['version'] into a string or not (I assumed it was a string already). (Also, feel free to dismiss this comment if that does not make any sense to you ;)
franckbret (author): For now let's leave it as is. It's the same flat structure as artifacts, and the loader does something similar when it receives the artifacts data. Regarding crates_metadata and the yanked value, for now it's not used in the loader for anything other than being informative.

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=url,
            last_update=last_update,
            extra_loader_arguments={
                "artifacts": artifacts,
                "metadata": metadata,
            },
        )

    def finalize(self) -> None:
        last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH)
        if self.state.last_commit == last:
            self.updated = False
vlorentz: where is this used?
vlorentz: ah, it's used in the base class, nvm. You should check that the value of `self.state.last_commit` changed, though.
        else:
            self.state.last_commit = last
            self.updated = True

        logger.debug("Listing crates origin completed with last commit id %s", last)
vlorentz: add a test for this (both when successful and failed)
franckbret: I added a test to ensure it cleans up successfully after the lister runs, but I can't find a way to make it fail with the current code. One way could be to make the get_index_repository method raise if the directory already exists, but that looks blocking and useless, unless you have a better idea?
vlorentz: nah, it's fine

        # Cleanup by removing the repository directory
        if self.DESTINATION_PATH.exists():
vlorentz: not needed to test this either; shutil.rmtree raises an exception when it fails
            shutil.rmtree(self.DESTINATION_PATH)
            logger.debug(
                "Successfully removed %s directory", str(self.DESTINATION_PATH)
            )