Changeset View
Standalone View
swh/lister/crates/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | |||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | |||||||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | |||||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | |||||||||||||||
from dataclasses import asdict, dataclass | ||||||||||||||||
import datetime | ||||||||||||||||
import json | import json | |||||||||||||||
import logging | import logging | |||||||||||||||
from pathlib import Path | from pathlib import Path | |||||||||||||||
import subprocess | import subprocess | |||||||||||||||
from typing import Any, Dict, Iterator, List | from typing import Any, Dict, Iterator, List, Optional | |||||||||||||||
from urllib.parse import urlparse | from urllib.parse import urlparse | |||||||||||||||
import iso8601 | from dulwich import porcelain | |||||||||||||||
from dulwich.repo import Repo | ||||||||||||||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | |||||||||||||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | |||||||||||||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, Lister | |||||||||||||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | |||||||||||||||
# Aliasing the page results returned by `get_pages` method from the lister. | # Aliasing the page results returned by `get_pages` method from the lister. | |||||||||||||||
CratesListerPage = List[Dict[str, Any]] | CratesListerPage = List[Dict[str, Any]] | |||||||||||||||
class CratesLister(StatelessLister[CratesListerPage]): | @dataclass | |||||||||||||||
class CratesListerState: | ||||||||||||||||
"""Store lister state for incremental mode operations. | ||||||||||||||||
'last_commit' represents a git commit hash | ||||||||||||||||
""" | ||||||||||||||||
last_commit: str = "" | ||||||||||||||||
class CratesLister(Lister[CratesListerState, CratesListerPage]): | ||||||||||||||||
"""List origins from the "crates.io" forge. | """List origins from the "crates.io" forge. | |||||||||||||||
It basically fetches https://github.com/rust-lang/crates.io-index.git to a | It basically fetches https://github.com/rust-lang/crates.io-index.git to a | |||||||||||||||
temp directory and then walks through each file to get the crate's info. | temp directory and then walks through each file to get the crate's info on | |||||||||||||||
the first run. | ||||||||||||||||
In incremental mode, it relies on the same Git repository but instead of reading | ||||||||||||||||
each file of the repo, it get the differences through ``git log last_commit..HEAD``. | ||||||||||||||||
vlorentzUnsubmitted Done Inline Actions
vlorentz: | ||||||||||||||||
Resulting output string is parsed to build page entries. | ||||||||||||||||
""" | """ | |||||||||||||||
# Part of the lister API, that identifies this lister | # Part of the lister API, that identifies this lister | |||||||||||||||
LISTER_NAME = "crates" | LISTER_NAME = "crates" | |||||||||||||||
# (Optional) CVS type of the origins listed by this lister, if constant | # (Optional) CVS type of the origins listed by this lister, if constant | |||||||||||||||
VISIT_TYPE = "crates" | VISIT_TYPE = "crates" | |||||||||||||||
INSTANCE = "crates" | INSTANCE = "crates" | |||||||||||||||
Show All 11 Lines | class CratesLister(Lister[CratesListerState, CratesListerPage]): | |||||||||||||||
): | ): | |||||||||||||||
super().__init__( | super().__init__( | |||||||||||||||
scheduler=scheduler, | scheduler=scheduler, | |||||||||||||||
credentials=credentials, | credentials=credentials, | |||||||||||||||
url=self.INDEX_REPOSITORY_URL, | url=self.INDEX_REPOSITORY_URL, | |||||||||||||||
instance=self.INSTANCE, | instance=self.INSTANCE, | |||||||||||||||
) | ) | |||||||||||||||
def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: | ||||||||||||||||
Done Inline Actions
this is equivalent vlorentz: this is equivalent | ||||||||||||||||
if "last_commit" not in d: | ||||||||||||||||
d["last_commit"] = "" | ||||||||||||||||
return CratesListerState(**d) | ||||||||||||||||
def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: | ||||||||||||||||
return asdict(state) | ||||||||||||||||
def get_index_repository(self) -> None: | def get_index_repository(self) -> None: | |||||||||||||||
"""Get crates.io-index repository up to date running git command.""" | """Get crates.io-index repository up to date running git command.""" | |||||||||||||||
if self.DESTINATION_PATH.exists(): | ||||||||||||||||
subprocess.check_call( | porcelain.pull( | |||||||||||||||
[ | self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL | |||||||||||||||
"git", | ) | |||||||||||||||
"clone", | else: | |||||||||||||||
self.INDEX_REPOSITORY_URL, | porcelain.clone( | |||||||||||||||
self.DESTINATION_PATH, | source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH | |||||||||||||||
] | ||||||||||||||||
) | ) | |||||||||||||||
Done Inline ActionsCould you swap the if and else blocks? No big deal, but I prefer to avoid not in conditionals when there is a else clause. vlorentz: Could you swap the `if` and `else` blocks? No big deal, but I prefer to avoid `not` in… | ||||||||||||||||
Done Inline ActionsSure franckbret: Sure | ||||||||||||||||
def get_crates_index(self) -> List[Path]: | def get_crates_index(self) -> List[Path]: | |||||||||||||||
"""Build a sorted list of file paths excluding dotted directories and | """Build a sorted list of file paths excluding dotted directories and | |||||||||||||||
dotted files. | dotted files. | |||||||||||||||
Each file path corresponds to a crate that lists all available | Each file path corresponds to a crate that lists all available | |||||||||||||||
versions. | versions. | |||||||||||||||
""" | """ | |||||||||||||||
crates_index = sorted( | crates_index = sorted( | |||||||||||||||
path | path | |||||||||||||||
for path in self.DESTINATION_PATH.rglob("*") | for path in self.DESTINATION_PATH.rglob("*") | |||||||||||||||
if not any(part.startswith(".") for part in path.parts) | if not any(part.startswith(".") for part in path.parts) | |||||||||||||||
and path.is_file() | and path.is_file() | |||||||||||||||
and path != self.DESTINATION_PATH / "config.json" | and path != self.DESTINATION_PATH / "config.json" | |||||||||||||||
) | ) | |||||||||||||||
return crates_index | return crates_index | |||||||||||||||
def get_last_commit_hash(self, repository_path: Path) -> str: | ||||||||||||||||
"""Returns the last commit hash of a git repository""" | ||||||||||||||||
assert repository_path.exists() | ||||||||||||||||
repo = Repo(str(repository_path)) | ||||||||||||||||
head = repo.head() | ||||||||||||||||
last_commit = repo[head] | ||||||||||||||||
return last_commit.id.decode() | ||||||||||||||||
def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: | ||||||||||||||||
"""Given a file path within a Git repository, returns its last commit | ||||||||||||||||
date as iso8601 | ||||||||||||||||
""" | ||||||||||||||||
repo = Repo(str(self.DESTINATION_PATH)) | ||||||||||||||||
# compute relative path otherwise it fails | ||||||||||||||||
relative_path = filepath.relative_to(self.DESTINATION_PATH) | ||||||||||||||||
walker = repo.get_walker(paths=[str(relative_path).encode()], max_entries=1) | ||||||||||||||||
vlorentzUnsubmitted Done Inline Actions
vlorentz: | ||||||||||||||||
try: | ||||||||||||||||
commit = next(iter(walker)).commit | ||||||||||||||||
except StopIteration: | ||||||||||||||||
Done Inline Actionsyou need to sanitize or escape filepath vlorentz: you need to sanitize or escape `filepath` | ||||||||||||||||
logger.error( | ||||||||||||||||
"Can not find %s related commits in repository %s" | ||||||||||||||||
Done Inline Actions
let logging instructions do the formatting. ardumont: let logging instructions do the formatting. | ||||||||||||||||
% (relative_path, repo) | ||||||||||||||||
) | ||||||||||||||||
return None | ||||||||||||||||
else: | ||||||||||||||||
last_update = datetime.datetime.fromtimestamp( | ||||||||||||||||
commit.author_time, datetime.timezone.utc | ||||||||||||||||
) | ||||||||||||||||
return last_update | ||||||||||||||||
def get_last_update_by_commit(self, commit: str) -> datetime.datetime: | ||||||||||||||||
"""Given a commit hash, returns the date of the commit""" | ||||||||||||||||
repo = Repo(str(self.DESTINATION_PATH)) | ||||||||||||||||
commit_object = repo[commit.encode()] | ||||||||||||||||
last_update = datetime.datetime.fromtimestamp( | ||||||||||||||||
commit_object.author_time, datetime.timezone.utc | ||||||||||||||||
) | ||||||||||||||||
Done Inline Actionslooks like dead code vlorentz: looks like dead code | ||||||||||||||||
return last_update | ||||||||||||||||
def parse_diff(self, diff: str) -> Dict[str, Any]: | ||||||||||||||||
"""Extract diff information and commit hash from git log command string output""" | ||||||||||||||||
parsed: Dict[str, Any] = dict() | ||||||||||||||||
commit: str = "" | ||||||||||||||||
for line in diff.splitlines(): | ||||||||||||||||
# Git commit hash are 40 long with leading and trailing single quote it counts 42 | ||||||||||||||||
if line.startswith("'") and len(line) == 42: | ||||||||||||||||
commit = line.replace("'", "") | ||||||||||||||||
# Diff line starts with +, data is always json so add { too | ||||||||||||||||
if line.startswith("+{"): | ||||||||||||||||
version = json.loads(line[1:]) | ||||||||||||||||
if commit: | ||||||||||||||||
if version["name"] not in parsed.keys(): | ||||||||||||||||
parsed[version["name"]] = [dict(data=version, commit=commit)] | ||||||||||||||||
else: | ||||||||||||||||
parsed[version["name"]].append( | ||||||||||||||||
dict(data=version, commit=commit) | ||||||||||||||||
) | ||||||||||||||||
return parsed | ||||||||||||||||
def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: | ||||||||||||||||
"""Transform package version definition dict to a suitable | ||||||||||||||||
page entry dict | ||||||||||||||||
""" | ||||||||||||||||
return dict( | ||||||||||||||||
name=entry["name"], | ||||||||||||||||
version=entry["vers"], | ||||||||||||||||
checksum=entry["cksum"], | ||||||||||||||||
crate_file=self.CRATE_FILE_URL_PATTERN.format( | ||||||||||||||||
crate=entry["name"], version=entry["vers"] | ||||||||||||||||
), | ||||||||||||||||
) | ||||||||||||||||
def get_pages(self) -> Iterator[CratesListerPage]: | def get_pages(self) -> Iterator[CratesListerPage]: | |||||||||||||||
"""Yield an iterator sorted by name in ascending order of pages. | """Yield an iterator sorted by name in ascending order of pages. | |||||||||||||||
Each page is a list of crate versions with: | Each page is a list of crate versions with: | |||||||||||||||
- name: Name of the crate | - name: Name of the crate | |||||||||||||||
- version: Version | - version: Version | |||||||||||||||
- checksum: Checksum | - checksum: Checksum | |||||||||||||||
- crate_file: Url of the crate file | - crate_file: Url of the crate file | |||||||||||||||
- last_update: Date of the last commit of the corresponding index | - last_update: Date of the last commit of the corresponding index | |||||||||||||||
file | file | |||||||||||||||
""" | """ | |||||||||||||||
# Fetch crates.io index repository | # Fetch crates.io index repository | |||||||||||||||
self.get_index_repository() | self.get_index_repository() | |||||||||||||||
# Get a list of all crates files from the index repository | ||||||||||||||||
crates_index = self.get_crates_index() | ||||||||||||||||
logger.debug("found %s crates in crates_index", len(crates_index)) | ||||||||||||||||
if not self.state.last_commit: | ||||||||||||||||
# First discovery | ||||||||||||||||
# List all crates files from the index repository | ||||||||||||||||
crates_index = self.get_crates_index() | ||||||||||||||||
Done Inline Actions
ardumont: | ||||||||||||||||
logger.debug("Found %s crates in crates_index", len(crates_index)) | ||||||||||||||||
# Each line of a crate file is a json entry describing released versions | ||||||||||||||||
# for a package | ||||||||||||||||
for crate in crates_index: | for crate in crates_index: | |||||||||||||||
page = [] | page = [] | |||||||||||||||
# %cI is for strict iso8601 date formatting | last_update = self.get_last_update_by_file(crate) | |||||||||||||||
last_update_str = subprocess.check_output( | ||||||||||||||||
["git", "log", "-1", "--pretty=format:%cI", str(crate)], | ||||||||||||||||
cwd=self.DESTINATION_PATH, | ||||||||||||||||
) | ||||||||||||||||
last_update = iso8601.parse_date(last_update_str.decode().strip()) | ||||||||||||||||
with crate.open("rb") as current_file: | with crate.open("rb") as current_file: | |||||||||||||||
for line in current_file: | for line in current_file: | |||||||||||||||
data = json.loads(line) | data = json.loads(line) | |||||||||||||||
# pick only the data we need | entry = self.page_entry_dict(data) | |||||||||||||||
Done Inline Actions
just to be safe vlorentz: just to be safe | ||||||||||||||||
page.append( | entry["last_update"] = last_update | |||||||||||||||
Done Inline Actions
I don't think you actually need to decode paths here. Can you try this patch? Removing conversions avoids bugs if crates.io ever adds non-UTF8 filenames. (Don't bother adding a test for this) vlorentz: I don't think you actually need to decode paths here. Can you try this patch?
Removing… | ||||||||||||||||
dict( | page.append(entry) | |||||||||||||||
name=data["name"], | yield page | |||||||||||||||
version=data["vers"], | else: | |||||||||||||||
checksum=data["cksum"], | # Incremental case | |||||||||||||||
crate_file=self.CRATE_FILE_URL_PATTERN.format( | # Get new package version by parsing a range of commit from index repository | |||||||||||||||
crate=data["name"], version=data["vers"] | diff = subprocess.check_output( | |||||||||||||||
), | [ | |||||||||||||||
last_update=last_update, | "git", | |||||||||||||||
"log", | ||||||||||||||||
"--pretty=format:'%H'", | ||||||||||||||||
"--unified=0", | ||||||||||||||||
"-p", | ||||||||||||||||
f"{self.state.last_commit}..HEAD", | ||||||||||||||||
], | ||||||||||||||||
cwd=self.DESTINATION_PATH, | ||||||||||||||||
) | ) | |||||||||||||||
parsed = self.parse_diff(diff.decode()) | ||||||||||||||||
logger.debug( | ||||||||||||||||
"Found %(count)s crates in diff log since %(last_commit)s", | ||||||||||||||||
{"count": len(parsed), "last_commit": self.state.last_commit}, | ||||||||||||||||
) | ) | |||||||||||||||
for k, v in parsed.items(): | ||||||||||||||||
page = [] | ||||||||||||||||
for elt in v: | ||||||||||||||||
entry = self.page_entry_dict(entry=elt["data"]) | ||||||||||||||||
entry["last_update"] = self.get_last_update_by_commit(elt["commit"]) | ||||||||||||||||
page.append(entry) | ||||||||||||||||
yield page | yield page | |||||||||||||||
def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: | |||||||||||||||
"""Iterate on all crate pages and yield ListedOrigin instances.""" | """Iterate on all crate pages and yield ListedOrigin instances.""" | |||||||||||||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | |||||||||||||||
url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) | url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) | |||||||||||||||
last_update = page[0]["last_update"] | last_update = page[0]["last_update"] | |||||||||||||||
artifacts = [] | artifacts = [] | |||||||||||||||
for version in page: | for version in page: | |||||||||||||||
filename = urlparse(version["crate_file"]).path.split("/")[-1] | filename = urlparse(version["crate_file"]).path.split("/")[-1] | |||||||||||||||
# Build an artifact entry following original-artifacts-json specification | # Build an artifact entry following original-artifacts-json specification | |||||||||||||||
# https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 | # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 | |||||||||||||||
artifact = { | artifact = { | |||||||||||||||
"filename": f"{filename}", | "filename": f"{filename}", | |||||||||||||||
"checksums": { | "checksums": { | |||||||||||||||
"sha256": f"{version['checksum']}", | "sha256": f"{version['checksum']}", | |||||||||||||||
}, | }, | |||||||||||||||
"url": version["crate_file"], | "url": version["crate_file"], | |||||||||||||||
"version": version["version"], | "version": version["version"], | |||||||||||||||
} | } | |||||||||||||||
artifacts.append(artifact) | artifacts.append(artifact) | |||||||||||||||
Done Inline Actions
i'm not sure whether you actually want to wrap version['version'] into a string or not (i assumed it was a string already). (Also, feel free to dismiss this comment if that does not make any sense to you ;) ardumont: i'm not sure whether you actually want to wrap version['version'] into a string or not (i… | ||||||||||||||||
Done Inline ActionsFor now let's let it as is. It's the same flat structure as artifacts and the loader do something similar when it receives the artifacts data. Regarding crates_metadata and the yanked value, for now it's not used in the loader for anything else than being informative. franckbret: For now let's let it as is. It's the same flat structure as artifacts and the loader do… | ||||||||||||||||
yield ListedOrigin( | yield ListedOrigin( | |||||||||||||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | |||||||||||||||
visit_type=self.VISIT_TYPE, | visit_type=self.VISIT_TYPE, | |||||||||||||||
url=url, | url=url, | |||||||||||||||
last_update=last_update, | last_update=last_update, | |||||||||||||||
extra_loader_arguments={ | extra_loader_arguments={ | |||||||||||||||
"artifacts": artifacts, | "artifacts": artifacts, | |||||||||||||||
}, | }, | |||||||||||||||
) | ) | |||||||||||||||
def finalize(self) -> None: | ||||||||||||||||
last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) | ||||||||||||||||
if self.state.last_commit == last: | ||||||||||||||||
self.updated = False | ||||||||||||||||
Done Inline Actionswhere is this used? vlorentz: where is this used? | ||||||||||||||||
Done Inline Actionsah, it's used in the base class, nvm. You should check the value of self.state.last_commit changed, though. vlorentz: ah, it's used in the base class, nvm.
You should check the value of `self.state.last_commit`… | ||||||||||||||||
else: | ||||||||||||||||
self.state.last_commit = last | ||||||||||||||||
self.updated = True | ||||||||||||||||
Done Inline Actionsadd a test for this (both when successful and failed) vlorentz: add a test for this (both when successful and failed) | ||||||||||||||||
Done Inline ActionsI added a test to ensure it cleans up successfully after lister runs, but can't find a way to make it fail with current code. One way could be by improving get_index_repository method to raise if directory already exists, but looks like blocking and useless unless you have a better idea? franckbret: I added a test to ensure it cleans up successfully after lister runs, but can't find a way to… | ||||||||||||||||
Done Inline Actionsnah, it's fine vlorentz: nah, it's fine | ||||||||||||||||
Done Inline Actionsnot needed to test this either; shutil.rmtree raises an exception when it fails vlorentz: not needed to test this either; `shutil.rmtree` raises an exception when it fails |