swh/lister/crates/lister.py
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import datetime
import io
import json
import logging
from pathlib import Path
import shutil
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse

from dulwich import porcelain
from dulwich.patch import write_tree_diff
from dulwich.repo import Repo

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
CratesListerPage = List[Dict[str, Any]]


@dataclass
class CratesListerState:
    """Store lister state for incremental mode operations.

    'last_commit' represents a git commit hash
    """

    last_commit: str = ""


class CratesLister(Lister[CratesListerState, CratesListerPage]):
| """List origins from the "crates.io" forge. | """List origins from the "crates.io" forge. | |||||||||||||||
| It basically fetches https://github.com/rust-lang/crates.io-index.git to a | It basically fetches https://github.com/rust-lang/crates.io-index.git to a | |||||||||||||||
| temp directory and then walks through each file to get the crate's info. | temp directory and then walks through each file to get the crate's info on | |||||||||||||||
| the first run. | ||||||||||||||||
| In incremental mode, it relies on the same Git repository but instead of reading | ||||||||||||||||
| each file of the repo, it get the differences through ``git log last_commit..HEAD``. | ||||||||||||||||
    The resulting diff output is parsed to build the page entries.
    """
    # Part of the lister API, that identifies this lister
    LISTER_NAME = "crates"
    # (Optional) visit type of the origins listed by this lister, if constant
    VISIT_TYPE = "crates"

    INSTANCE = "crates"
    # [… 11 lines folded in the changeset view …]
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=self.INDEX_REPOSITORY_URL,
            instance=self.INSTANCE,
        )
    def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState:
vlorentz: this is equivalent
| if "last_commit" not in d: | ||||||||||||||||
| d["last_commit"] = "" | ||||||||||||||||
| return CratesListerState(**d) | ||||||||||||||||
| def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: | ||||||||||||||||
| return asdict(state) | ||||||||||||||||
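Note: the "this is equivalent" remark above most likely points at a shorter form. Since ``last_commit`` already defaults to ``""`` on the dataclass, the membership check adds nothing; a minimal sketch of that assumption (not necessarily the reviewer's exact suggestion):

    def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState:
        # last_commit defaults to "" on CratesListerState, so a missing key
        # yields the same state without the explicit backfill
        return CratesListerState(**d)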
    def get_index_repository(self) -> None:
        """Clone the crates.io-index repository, or pull it to bring it up to date."""
        if self.DESTINATION_PATH.exists():
            porcelain.pull(
                self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL
            )
        else:
            porcelain.clone(
                source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH
            )
vlorentz: Could you swap the `if` and `else` blocks? No big deal, but I prefer to avoid `not` in conditionals when there is an else clause.
franckbret: Sure
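Illustration of the style point (generic placeholder names, not code from this changeset):

    # preferred: positive condition first, as applied in get_index_repository()
    if path.exists():
        update(path)
    else:
        create(path)

    # avoided: negated condition followed by an else branch
    if not path.exists():
        create(path)
    else:
        update(path)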
    def get_crates_index(self) -> List[Path]:
        """Build a sorted list of file paths excluding dotted directories and
        dotted files.

        Each file path corresponds to a crate that lists all available
        versions.
        """
        crates_index = sorted(
            path
            for path in self.DESTINATION_PATH.rglob("*")
            if not any(part.startswith(".") for part in path.parts)
            and path.is_file()
            and path != self.DESTINATION_PATH / "config.json"
        )

        return crates_index
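For context, the crates.io index lays files out by crate name length, for example ``3/s/syn`` for three-letter names and ``se/rd/serde`` for longer ones, with ``config.json`` at the repository root; that layout (documented by crates.io, not part of this changeset) is what the ``config.json`` and dotted-path filters above account for. A hypothetical usage sketch, assuming a ``lister`` instance whose index has already been fetched:

    index_files = lister.get_crates_index()
    # dotted directories such as .github/ and the top-level config.json are excluded
    assert all(not part.startswith(".") for p in index_files for part in p.parts)
    assert lister.DESTINATION_PATH / "config.json" not in index_files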
    def get_last_commit_hash(self, repository_path: Path) -> str:
        """Returns the last commit hash of a git repository"""
        assert repository_path.exists()

        repo = Repo(str(repository_path))
        head = repo.head()
        last_commit = repo[head]

        return last_commit.id.decode()
    def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]:
        """Given a file path within a Git repository, returns the date of its
        last commit as a timezone-aware datetime
        """
        repo = Repo(str(self.DESTINATION_PATH))
        # compute relative path otherwise it fails
        relative_path = filepath.relative_to(self.DESTINATION_PATH)
        walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1)
        try:
            commit = next(iter(walker)).commit
        except StopIteration:

vlorentz: you need to sanitize or escape `filepath`

            logger.error(
                "Can not find %s related commits in repository %s", relative_path, repo
            )

ardumont: let logging instructions do the formatting.

            return None
        else:
            last_update = datetime.datetime.fromtimestamp(
                commit.author_time, datetime.timezone.utc
            )
            return last_update
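A usage sketch, with a hypothetical ``lister`` instance and crate path:

    # Most recent index commit touching one crate file; the walker is limited to
    # that path and to a single entry, so only the newest matching commit is read.
    when = lister.get_last_update_by_file(lister.DESTINATION_PATH / "se" / "rd" / "serde")
    # -> timezone-aware datetime in UTC, or None if no commit touches that file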
    def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Transform package version definition dict to a suitable
        page entry dict
        """
        return dict(
            name=entry["name"],
            version=entry["vers"],
            checksum=entry["cksum"],

vlorentz: looks like dead code

            yanked=entry["yanked"],
            crate_file=self.CRATE_FILE_URL_PATTERN.format(
                crate=entry["name"], version=entry["vers"]
            ),
        )
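As an illustration (the concrete values are made up), one line of an index file, i.e. one published version, maps to a page entry as follows:

    # input line, abridged JSON from an index file:
    #   {"name": "rand", "vers": "0.8.5", "cksum": "<sha256>", "yanked": false, ...}
    # page_entry_dict() result:
    #   {
    #       "name": "rand",
    #       "version": "0.8.5",
    #       "checksum": "<sha256>",
    #       "yanked": False,
    #       "crate_file": CRATE_FILE_URL_PATTERN.format(crate="rand", version="0.8.5"),
    #   }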
    def get_pages(self) -> Iterator[CratesListerPage]:
        """Yield an iterator sorted by name in ascending order of pages.

        Each page is a list of crate versions with:
            - name: Name of the crate
            - version: Version
            - checksum: Checksum
            - crate_file: Url of the crate file
            - last_update: Date of the last commit of the corresponding index
              file
        """
        # Fetch crates.io index repository
        self.get_index_repository()

        if not self.state.last_commit:
            # First discovery
            # List all crates files from the index repository
            crates_index = self.get_crates_index()
        else:
            # Incremental case
            # Get new package versions by parsing a range of commits from the
            # index repository
            repo = Repo(str(self.DESTINATION_PATH))
            head = repo[repo.head()]
            last = repo[self.state.last_commit.encode()]

            outstream = io.BytesIO()
            write_tree_diff(outstream, repo.object_store, last.tree, head.tree)
            raw_diff = outstream.getvalue()

            crates_index = []
            for line in raw_diff.splitlines():
                if line.startswith(b"+++ b/"):
                    filepath = line.split(b"+++ b/", 1)[1]
                    crates_index.append(self.DESTINATION_PATH / filepath.decode())
            crates_index = sorted(crates_index)

        logger.debug("Found %s crates in crates_index", len(crates_index))

        # Each line of a crate file is a json entry describing released versions
        # for a package
        for crate in crates_index:
            page = []

vlorentz: just to be safe

            last_update = self.get_last_update_by_file(crate)

vlorentz: I don't think you actually need to decode paths here. Can you try this patch? Removing conversions avoids bugs if crates.io ever adds non-UTF8 filenames. (Don't bother adding a test for this)
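The patch vlorentz refers to is not shown in this view. Purely as a sketch of the idea, and only an assumption about its shape, the diff paths could stay as bytes so a non-UTF-8 filename never has to be decoded:

    # hypothetical bytes-only variant of the crates_index construction above
    # (requires `import os`)
    destination = os.fsencode(self.DESTINATION_PATH)  # bytes form of the checkout path
    crates_index = []
    for line in raw_diff.splitlines():
        if line.startswith(b"+++ b/"):
            filepath = line.split(b"+++ b/", 1)[1]
            crates_index.append(os.path.join(destination, filepath))  # stays bytes
    crates_index.sort()
    # later, open(crate, "rb") accepts a bytes path directly (crate.open() would no longer apply)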
            with crate.open("rb") as current_file:
                for line in current_file:
                    data = json.loads(line)
                    entry = self.page_entry_dict(data)
                    entry["last_update"] = last_update
                    page.append(entry)

            yield page
    def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
        """Iterate on all crate pages and yield ListedOrigin instances."""
        assert self.lister_obj.id is not None

        url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"])
        last_update = page[0]["last_update"]
        artifacts = []
        crates_metadata = []

        for version in page:
            filename = urlparse(version["crate_file"]).path.split("/")[-1]
            # Build an artifact entry following original-artifacts-json specification
            # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json  # noqa: B950
            artifact = {
                "filename": f"{filename}",
                "checksums": {
                    "sha256": f"{version['checksum']}",
                },
                "url": version["crate_file"],
                "version": version["version"],
            }
            artifacts.append(artifact)
            data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}}
ardumont: I'm not sure whether you actually want to wrap version['version'] into a string or not (I assumed it was a string already). (Also, feel free to dismiss this comment if that does not make any sense to you ;)
franckbret: For now let's leave it as is. It's the same flat structure as artifacts, and the loader does something similar when it receives the artifacts data. Regarding crates_metadata and the yanked value, for now it's not used in the loader for anything other than being informative.
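Concretely, for a version entry with ``vers == "0.8.5"`` and ``yanked == False``, the line above produces ``{"0.8.5": {"yanked": "False"}}``: the f-string around ``version['version']`` is a no-op on a value that is already a string (ardumont's point), while the one around the boolean flattens it to the string ``"False"``.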
            crates_metadata.append(data)
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=url,
            last_update=last_update,
            extra_loader_arguments={
                "artifacts": artifacts,
                "crates_metadata": crates_metadata,
            },
        )
    def finalize(self) -> None:
        last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH)
        if self.state.last_commit == last:
            self.updated = False
vlorentz: where is this used?
vlorentz: ah, it's used in the base class, nvm. You should check whether the value of self.state.last_commit changed, though.
        else:
            self.state.last_commit = last
            self.updated = True

        logger.debug("Listing crates origin completed with last commit id %s", last)
vlorentz: add a test for this (both when successful and failed)
franckbret: I added a test to ensure it cleans up successfully after the lister runs, but can't find a way to make it fail with the current code. One way could be to make get_index_repository raise if the directory already exists, but that looks blocking and useless, unless you have a better idea?
vlorentz: nah, it's fine
        # Cleanup by removing the repository directory
        if self.DESTINATION_PATH.exists():

vlorentz: not needed to test this either; shutil.rmtree raises an exception when it fails

            shutil.rmtree(self.DESTINATION_PATH)
            logger.debug(
                "Successfully removed %s directory", str(self.DESTINATION_PATH)
            )