diff --git a/mypy.ini b/mypy.ini index eb2343b..51c1c65 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,45 +1,47 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-bs4.*] ignore_missing_imports = True [mypy-celery.*] ignore_missing_imports = True [mypy-debian.*] ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True [mypy-launchpadlib.*] ignore_missing_imports = True [mypy-lazr.*] ignore_missing_imports = True [mypy-lxml.*] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True [mypy-pytest_postgresql.*] ignore_missing_imports = True [mypy-requests_mock.*] ignore_missing_imports = True [mypy-urllib3.util.*] ignore_missing_imports = True [mypy-xmltodict.*] ignore_missing_imports = True +[mypy-dulwich.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index ea5ee0f..5021815 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,10 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity >= 6.2 xmltodict lxml +dulwich diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index 63604a1..fbe3003 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,162 +1,259 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - +from dataclasses import asdict, dataclass +import datetime +import io import json import logging from pathlib import Path -import subprocess -from typing import Any, Dict, Iterator, List +import shutil +from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse -import iso8601 +from dulwich import porcelain +from dulwich.patch import write_tree_diff +from dulwich.repo import Repo from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. CratesListerPage = List[Dict[str, Any]] -class CratesLister(StatelessLister[CratesListerPage]): +@dataclass +class CratesListerState: + """Store lister state for incremental mode operations. + 'last_commit' represents a git commit hash + """ + + last_commit: str = "" + + +class CratesLister(Lister[CratesListerState, CratesListerPage]): """List origins from the "crates.io" forge. It basically fetches https://github.com/rust-lang/crates.io-index.git to a - temp directory and then walks through each file to get the crate's info. + temp directory and then walks through each file to get the crate's info on + the first run. + + In incremental mode, it relies on the same Git repository but instead of reading + each file of the repo, it get the differences through ``git log last_commit..HEAD``. + Resulting output string is parsed to build page entries. 
""" # Part of the lister API, that identifies this lister LISTER_NAME = "crates" # (Optional) CVS type of the origins listed by this lister, if constant VISIT_TYPE = "crates" INSTANCE = "crates" INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git" DESTINATION_PATH = Path("/tmp/crates.io-index") CRATE_FILE_URL_PATTERN = ( "https://static.crates.io/crates/{crate}/{crate}-{version}.crate" ) CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.INDEX_REPOSITORY_URL, instance=self.INSTANCE, ) + def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: + if "last_commit" not in d: + d["last_commit"] = "" + return CratesListerState(**d) + + def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: + return asdict(state) + def get_index_repository(self) -> None: """Get crates.io-index repository up to date running git command.""" - - subprocess.check_call( - [ - "git", - "clone", - self.INDEX_REPOSITORY_URL, - self.DESTINATION_PATH, - ] - ) + if self.DESTINATION_PATH.exists(): + porcelain.pull( + self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL + ) + else: + porcelain.clone( + source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH + ) def get_crates_index(self) -> List[Path]: """Build a sorted list of file paths excluding dotted directories and dotted files. Each file path corresponds to a crate that lists all available versions. """ - crates_index = sorted( path for path in self.DESTINATION_PATH.rglob("*") if not any(part.startswith(".") for part in path.parts) and path.is_file() and path != self.DESTINATION_PATH / "config.json" ) return crates_index + def get_last_commit_hash(self, repository_path: Path) -> str: + """Returns the last commit hash of a git repository""" + assert repository_path.exists() + + repo = Repo(str(repository_path)) + head = repo.head() + last_commit = repo[head] + + return last_commit.id.decode() + + def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: + """Given a file path within a Git repository, returns its last commit + date as iso8601 + """ + repo = Repo(str(self.DESTINATION_PATH)) + # compute relative path otherwise it fails + relative_path = filepath.relative_to(self.DESTINATION_PATH) + walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1) + try: + commit = next(iter(walker)).commit + except StopIteration: + logger.error( + "Can not find %s related commits in repository %s", relative_path, repo + ) + return None + else: + last_update = datetime.datetime.fromtimestamp( + commit.author_time, datetime.timezone.utc + ) + return last_update + + def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: + """Transform package version definition dict to a suitable + page entry dict + """ + return dict( + name=entry["name"], + version=entry["vers"], + checksum=entry["cksum"], + yanked=entry["yanked"], + crate_file=self.CRATE_FILE_URL_PATTERN.format( + crate=entry["name"], version=entry["vers"] + ), + ) + def get_pages(self) -> Iterator[CratesListerPage]: """Yield an iterator sorted by name in ascending order of pages. 
Each page is a list of crate versions with: - name: Name of the crate - version: Version - checksum: Checksum - crate_file: Url of the crate file - last_update: Date of the last commit of the corresponding index file """ # Fetch crates.io index repository self.get_index_repository() - # Get a list of all crates files from the index repository - crates_index = self.get_crates_index() - logger.debug("found %s crates in crates_index", len(crates_index)) + if not self.state.last_commit: + # First discovery + # List all crates files from the index repository + crates_index = self.get_crates_index() + else: + # Incremental case + # Get new package version by parsing a range of commits from index repository + repo = Repo(str(self.DESTINATION_PATH)) + head = repo[repo.head()] + last = repo[self.state.last_commit.encode()] + outstream = io.BytesIO() + write_tree_diff(outstream, repo.object_store, last.tree, head.tree) + raw_diff = outstream.getvalue() + crates_index = [] + for line in raw_diff.splitlines(): + if line.startswith(b"+++ b/"): + filepath = line.split(b"+++ b/", 1)[1] + crates_index.append(self.DESTINATION_PATH / filepath.decode()) + crates_index = sorted(crates_index) + + logger.debug("Found %s crates in crates_index", len(crates_index)) + + # Each line of a crate file is a json entry describing released versions + # for a package for crate in crates_index: page = [] - # %cI is for strict iso8601 date formatting - last_update_str = subprocess.check_output( - ["git", "log", "-1", "--pretty=format:%cI", str(crate)], - cwd=self.DESTINATION_PATH, - ) - last_update = iso8601.parse_date(last_update_str.decode().strip()) + last_update = self.get_last_update_by_file(crate) with crate.open("rb") as current_file: for line in current_file: data = json.loads(line) - # pick only the data we need - page.append( - dict( - name=data["name"], - version=data["vers"], - checksum=data["cksum"], - crate_file=self.CRATE_FILE_URL_PATTERN.format( - crate=data["name"], version=data["vers"] - ), - last_update=last_update, - ) - ) + entry = self.page_entry_dict(data) + entry["last_update"] = last_update + page.append(entry) yield page def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: """Iterate on all crate pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) last_update = page[0]["last_update"] artifacts = [] + crates_metadata = [] for version in page: filename = urlparse(version["crate_file"]).path.split("/")[-1] # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 artifact = { "filename": f"{filename}", "checksums": { "sha256": f"{version['checksum']}", }, "url": version["crate_file"], "version": version["version"], } artifacts.append(artifact) + data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} + crates_metadata.append(data) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, extra_loader_arguments={ "artifacts": artifacts, + "crates_metadata": crates_metadata, }, ) + + def finalize(self) -> None: + last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) + if self.state.last_commit == last: + self.updated = False + else: + self.state.last_commit = last + self.updated = True + + logger.debug("Listing crates origin completed with last commit id 
%s", last) + + # Cleanup by removing the repository directory + if self.DESTINATION_PATH.exists(): + shutil.rmtree(self.DESTINATION_PATH) + logger.debug( + "Successfully removed %s directory", str(self.DESTINATION_PATH) + ) diff --git a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz index 8b384b4..498b105 100644 Binary files a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz and b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz differ diff --git a/swh/lister/crates/tests/data/fake_crates_repository_init.sh b/swh/lister/crates/tests/data/fake_crates_repository_init.sh index 60680d6..6368601 100755 --- a/swh/lister/crates/tests/data/fake_crates_repository_init.sh +++ b/swh/lister/crates/tests/data/fake_crates_repository_init.sh @@ -1,37 +1,64 @@ #!/usr/bin/env bash # Script to generate fake-crates-repository.tar.gz # Creates a git repository like https://github.com/rust-lang/crates.io-index # for tests purposes set -euo pipefail # files and directories mkdir -p tmp_dir/crates.io-index/ cd tmp_dir/crates.io-index/ mkdir -p .dot-dir touch .dot-dir/empty mkdir -p ra/nd mkdir -p re/ge touch .dot-file touch config.json +# Init as a git repository +git init +git add . +git commit -m "Init fake crates.io-index repository for tests purpose" + echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand +git add . +git commit -m " Updating crate rand#0.1.1" + echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand +git add . +git commit -m " Updating crate rand#0.1.2" echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.0" + echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.1" + echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.2" + echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex +git add . 
+git commit -m " Updating crate regex#0.1.3" echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax - -# Init as a git repository -git init git add . -git commit -m "Init fake crates.io-index repository for tests purpose" +git commit -m " Updating crate regex-syntax#0.1.0" # Save some space rm .git/hooks/*.sample + +# Compress git directory as a tar.gz archive +cd ../ +tar -cvzf fake-crates-repository.tar.gz crates.io-index +mv fake-crates-repository.tar.gz ../ + +# Clean up tmp_dir +cd ../ +rm -rf tmp_dir diff --git a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py index bbb1c7d..2c62449 100644 --- a/swh/lister/crates/tests/test_lister.py +++ b/swh/lister/crates/tests/test_lister.py @@ -1,114 +1,234 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path -from swh.lister.crates.lister import CratesLister +from dulwich.repo import Repo + +from swh.lister.crates.lister import CratesLister, CratesListerState from swh.lister.crates.tests import prepare_repository_from_archive expected_origins = [ { "url": "https://crates.io/api/v1/crates/rand", "artifacts": [ { "checksums": { "sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950 }, "filename": "rand-0.1.1.crate", "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate", "version": "0.1.1", }, { "checksums": { "sha256": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7", # noqa: B950 }, "filename": "rand-0.1.2.crate", "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate", "version": "0.1.2", }, ], + "metadata": [ + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex", "artifacts": [ { "checksums": { "sha256": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5", # noqa: B950 }, "filename": "regex-0.1.0.crate", "url": "https://static.crates.io/crates/regex/regex-0.1.0.crate", "version": "0.1.0", }, { "checksums": { "sha256": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36", # noqa: B950 }, "filename": "regex-0.1.1.crate", "url": "https://static.crates.io/crates/regex/regex-0.1.1.crate", "version": "0.1.1", }, { "checksums": { "sha256": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9", # noqa: B950 }, "filename": "regex-0.1.2.crate", "url": "https://static.crates.io/crates/regex/regex-0.1.2.crate", "version": "0.1.2", }, { "checksums": { "sha256": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3", # noqa: B950 }, "filename": "regex-0.1.3.crate", "url": "https://static.crates.io/crates/regex/regex-0.1.3.crate", "version": "0.1.3", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + { + "version": "0.1.3", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex-syntax", 
"artifacts": [ { "checksums": { "sha256": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944", # noqa: B950 }, "filename": "regex-syntax-0.1.0.crate", "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", "version": "0.1.0", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + ], }, ] +expected_origins_incremental = [expected_origins[1], expected_origins[2]] + + def test_crates_lister(datadir, tmp_path, swh_scheduler): archive_path = Path(datadir, "fake-crates-repository.tar.gz") repo_url = prepare_repository_from_archive( archive_path, "crates.io-index", tmp_path ) lister = CratesLister(scheduler=swh_scheduler) lister.INDEX_REPOSITORY_URL = repo_url lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" res = lister.run() assert res.pages == 3 assert res.origins == 3 expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url")) scheduler_origins_sorted = sorted( swh_scheduler.get_listed_origins(lister.lister_obj.id).results, key=lambda x: x.url, ) for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): assert scheduled.visit_type == "crates" assert scheduled.url == expected.get("url") assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( "artifacts" ) assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + +def test_crates_lister_incremental(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + # The lister has not run yet, get the index repository + lister.get_index_repository() + # Set a CratesListerState with a last commit value to force incremental case + repo = Repo(lister.DESTINATION_PATH) + # Lets set this last commit to third one from head + step = list(repo.get_walker(max_entries=3))[-1] + last_commit_state = CratesListerState(last_commit=step.commit.id.decode()) + lister.state = last_commit_state + + res = lister.run() + + assert res.pages == 2 + assert res.origins == 2 + + expected_origins_sorted = sorted( + expected_origins_incremental, key=lambda x: x.get("url") + ) + scheduler_origins_sorted = sorted( + swh_scheduler.get_listed_origins(lister.lister_obj.id).results, + key=lambda x: x.url, + ) + + for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): + assert scheduled.visit_type == "crates" + assert scheduled.url == expected.get("url") + assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( + "artifacts" + ) + + assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + +def test_crates_lister_incremental_nothing_new(datadir, tmp_path, swh_scheduler): + """Ensure incremental mode runs fine when the repository last commit is the same + than lister.state.las-_commit""" + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + lister.get_index_repository() + + repo = Repo(lister.DESTINATION_PATH) + + # Set a CratesListerState with a last commit value to force incremental case + last_commit_state = 
CratesListerState(last_commit=repo.head().decode())
+    lister.state = last_commit_state
+
+    res = lister.run()
+
+    assert res.pages == 0
+    assert res.origins == 0
+
+
+def test_crates_lister_repository_cleanup(datadir, tmp_path, swh_scheduler):
+    archive_path = Path(datadir, "fake-crates-repository.tar.gz")
+    repo_url = prepare_repository_from_archive(
+        archive_path, "crates.io-index", tmp_path
+    )
+
+    lister = CratesLister(scheduler=swh_scheduler)
+    lister.INDEX_REPOSITORY_URL = repo_url
+    lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests"
+
+    lister.run()
+    # Repository directory should not exist after the lister runs
+    assert not lister.DESTINATION_PATH.exists()
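
Note: the snippet below is a standalone sketch, not part of the patch. It isolates the dulwich techniques the new lister relies on: cloning or pulling the crates.io index, listing the files touched between a saved ``last_commit`` and ``HEAD`` with ``write_tree_diff``, and reading a file's last modification date from the commit walker. The ``INDEX_URL`` and ``CLONE_PATH`` constants and the ``__main__`` wrapper are illustrative assumptions, not code from this change.

```python
# Standalone sketch of the dulwich-based incremental listing approach.
# Assumptions: INDEX_URL, CLONE_PATH and the __main__ block are illustrative only.
import datetime
import io
from pathlib import Path
from typing import List, Optional

from dulwich import porcelain
from dulwich.patch import write_tree_diff
from dulwich.repo import Repo

INDEX_URL = "https://github.com/rust-lang/crates.io-index.git"  # assumed
CLONE_PATH = Path("/tmp/crates.io-index")  # assumed scratch location


def clone_or_pull(url: str, destination: Path) -> None:
    """Clone the index on first run, otherwise update the existing copy."""
    if destination.exists():
        porcelain.pull(destination, remote_location=url)
    else:
        porcelain.clone(source=url, target=destination)


def changed_files_since(destination: Path, last_commit: str) -> List[Path]:
    """List files touched between ``last_commit`` and ``HEAD``.

    Mirrors the incremental branch of ``get_pages``: the textual tree diff is
    scanned for ``+++ b/<path>`` headers, one per added or modified crate file.
    """
    repo = Repo(str(destination))
    head = repo[repo.head()]
    last = repo[last_commit.encode()]
    out = io.BytesIO()
    write_tree_diff(out, repo.object_store, last.tree, head.tree)
    paths = []
    for line in out.getvalue().splitlines():
        if line.startswith(b"+++ b/"):
            paths.append(destination / line.split(b"+++ b/", 1)[1].decode())
    return sorted(paths)


def last_update(destination: Path, filepath: Path) -> Optional[datetime.datetime]:
    """Date of the most recent commit touching ``filepath`` (UTC), or None."""
    repo = Repo(str(destination))
    relative = filepath.relative_to(destination)
    walker = repo.get_walker(paths=[bytes(relative)], max_entries=1)
    try:
        commit = next(iter(walker)).commit
    except StopIteration:
        return None
    return datetime.datetime.fromtimestamp(commit.author_time, datetime.timezone.utc)


if __name__ == "__main__":
    clone_or_pull(INDEX_URL, CLONE_PATH)
    repo = Repo(str(CLONE_PATH))
    # Pretend the previous run stopped three commits before HEAD, as in the tests.
    previous = list(repo.get_walker(max_entries=3))[-1].commit.id.decode()
    for crate_file in changed_files_since(CLONE_PATH, previous):
        print(crate_file, last_update(CLONE_PATH, crate_file))
```

One consequence of scanning the textual diff for ``+++ b/`` headers is that only added and modified crate files are picked up; deletions appear as ``+++ /dev/null`` and are skipped, which fits how the index evolves (yanking a version rewrites a line in the crate file rather than deleting it).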