diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -43,3 +43,5 @@ [mypy-xmltodict.*] ignore_missing_imports = True +[mypy-dulwich.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ tenacity >= 6.2 xmltodict lxml +dulwich diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -2,20 +2,24 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - +from dataclasses import asdict, dataclass +import datetime +import io import json import logging from pathlib import Path -import subprocess -from typing import Any, Dict, Iterator, List +import shutil +from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse -import iso8601 +from dulwich import porcelain +from dulwich.patch import write_tree_diff +from dulwich.repo import Repo from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -23,11 +27,25 @@ CratesListerPage = List[Dict[str, Any]] -class CratesLister(StatelessLister[CratesListerPage]): +@dataclass +class CratesListerState: + """Store lister state for incremental mode operations. + 'last_commit' represents a git commit hash + """ + + last_commit: str = "" + + +class CratesLister(Lister[CratesListerState, CratesListerPage]): """List origins from the "crates.io" forge. It basically fetches https://github.com/rust-lang/crates.io-index.git to a - temp directory and then walks through each file to get the crate's info. + temp directory and then walks through each file to get the crate's info on + the first run. 
+ + In incremental mode, it relies on the same Git repository but instead of reading + each file of the repo, it gets the differences through ``git log last_commit..HEAD``. + Resulting output string is parsed to build page entries. """ # Part of the lister API, that identifies this lister @@ -55,17 +73,24 @@ instance=self.INSTANCE, ) + def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: + if "last_commit" not in d: + d["last_commit"] = "" + return CratesListerState(**d) + + def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: + return asdict(state) + def get_index_repository(self) -> None: """Get crates.io-index repository up to date running git command.""" - - subprocess.check_call( - [ - "git", - "clone", - self.INDEX_REPOSITORY_URL, - self.DESTINATION_PATH, - ] - ) + if self.DESTINATION_PATH.exists(): + porcelain.pull( + self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL + ) + else: + porcelain.clone( + source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH + ) def get_crates_index(self) -> List[Path]: """Build a sorted list of file paths excluding dotted directories and @@ -74,7 +99,6 @@ Each file path corresponds to a crate that lists all available versions. 
""" - crates_index = sorted( path for path in self.DESTINATION_PATH.rglob("*") @@ -85,6 +109,51 @@ return crates_index + def get_last_commit_hash(self, repository_path: Path) -> str: + """Returns the last commit hash of a git repository""" + assert repository_path.exists() + + repo = Repo(str(repository_path)) + head = repo.head() + last_commit = repo[head] + + return last_commit.id.decode() + + def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: + """Given a file path within a Git repository, returns its last commit + date as iso8601 + """ + repo = Repo(str(self.DESTINATION_PATH)) + # compute relative path otherwise it fails + relative_path = filepath.relative_to(self.DESTINATION_PATH) + walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1) + try: + commit = next(iter(walker)).commit + except StopIteration: + logger.error( + "Can not find %s related commits in repository %s", relative_path, repo + ) + return None + else: + last_update = datetime.datetime.fromtimestamp( + commit.author_time, datetime.timezone.utc + ) + return last_update + + def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: + """Transform package version definition dict to a suitable + page entry dict + """ + return dict( + name=entry["name"], + version=entry["vers"], + checksum=entry["cksum"], + yanked=entry["yanked"], + crate_file=self.CRATE_FILE_URL_PATTERN.format( + crate=entry["name"], version=entry["vers"] + ), + ) + def get_pages(self) -> Iterator[CratesListerPage]: """Yield an iterator sorted by name in ascending order of pages. 
@@ -98,34 +167,41 @@ """ # Fetch crates.io index repository self.get_index_repository() - # Get a list of all crates files from the index repository - crates_index = self.get_crates_index() - logger.debug("found %s crates in crates_index", len(crates_index)) + if not self.state.last_commit: + # First discovery + # List all crates files from the index repository + crates_index = self.get_crates_index() + else: + # Incremental case + # Get new package version by parsing a range of commits from index repository + repo = Repo(str(self.DESTINATION_PATH)) + head = repo[repo.head()] + last = repo[self.state.last_commit.encode()] + outstream = io.BytesIO() + write_tree_diff(outstream, repo.object_store, last.tree, head.tree) + raw_diff = outstream.getvalue() + crates_index = [] + for line in raw_diff.splitlines(): + if line.startswith(b"+++ b/"): + filepath = line.split(b"+++ b/", 1)[1] + crates_index.append(self.DESTINATION_PATH / filepath.decode()) + crates_index = sorted(crates_index) + + logger.debug("Found %s crates in crates_index", len(crates_index)) + + # Each line of a crate file is a json entry describing released versions + # for a package for crate in crates_index: page = [] - # %cI is for strict iso8601 date formatting - last_update_str = subprocess.check_output( - ["git", "log", "-1", "--pretty=format:%cI", str(crate)], - cwd=self.DESTINATION_PATH, - ) - last_update = iso8601.parse_date(last_update_str.decode().strip()) + last_update = self.get_last_update_by_file(crate) with crate.open("rb") as current_file: for line in current_file: data = json.loads(line) - # pick only the data we need - page.append( - dict( - name=data["name"], - version=data["vers"], - checksum=data["cksum"], - crate_file=self.CRATE_FILE_URL_PATTERN.format( - crate=data["name"], version=data["vers"] - ), - last_update=last_update, - ) - ) + entry = self.page_entry_dict(data) + entry["last_update"] = last_update + page.append(entry) yield page def get_origins_from_page(self, page: 
CratesListerPage) -> Iterator[ListedOrigin]: @@ -136,6 +212,7 @@ url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) last_update = page[0]["last_update"] artifacts = [] + crates_metadata = [] for version in page: filename = urlparse(version["crate_file"]).path.split("/")[-1] @@ -150,6 +227,8 @@ "version": version["version"], } artifacts.append(artifact) + data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} + crates_metadata.append(data) yield ListedOrigin( lister_id=self.lister_obj.id, @@ -158,5 +237,23 @@ last_update=last_update, extra_loader_arguments={ "artifacts": artifacts, + "crates_metadata": crates_metadata, }, ) + + def finalize(self) -> None: + last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) + if self.state.last_commit == last: + self.updated = False + else: + self.state.last_commit = last + self.updated = True + + logger.debug("Listing crates origin completed with last commit id %s", last) + + # Cleanup by removing the repository directory + if self.DESTINATION_PATH.exists(): + shutil.rmtree(self.DESTINATION_PATH) + logger.debug( + "Successfully removed %s directory", str(self.DESTINATION_PATH) + ) diff --git a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ ra/nd/rand +git add . +git commit -m " Updating crate rand#0.1.1" + echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand +git add . 
+git commit -m " Updating crate rand#0.1.2" echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.0" + echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.1" + echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.2" + echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.3" echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax - -# Init as a git repository -git init git add . 
-git commit -m "Init fake crates.io-index repository for tests purpose" +git commit -m " Updating crate regex-syntax#0.1.0" # Save some space rm .git/hooks/*.sample + +# Compress git directory as a tar.gz archive +cd ../ +tar -cvzf fake-crates-repository.tar.gz crates.io-index +mv fake-crates-repository.tar.gz ../ + +# Clean up tmp_dir +cd ../ +rm -rf tmp_dir diff --git a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py --- a/swh/lister/crates/tests/test_lister.py +++ b/swh/lister/crates/tests/test_lister.py @@ -5,7 +5,9 @@ from pathlib import Path -from swh.lister.crates.lister import CratesLister +from dulwich.repo import Repo + +from swh.lister.crates.lister import CratesLister, CratesListerState from swh.lister.crates.tests import prepare_repository_from_archive expected_origins = [ @@ -29,6 +31,16 @@ "version": "0.1.2", }, ], + "metadata": [ + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex", @@ -66,6 +78,24 @@ "version": "0.1.3", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + { + "version": "0.1.3", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex-syntax", @@ -79,10 +109,19 @@ "version": "0.1.0", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + ], }, ] +expected_origins_incremental = [expected_origins[1], expected_origins[2]] + + def test_crates_lister(datadir, tmp_path, swh_scheduler): archive_path = Path(datadir, "fake-crates-repository.tar.gz") repo_url = prepare_repository_from_archive( @@ -112,3 +151,84 @@ ) assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + +def test_crates_lister_incremental(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = 
prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + # The lister has not run yet, get the index repository + lister.get_index_repository() + # Set a CratesListerState with a last commit value to force incremental case + repo = Repo(lister.DESTINATION_PATH) + # Let's set this last commit to third one from head + step = list(repo.get_walker(max_entries=3))[-1] + last_commit_state = CratesListerState(last_commit=step.commit.id.decode()) + lister.state = last_commit_state + + res = lister.run() + + assert res.pages == 2 + assert res.origins == 2 + + expected_origins_sorted = sorted( + expected_origins_incremental, key=lambda x: x.get("url") + ) + scheduler_origins_sorted = sorted( + swh_scheduler.get_listed_origins(lister.lister_obj.id).results, + key=lambda x: x.url, + ) + + for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): + assert scheduled.visit_type == "crates" + assert scheduled.url == expected.get("url") + assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( + "artifacts" + ) + + assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + +def test_crates_lister_incremental_nothing_new(datadir, tmp_path, swh_scheduler): + """Ensure incremental mode runs fine when the repository last commit is the same + as lister.state.last_commit""" + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + lister.get_index_repository() + + repo = Repo(lister.DESTINATION_PATH) + + # Set a CratesListerState with a last commit value to force incremental case + 
last_commit_state = CratesListerState(last_commit=repo.head().decode()) + lister.state = last_commit_state + + res = lister.run() + + assert res.pages == 0 + assert res.origins == 0 + + +def test_crates_lister_repository_cleanup(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + + lister.run() + # Repository directory should not exists after the lister runs + assert not lister.DESTINATION_PATH.exists()