diff --git a/swh/lister/crates/__init__.py b/swh/lister/crates/__init__.py --- a/swh/lister/crates/__init__.py +++ b/swh/lister/crates/__init__.py @@ -2,7 +2,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - """ Crates lister ============= @@ -20,20 +19,24 @@ Origins retrieving strategy --------------------------- -A json http api to list packages from crates.io but we choose a `different strategy`_ -in order to reduce to its bare minimum the amount of http call and bandwidth. -We clone a git repository which contains a tree of directories whose last child folder -name corresponds to the package name and contains a Cargo.toml file with some json data -to describe all existing versions of the package. -It takes a few seconds to clone the repository and browse it to build a full index of -existing package and related versions. -The lister is incremental, so the first time it clones and browses the repository as -previously described then stores the last seen commit id. -Next time, it retrieves the list of new and changed files since last commit id and -returns new or changed package with all of their related versions. - -Note that all Git related operations are done with `Dulwich`_, a Python -implementation of the Git file formats and protocols. +A json http api to list packages from crates.io exists but we choose a +`different strategy`_ in order to reduce to its bare minimum the amount +of http calls and bandwidth. + +We download a `db-dump.tar.gz`_ archive which contains csv files as an export of +the crates.io database. Crates.csv lists package names, versions.csv lists versions +related to package names. +It takes a few seconds to download the archive and parse csv files to build a +full index of existing packages and their related versions. + +The archive also contains a metadata.json file with a timestamp corresponding to +the date the database dump started. 
The database dump is automatically generated +every 24 hours, around 02:00:00 UTC. + +The lister is incremental, so the first time it downloads the db-dump.tar.gz archive as +previously described and stores the last seen database dump timestamp. +Next time, it downloads the db-dump.tar.gz but retrieves only the list of new and +changed packages since the last seen timestamp with all of their related versions. Page listing ------------ @@ -48,56 +51,45 @@ * **crate_file**: Package download url * **checksum**: Package download checksum * **yanked**: Whether the package is yanked or not -* **last_update**: Iso8601 last update date computed upon git commit date of the - related Cargo.toml file +* **last_update**: Iso8601 last update date of the version Origins from page ----------------- The lister yields one origin per page. The origin url corresponds to the http api url for a package, for example -"https://crates.io/api/v1/crates/{package}". +"https://crates.io/crates/{crate}". -Additionally we add some data set to "extra_loader_arguments": +Additionally we add some data for each version, set to "extra_loader_arguments": * **artifacts**: Represent data about the Crates to download, following :ref:`original-artifacts-json specification ` * **crates_metadata**: To store all other interesting attributes that do not belongs - to artifacts. For now it mainly indicate when a version is `yanked`_. + to artifacts. For now it mainly indicates when a version is `yanked`_, and the version + last_update timestamp. 
Origin data example:: { - "url": "https://crates.io/api/v1/crates/rand", + "url": "https://crates.io/crates/regex-syntax", "artifacts": [ { + "version": "0.1.0", "checksums": { - "sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950 - }, - "filename": "rand-0.1.1.crate", - "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate", - "version": "0.1.1", - }, - { - "checksums": { - "sha256": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7", # noqa: B950 + "sha256": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944", # noqa: B950 }, - "filename": "rand-0.1.2.crate", - "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate", - "version": "0.1.2", + "filename": "regex-syntax-0.1.0.crate", + "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", # noqa: B950 }, ], "crates_metadata": [ { - "version": "0.1.1", - "yanked": False, - }, - { - "version": "0.1.2", + "version": "0.1.0", + "last_update": "2017-11-30 03:37:17.449539", "yanked": False, }, ], - } + }, Running tests ------------- @@ -128,8 +120,8 @@ .. _Cargo: https://doc.rust-lang.org/cargo/guide/why-cargo-exists.html#enter-cargo .. _Cargo.toml: https://doc.rust-lang.org/cargo/reference/manifest.html .. _different strategy: https://crates.io/data-access -.. _Dulwich: https://www.dulwich.io/ .. _yanked: https://doc.rust-lang.org/cargo/reference/publishing.html#cargo-yank +.. 
_db-dump.tar.gz: https://static.crates.io/db-dump.tar.gz """ diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -2,19 +2,20 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from dataclasses import asdict, dataclass -import datetime -import io + +import csv +from dataclasses import dataclass +from datetime import datetime import json import logging from pathlib import Path -import shutil +import tarfile +import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse -from dulwich import porcelain -from dulwich.patch import write_tree_diff -from dulwich.repo import Repo +import iso8601 +from packaging.version import parse as parse_version from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -30,36 +31,36 @@ @dataclass class CratesListerState: """Store lister state for incremental mode operations. - 'last_commit' represents a git commit hash + 'index_last_update' represents the UTC time the crates.io database dump was + started """ - last_commit: str = "" + index_last_update: Optional[datetime] = None class CratesLister(Lister[CratesListerState, CratesListerPage]): """List origins from the "crates.io" forge. - It basically fetches https://github.com/rust-lang/crates.io-index.git to a - temp directory and then walks through each file to get the crate's info on - the first run. + It downloads a tar.gz archive which contains crates.io database table content as + csv files which is automatically generated every 24 hours. + Parsing two csv files we can list all Crates.io package names and their related + versions. 
- In incremental mode, it relies on the same Git repository but instead of reading - each file of the repo, it get the differences through ``git log last_commit..HEAD``. - Resulting output string is parsed to build page entries. + In incremental mode, it checks each entry, comparing its 'last_update' value + with self.state.index_last_update """ - # Part of the lister API, that identifies this lister LISTER_NAME = "crates" - # (Optional) CVS type of the origins listed by this lister, if constant VISIT_TYPE = "crates" - INSTANCE = "crates" - INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git" - DESTINATION_PATH = Path("/tmp/crates.io-index") + + BASE_URL = "https://crates.io" + DB_DUMP_URL = "https://static.crates.io/db-dump.tar.gz" + CRATE_FILE_URL_PATTERN = ( "https://static.crates.io/crates/{crate}/{crate}-{version}.crate" ) - CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}" + CRATE_URL_PATTERN = "https://crates.io/crates/{crate}" def __init__( self, @@ -69,172 +70,172 @@ super().__init__( scheduler=scheduler, credentials=credentials, - url=self.INDEX_REPOSITORY_URL, + url=self.BASE_URL, instance=self.INSTANCE, ) + self.index_metadata: Dict[str, str] = {} def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: - if "last_commit" not in d: - d["last_commit"] = "" + index_last_update = d.get("index_last_update") + if index_last_update is not None: + d["index_last_update"] = iso8601.parse_date(index_last_update) return CratesListerState(**d) def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: - return asdict(state) - - def get_index_repository(self) -> None: - """Get crates.io-index repository up to date running git command.""" - if self.DESTINATION_PATH.exists(): - porcelain.pull( - self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL - ) - else: - porcelain.clone( - source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH - ) - - def get_crates_index(self) -> List[Path]: - 
"""Build a sorted list of file paths excluding dotted directories and - dotted files. - - Each file path corresponds to a crate that lists all available - versions. + d: Dict[str, Optional[str]] = {"index_last_update": None} + index_last_update = state.index_last_update + if index_last_update is not None: + d["index_last_update"] = index_last_update.isoformat() + return d + + def is_new(self, dt_str: str) -> bool: + """Returns True when dt_str is greater than + self.state.index_last_update """ - crates_index = sorted( - path - for path in self.DESTINATION_PATH.rglob("*") - if not any(part.startswith(".") for part in path.parts) - and path.is_file() - and path != self.DESTINATION_PATH / "config.json" - ) - - return crates_index - - def get_last_commit_hash(self, repository_path: Path) -> str: - """Returns the last commit hash of a git repository""" - assert repository_path.exists() + dt = iso8601.parse_date(dt_str) + last = self.state.index_last_update + return last is None or last < dt - repo = Repo(str(repository_path)) - head = repo.head() - last_commit = repo[head] + def get_and_parse_db_dump(self) -> Dict[str, Any]: + """Download the crates.io db dump archive and parse its csv files. - return last_commit.id.decode() - - def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: - """Given a file path within a Git repository, returns its last commit - date as iso8601 + Returns a dict where each entry corresponds to a package name with its related versions. 
""" - repo = Repo(str(self.DESTINATION_PATH)) - # compute relative path otherwise it fails - relative_path = filepath.relative_to(self.DESTINATION_PATH) - walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1) - try: - commit = next(iter(walker)).commit - except StopIteration: - logger.error( - "Can not find %s related commits in repository %s", relative_path, repo - ) - return None - else: - last_update = datetime.datetime.fromtimestamp( - commit.author_time, datetime.timezone.utc - ) - return last_update + + with tempfile.TemporaryDirectory() as tmpdir: + + file_name = self.DB_DUMP_URL.split("/")[-1] + archive_path = Path(tmpdir) / file_name + + # Download the Db dump + with self.http_request(self.DB_DUMP_URL, stream=True) as res: + with open(archive_path, "wb") as out_file: + for chunk in res.iter_content(chunk_size=1024): + out_file.write(chunk) + + # Extract the Db dump + db_dump_path = Path(str(archive_path).split(".tar.gz")[0]) + # The context manager closes the archive even if extraction fails + with tarfile.open(archive_path) as tar: + tar.extractall(path=db_dump_path) + + csv.field_size_limit(1000000) + + (crates_csv_path,) = list(db_dump_path.glob("*/data/crates.csv")) + (versions_csv_path,) = list(db_dump_path.glob("*/data/versions.csv")) + (index_metadata_json_path,) = list(db_dump_path.rglob("*metadata.json")) + + with index_metadata_json_path.open("rb") as index_metadata_json: + self.index_metadata = json.load(index_metadata_json) + + crates: Dict[str, Any] = {} + with crates_csv_path.open(encoding="utf-8", newline="") as crates_fd: + crates_csv = csv.DictReader(crates_fd) + for item in crates_csv: + if self.is_new(item["updated_at"]): + # crate 'id' as key + crates[item["id"]] = { + "name": item["name"], + "updated_at": item["updated_at"], + "versions": {}, + } + + data: Dict[str, Any] = {} + with versions_csv_path.open(encoding="utf-8", newline="") as versions_fd: + versions_csv = csv.DictReader(versions_fd) + for version in versions_csv: + if version["crate_id"] in crates: + crate: Dict[str, Any] = crates[version["crate_id"]] + 
crate["versions"][version["num"]] = version + # crate 'name' as key + data[crate["name"]] = crate + return data def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: """Transform package version definition dict to a suitable page entry dict """ + crate_file = self.CRATE_FILE_URL_PATTERN.format( + crate=entry["name"], version=entry["version"] + ) + filename = urlparse(crate_file).path.split("/")[-1] return dict( name=entry["name"], - version=entry["vers"], - checksum=entry["cksum"], - yanked=entry["yanked"], - crate_file=self.CRATE_FILE_URL_PATTERN.format( - crate=entry["name"], version=entry["vers"] - ), + version=entry["version"], + checksum=entry["checksum"], + yanked=entry["yanked"] == "t", + crate_file=crate_file, + filename=filename, + last_update=entry["updated_at"], ) def get_pages(self) -> Iterator[CratesListerPage]: - """Yield an iterator sorted by name in ascending order of pages. - - Each page is a list of crate versions with: - - name: Name of the crate - - version: Version - - checksum: Checksum - - crate_file: Url of the crate file - - last_update: Date of the last commit of the corresponding index - file + """Each page is a list of crate versions with: + - name: Name of the crate + - version: Version + - checksum: Checksum + - yanked: Whether the package is yanked or not + - crate_file: Url of the crate file + - filename: File name of the crate file + - last_update: Last update for that version """ - # Fetch crates.io index repository - self.get_index_repository() - if not self.state.last_commit: - # First discovery - # List all crates files from the index repository - crates_index = self.get_crates_index() - else: - # Incremental case - # Get new package version by parsing a range of commits from index repository - repo = Repo(str(self.DESTINATION_PATH)) - head = repo[repo.head()] - last = repo[self.state.last_commit.encode()] - - outstream = io.BytesIO() - write_tree_diff(outstream, repo.object_store, last.tree, 
head.tree) - raw_diff = outstream.getvalue() - crates_index = [] - for line in raw_diff.splitlines(): - if line.startswith(b"+++ b/"): - filepath = line.split(b"+++ b/", 1)[1] - crates_index.append(self.DESTINATION_PATH / filepath.decode()) - crates_index = sorted(crates_index) - - logger.debug("Found %s crates in crates_index", len(crates_index)) - - # Each line of a crate file is a json entry describing released versions - # for a package - for crate in crates_index: + + # Fetch crates.io Db dump, then Parse the data. + dataset = self.get_and_parse_db_dump() + + logger.debug("Found %s crates in crates_index", len(dataset)) + + # Each entry from dataset will correspond to a page + for name, item in dataset.items(): page = [] - last_update = self.get_last_update_by_file(crate) - - with crate.open("rb") as current_file: - for line in current_file: - data = json.loads(line) - entry = self.page_entry_dict(data) - entry["last_update"] = last_update - page.append(entry) + # sort crate versions + versions: List[str] = sorted(item["versions"], key=parse_version) + + for version in versions: + v = item["versions"][version] + v["name"] = name + v["version"] = version + page.append(self.page_entry_dict(v)) + yield page def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: """Iterate on all crate pages and yield ListedOrigin instances.""" - assert self.lister_obj.id is not None - url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) + url = self.CRATE_URL_PATTERN.format(crate=page[0]["name"]) last_update = page[0]["last_update"] + artifacts = [] crates_metadata = [] - for version in page: - filename = urlparse(version["crate_file"]).path.split("/")[-1] + for entry in page: # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 - artifact = { - "filename": f"{filename}", - "checksums": { - 
"sha256": f"{version['checksum']}", - }, - "url": version["crate_file"], - "version": version["version"], - } - artifacts.append(artifact) - data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} - crates_metadata.append(data) + artifacts.append( + { + "version": entry["version"], + "filename": entry["filename"], + "url": entry["crate_file"], + "checksums": { + "sha256": entry["checksum"], + }, + } + ) + + crates_metadata.append( + { + "version": entry["version"], + "yanked": entry["yanked"], + "last_update": entry["last_update"], + } + ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, - last_update=last_update, + last_update=iso8601.parse_date(last_update), extra_loader_arguments={ "artifacts": artifacts, "crates_metadata": crates_metadata, @@ -242,18 +243,8 @@ ) def finalize(self) -> None: - last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) - if self.state.last_commit == last: - self.updated = False - else: - self.state.last_commit = last - self.updated = True + last: datetime = iso8601.parse_date(self.index_metadata["timestamp"]) - logger.debug("Listing crates origin completed with last commit id %s", last) - - # Cleanup by removing the repository directory - if self.DESTINATION_PATH.exists(): - shutil.rmtree(self.DESTINATION_PATH) - logger.debug( - "Successfully removed %s directory", str(self.DESTINATION_PATH) - ) + if not self.state.index_last_update or self.state.index_last_update < last: + self.state.index_last_update = last + self.updated = True diff --git a/swh/lister/crates/tests/__init__.py b/swh/lister/crates/tests/__init__.py --- a/swh/lister/crates/tests/__init__.py +++ b/swh/lister/crates/tests/__init__.py @@ -1,29 +1,3 @@ # Copyright (C) 2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - -import os -from pathlib import PosixPath -import subprocess -from typing import Optional, Union - - -def 
prepare_repository_from_archive( - archive_path: str, - filename: Optional[str] = None, - tmp_path: Union[PosixPath, str] = "/tmp", -) -> str: - """Given an existing archive_path, uncompress it. - Returns a file repo url which can be used as origin url. - - This does not deal with the case where the archive passed along does not exist. - - """ - if not isinstance(tmp_path, str): - tmp_path = str(tmp_path) - # uncompress folder/repositories/dump for the loader to ingest - subprocess.check_output(["tar", "xf", archive_path, "-C", tmp_path]) - # build the origin url (or some derivative form) - _fname = filename if filename else os.path.basename(archive_path) - repo_url = f"file://{tmp_path}/{_fname}" - return repo_url diff --git a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz deleted file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ data/crates.csv -touch .dot-file -touch config.json +echo -e '''checksum,crate_id,crate_size,created_at,downloads,features,id,license,links,num,published_by,updated_at,yanked +398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944,2233,,2015-05-27 23:19:16.848643,1961,{},10855,MIT/Apache-2.0,,0.1.0,,2017-11-30 03:37:17.449539,f +343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9,545,,2014-12-18 06:56:46.88489,845,{},1321,MIT/Apache-2.0,,0.1.2,,2017-11-30 02:29:20.01125,f +6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7,1339,,2015-02-03 11:15:19.001762,8211,{},4371,MIT/Apache-2.0,,0.1.2,,2017-11-30 03:14:27.545115,f +defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3,545,,2014-12-19 16:16:41.73772,1498,{},1363,MIT/Apache-2.0,,0.1.3,,2017-11-30 02:26:59.236947,f +48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d,1339,,2015-02-03 06:17:14.169972,7963,{},4362,MIT/Apache-2.0,,0.1.1,,2017-11-30 03:33:14.186028,f 
+f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5,545,,2014-12-13 22:10:11.329494,3204,{},1100,MIT/Apache-2.0,,0.1.0,,2017-11-30 02:51:27.240551,f +a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36,545,,2014-12-15 20:31:48.571836,889,{},1178,MIT/Apache-2.0,,0.1.1,,2017-11-30 03:03:20.143103,f +''' > data/versions.csv -# Init as a git repository -git init -git add . -git commit -m "Init fake crates.io-index repository for tests purpose" +echo -e '''{ + "timestamp": "2022-08-08T02:00:27.645191645Z", + "crates_io_commit": "3e5f0b4d2a382ac0951898fd257f693734eadee2" +} +''' > metadata.json -echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand -git add . -git commit -m " Updating crate rand#0.1.1" +cd ../../ +tar -czf db-dump.tar.gz -C crates.io-db-dump . -echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand -git add . -git commit -m " Updating crate rand#0.1.2" +# A second db dump with a new entry and a different timestamp -echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex -git add . 
-git commit -m " Updating crate regex#0.1.0" +mkdir -p crates.io-db-dump_visit1 +cp -rf crates.io-db-dump/2022-08-08-020027 crates.io-db-dump_visit1/2022-09-05-020027 -echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex -git add . -git commit -m " Updating crate regex#0.1.1" +cd crates.io-db-dump_visit1/2022-09-05-020027/ -echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex -git add . -git commit -m " Updating crate regex#0.1.2" +echo -e '''{ + "timestamp": "2022-09-05T02:00:27.687167108Z", + "crates_io_commit": "d3652ad81bd8bd837f2d2442ee08484ee5d4bac3" +} +''' > metadata.json -echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex -git add . 
-git commit -m " Updating crate regex#0.1.3" +echo -e '''2019-01-08 15:11:01.560092,"A crate for safe and ergonomic pin-projection.",,48353738,,107436,,pin-project,,https://github.com/taiki-e/pin-project,2022-08-15 13:52:11.642129 +''' >> data/crates.csv -echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax -git add . -git commit -m " Updating crate regex-syntax#0.1.0" +echo -e '''ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc,107436,56972,2022-08-15 13:52:11.642129,580330,{},602929,Apache-2.0 OR MIT,,1.0.12,33035,2022-08-15 13:52:11.642129,f +''' >> data/versions.csv -# Save some space -rm .git/hooks/*.sample +cd ../../ -# Compress git directory as a tar.gz archive -cd ../ -tar -cvzf fake-crates-repository.tar.gz crates.io-index -mv fake-crates-repository.tar.gz ../ +tar -czf db-dump.tar.gz_visit1 -C crates.io-db-dump_visit1 . + +# Move the generated tar.gz archives to a servable directory +mv db-dump.tar.gz ../https_static.crates.io/ +mv db-dump.tar.gz_visit1 ../https_static.crates.io/ # Clean up tmp_dir cd ../ diff --git a/swh/lister/crates/tests/data/https_static.crates.io/db-dump.tar.gz b/swh/lister/crates/tests/data/https_static.crates.io/db-dump.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@