Changeset View
Standalone View
swh/lister/crates/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||||
anlambert: Please add a new line between license header and imports. | |||||||||||||
from dataclasses import asdict, dataclass | import csv | ||||||||||||
import datetime | from dataclasses import dataclass | ||||||||||||
import io | from datetime import datetime | ||||||||||||
import json | import json | ||||||||||||
import logging | import logging | ||||||||||||
from pathlib import Path | from pathlib import Path | ||||||||||||
import shutil | import shutil | ||||||||||||
import tarfile | |||||||||||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||||||||||
from urllib.parse import urlparse | from urllib.parse import urlparse | ||||||||||||
from dulwich import porcelain | import iso8601 | ||||||||||||
from dulwich.patch import write_tree_diff | from packaging.version import parse as parse_version | ||||||||||||
from dulwich.repo import Repo | import requests | ||||||||||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||||||||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||||||||||
from ..pattern import CredentialsType, Lister | from ..pattern import CredentialsType, Lister | ||||||||||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||||||||
# Aliasing the page results returned by `get_pages` method from the lister. | # Aliasing the page results returned by `get_pages` method from the lister. | ||||||||||||
CratesListerPage = List[Dict[str, Any]] | CratesListerPage = List[Dict[str, Any]] | ||||||||||||
@dataclass | @dataclass | ||||||||||||
class CratesListerState: | class CratesListerState: | ||||||||||||
"""Store lister state for incremental mode operations. | """Store lister state for incremental mode operations. | ||||||||||||
'last_commit' represents a git commit hash | 'index_last_update' represents the UTC time the crates.io database dump was | ||||||||||||
started | |||||||||||||
""" | """ | ||||||||||||
last_commit: str = "" | index_last_update: Optional[datetime] = None | ||||||||||||
class CratesLister(Lister[CratesListerState, CratesListerPage]): | class CratesLister(Lister[CratesListerState, CratesListerPage]): | ||||||||||||
"""List origins from the "crates.io" forge. | """List origins from the "crates.io" forge. | ||||||||||||
It basically fetches https://github.com/rust-lang/crates.io-index.git to a | It downloads a tar.gz archive which contains crates.io database table content as | ||||||||||||
temp directory and then walks through each file to get the crate's info on | csv files which is automatically generated every 24 hours. | ||||||||||||
the first run. | Parsing two csv files we can list all Crates.io package names and their related | ||||||||||||
versions. | |||||||||||||
In incremental mode, it relies on the same Git repository but instead of reading | |||||||||||||
each file of the repo, it gets the differences through ``git log last_commit..HEAD``. | In incremental mode, it checks each entry comparing their 'last_update' value | ||||||||||||
Resulting output string is parsed to build page entries. | with self.state.index_last_update | ||||||||||||
""" | """ | ||||||||||||
# Part of the lister API, that identifies this lister | |||||||||||||
LISTER_NAME = "crates" | LISTER_NAME = "crates" | ||||||||||||
# (Optional) VCS type of the origins listed by this lister, if constant | |||||||||||||
VISIT_TYPE = "crates" | VISIT_TYPE = "crates" | ||||||||||||
INSTANCE = "crates" | INSTANCE = "crates" | ||||||||||||
INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git" | |||||||||||||
DESTINATION_PATH = Path("/tmp/crates.io-index") | BASE_URL = "https://crates.io" | ||||||||||||
DB_DUMP_URL = "https://static.crates.io/db-dump.tar.gz" | |||||||||||||
Done Inline ActionsYou can remove that variable now. anlambert: You can remove that variable now. | |||||||||||||
DESTINATION_PATH_DB_DUMP = Path("/tmp/crates.io-db_dump") | |||||||||||||
CRATE_FILE_URL_PATTERN = ( | CRATE_FILE_URL_PATTERN = ( | ||||||||||||
"https://static.crates.io/crates/{crate}/{crate}-{version}.crate" | "https://static.crates.io/crates/{crate}/{crate}-{version}.crate" | ||||||||||||
) | ) | ||||||||||||
CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}" | CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}" | ||||||||||||
anlambertUnsubmitted Done Inline ActionsWe should use the HTML page of a crate as origin URL: CRATE_ORIGIN_URL_PATTERN = "https://crates.io/crates/{crate}" anlambert: We should use the HTML page of a crate as origin URL:
```lang=python
CRATE_ORIGIN_URL_PATTERN =… | |||||||||||||
def __init__( | def __init__( | ||||||||||||
self, | self, | ||||||||||||
scheduler: SchedulerInterface, | scheduler: SchedulerInterface, | ||||||||||||
credentials: CredentialsType = None, | credentials: CredentialsType = None, | ||||||||||||
): | ): | ||||||||||||
super().__init__( | super().__init__( | ||||||||||||
scheduler=scheduler, | scheduler=scheduler, | ||||||||||||
credentials=credentials, | credentials=credentials, | ||||||||||||
url=self.INDEX_REPOSITORY_URL, | url=self.BASE_URL, | ||||||||||||
instance=self.INSTANCE, | instance=self.INSTANCE, | ||||||||||||
) | ) | ||||||||||||
self.session = requests.Session() | |||||||||||||
anlambertUnsubmitted Done Inline Actionsto remove anlambert: to remove | |||||||||||||
self.index_metadata: Dict[str, str] = {} | |||||||||||||
def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: | def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: | ||||||||||||
if "last_commit" not in d: | index_last_update = d.get("index_last_update") | ||||||||||||
d["last_commit"] = "" | if index_last_update is not None: | ||||||||||||
d["index_last_update"] = iso8601.parse_date(index_last_update) | |||||||||||||
return CratesListerState(**d) | return CratesListerState(**d) | ||||||||||||
def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: | def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: | ||||||||||||
return asdict(state) | d: Dict[str, Optional[str]] = {"index_last_update": None} | ||||||||||||
index_last_update = state.index_last_update | |||||||||||||
if index_last_update is not None: | |||||||||||||
d["index_last_update"] = index_last_update.isoformat() | |||||||||||||
return d | |||||||||||||
def is_new(self, dt_str: str): | |||||||||||||
"""Returns True when dt_str is greater than | |||||||||||||
self.state.index_last_update | |||||||||||||
""" | |||||||||||||
dt = iso8601.parse_date(dt_str) | |||||||||||||
last = self.state.index_last_update | |||||||||||||
def get_index_repository(self) -> None: | if not last or (last is not None and last < dt): | ||||||||||||
"""Get crates.io-index repository up to date running git command.""" | return True | ||||||||||||
if self.DESTINATION_PATH.exists(): | |||||||||||||
porcelain.pull( | |||||||||||||
self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL | |||||||||||||
) | |||||||||||||
else: | else: | ||||||||||||
porcelain.clone( | return False | ||||||||||||
anlambertUnsubmitted Done Inline Actionsreturn not last or (last is not None and last < dt) anlambert: ```lang=python
return not last or (last is not None and last < dt)
``` | |||||||||||||
source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH | |||||||||||||
) | |||||||||||||
def get_crates_index(self) -> List[Path]: | def get_db_dump(self) -> Path: | ||||||||||||
"""Build a sorted list of file paths excluding dotted directories and | """Download a tar.gz archive that is a Db dump of crates.io | ||||||||||||
dotted files. | |||||||||||||
Each file path corresponds to a crate that lists all available | Returns the path where the archive has been extracted to. | ||||||||||||
versions. | |||||||||||||
""" | """ | ||||||||||||
crates_index = sorted( | destination_path = self.DESTINATION_PATH_DB_DUMP | ||||||||||||
path | destination_path.mkdir(parents=True, exist_ok=True) | ||||||||||||
for path in self.DESTINATION_PATH.rglob("*") | file_name = self.DB_DUMP_URL.split("/")[-1] | ||||||||||||
if not any(part.startswith(".") for part in path.parts) | archive_path = destination_path / file_name | ||||||||||||
and path.is_file() | |||||||||||||
and path != self.DESTINATION_PATH / "config.json" | # Download the Db dump | ||||||||||||
) | with self.session.get(url=self.DB_DUMP_URL, stream=True) as res: | ||||||||||||
res.raise_for_status() | |||||||||||||
with open(archive_path, "wb") as out_file: | |||||||||||||
for chunk in res.iter_content(chunk_size=1024): | |||||||||||||
out_file.write(chunk) | |||||||||||||
anlambertUnsubmitted Done Inline ActionsUse this instead: with self.http_request(self.DB_DUMP_URL, stream=True) as res: with open(archive_path, "wb") as out_file: for chunk in res.iter_content(chunk_size=1024): out_file.write(chunk) anlambert: Use this instead:
```lang=python
with self.http_request(self.DB_DUMP_URL, stream=True) as res… | |||||||||||||
Done Inline Actionsyou should actually stream the bytes; this causes a full copy to be allocated in memory before writing vlorentz: you should actually stream the bytes; this causes a full copy to be allocated in memory before… | |||||||||||||
# Extract the Db dump | |||||||||||||
extract_to = Path(str(archive_path).split(".tar.gz")[0]) | |||||||||||||
Done Inline Actions
vlorentz: | |||||||||||||
Done Inline Actionshere i want to extract to this path: PosixPath('/tmp/crates.io-db_dump/db-dump') archive_path.stem will return "db-dump.tar" franckbret: here i want to extract to this path:
PosixPath('/tmp/crates.io-db_dump/db-dump')
archive_path. | |||||||||||||
Done Inline ActionsThen extract_to = archive_path.with_suffix("") should do it. Forget this comment if it doesn't either vlorentz: Then `extract_to = archive_path.with_suffix("")` should do it. Forget this comment if it… | |||||||||||||
tar = tarfile.open(archive_path) | |||||||||||||
tar.extractall(path=extract_to) | |||||||||||||
tar.close() | |||||||||||||
return crates_index | return extract_to | ||||||||||||
def get_last_commit_hash(self, repository_path: Path) -> str: | def parse_db_dump(self, db_dump_path: Path) -> Dict[str, Any]: | ||||||||||||
"""Returns the last commit hash of a git repository""" | """Parse csv files from db_dump_path. | ||||||||||||
assert repository_path.exists() | |||||||||||||
repo = Repo(str(repository_path)) | Returns a dict where each entry corresponds to a package name with its related versions. | ||||||||||||
head = repo.head() | """ | ||||||||||||
last_commit = repo[head] | csv.field_size_limit(1000000) | ||||||||||||
return last_commit.id.decode() | crates_csv_path = list(db_dump_path.rglob("*crates.csv"))[0] | ||||||||||||
versions_csv_path = list(db_dump_path.rglob("*versions.csv"))[0] | |||||||||||||
Done Inline Actions
doesn't need to be recursive + doesn't hurt to assert there is only one file matching each pattern. (if there are more than one, it's a bug and should be addressed) vlorentz: doesn't need to be recursive + doesn't hurt to assert there is only one file matching each… | |||||||||||||
Done Inline Actions(oops, I didn't mean to write next( instead of list() vlorentz: (oops, I didn't mean to write `next(` instead of `list(`) | |||||||||||||
Done Inline ActionsI used rglob because the top directory of the tar.gz extracted archive is date based so it is different each time we download a new archive. ipdb> tar.getmembers() [<TarInfo '.' at 0x7f144f378e58>, <TarInfo './2022-08-08-020027' at 0x7f144d379f20>, <TarInfo './2022-08-08-020027/data' at 0x7f1446695048>, <TarInfo './2022-08-08-020027/data/crates.csv' at 0x7f14466952a0>, <TarInfo './2022-08-08-020027/data/versions.csv' at 0x7f1446695368>] Should have added a comment about that. franckbret: I used rglob because the top directory of the tar.gz extracted archive is date based so it is… | |||||||||||||
Done Inline ActionsMy bad, I misunderstood rglob. Anyway, you can use this: (crates_csv_path,) = list(db_dump_path.glob("*/data/crates.csv")) (versions_csv_path,) = next(db_dump_path.glob("*/data/versions.csv")) vlorentz: My bad, I misunderstood rglob. Anyway, you can use this:
```
(crates_csv_path,) = list… | |||||||||||||
index_metadata_json_path = list(db_dump_path.rglob("*metadata.json"))[0] | |||||||||||||
with index_metadata_json_path.open("rb") as index_metadata_json: | |||||||||||||
Done Inline ActionsUse with crates_csv_path.open() as fd etc. so we don't rely on CPython-specific behavior to avoid leaking FDs. (Not a big deal since we currently use only CPython, I just want to be safe) vlorentz: Use `with crates_csv_path.open() as fd` etc. so we don't rely on CPython-specific behavior to… | |||||||||||||
self.index_metadata = json.load(index_metadata_json) | |||||||||||||
crates: Dict[str, Any] = {} | |||||||||||||
with crates_csv_path.open() as crates_fd: | |||||||||||||
crates_csv = csv.DictReader(crates_fd) | |||||||||||||
for item in crates_csv: | |||||||||||||
if self.is_new(item["updated_at"]): | |||||||||||||
# crate 'id' as key | |||||||||||||
crates[item["id"]] = { | |||||||||||||
"name": item["name"], | |||||||||||||
"updated_at": item["updated_at"], | |||||||||||||
"versions": {}, | |||||||||||||
} | |||||||||||||
def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: | data: Dict[str, Any] = {} | ||||||||||||
"""Given a file path within a Git repository, returns its last commit | with versions_csv_path.open() as versions_fd: | ||||||||||||
date as iso8601 | versions_csv = csv.DictReader(versions_fd) | ||||||||||||
""" | for version in versions_csv: | ||||||||||||
repo = Repo(str(self.DESTINATION_PATH)) | if version["crate_id"] in crates.keys(): | ||||||||||||
# compute relative path otherwise it fails | crate: Dict[str, Any] = crates[version["crate_id"]] | ||||||||||||
relative_path = filepath.relative_to(self.DESTINATION_PATH) | crate["versions"][version["num"]] = version | ||||||||||||
walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1) | # crate 'name' as key | ||||||||||||
try: | data[crate["name"]] = crate | ||||||||||||
commit = next(iter(walker)).commit | return data | ||||||||||||
except StopIteration: | |||||||||||||
logger.error( | |||||||||||||
"Can not find %s related commits in repository %s", relative_path, repo | |||||||||||||
) | |||||||||||||
return None | |||||||||||||
else: | |||||||||||||
last_update = datetime.datetime.fromtimestamp( | |||||||||||||
commit.author_time, datetime.timezone.utc | |||||||||||||
) | |||||||||||||
return last_update | |||||||||||||
def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: | def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: | ||||||||||||
"""Transform package version definition dict to a suitable | """Transform package version definition dict to a suitable | ||||||||||||
page entry dict | page entry dict | ||||||||||||
""" | """ | ||||||||||||
crate_file = self.CRATE_FILE_URL_PATTERN.format( | |||||||||||||
crate=entry["name"], version=entry["version"] | |||||||||||||
) | |||||||||||||
filename = urlparse(crate_file).path.split("/")[-1] | |||||||||||||
return dict( | return dict( | ||||||||||||
name=entry["name"], | name=entry["name"], | ||||||||||||
version=entry["vers"], | version=entry["version"], | ||||||||||||
checksum=entry["cksum"], | checksum=entry["checksum"], | ||||||||||||
yanked=entry["yanked"], | yanked=True if entry["yanked"] == "t" else False, | ||||||||||||
crate_file=self.CRATE_FILE_URL_PATTERN.format( | crate_file=crate_file, | ||||||||||||
crate=entry["name"], version=entry["vers"] | filename=filename, | ||||||||||||
), | last_update=entry["updated_at"], | ||||||||||||
) | ) | ||||||||||||
def get_pages(self) -> Iterator[CratesListerPage]: | def get_pages(self) -> Iterator[CratesListerPage]: | ||||||||||||
"""Yield an iterator sorted by name in ascending order of pages. | """Each page is a list of crate versions with: | ||||||||||||
Each page is a list of crate versions with: | |||||||||||||
- name: Name of the crate | - name: Name of the crate | ||||||||||||
- version: Version | - version: Version | ||||||||||||
- checksum: Checksum | - checksum: Checksum | ||||||||||||
- yanked: Whether the package is yanked or not | |||||||||||||
- crate_file: Url of the crate file | - crate_file: Url of the crate file | ||||||||||||
- last_update: Date of the last commit of the corresponding index | - filename: File name of the crate file | ||||||||||||
file | - last_update: Last update for that version | ||||||||||||
""" | """ | ||||||||||||
# Fetch crates.io index repository | # Fetch crates.io Db dump | ||||||||||||
self.get_index_repository() | db_dump_path = self.get_db_dump() | ||||||||||||
if not self.state.last_commit: | |||||||||||||
# First discovery | # Parse the data, each entry from dataset will correspond to a page | ||||||||||||
# List all crates files from the index repository | dataset = self.parse_db_dump(db_dump_path) | ||||||||||||
crates_index = self.get_crates_index() | |||||||||||||
else: | logger.debug("Found %s crates in crates_index", len(dataset)) | ||||||||||||
# Incremental case | |||||||||||||
# Get new package version by parsing a range of commits from index repository | for name, item in dataset.items(): | ||||||||||||
repo = Repo(str(self.DESTINATION_PATH)) | |||||||||||||
head = repo[repo.head()] | |||||||||||||
last = repo[self.state.last_commit.encode()] | |||||||||||||
outstream = io.BytesIO() | |||||||||||||
write_tree_diff(outstream, repo.object_store, last.tree, head.tree) | |||||||||||||
raw_diff = outstream.getvalue() | |||||||||||||
crates_index = [] | |||||||||||||
for line in raw_diff.splitlines(): | |||||||||||||
if line.startswith(b"+++ b/"): | |||||||||||||
filepath = line.split(b"+++ b/", 1)[1] | |||||||||||||
crates_index.append(self.DESTINATION_PATH / filepath.decode()) | |||||||||||||
crates_index = sorted(crates_index) | |||||||||||||
logger.debug("Found %s crates in crates_index", len(crates_index)) | |||||||||||||
# Each line of a crate file is a json entry describing released versions | |||||||||||||
# for a package | |||||||||||||
for crate in crates_index: | |||||||||||||
page = [] | page = [] | ||||||||||||
last_update = self.get_last_update_by_file(crate) | # sort crate versions | ||||||||||||
versions: list = sorted(item["versions"].keys(), key=parse_version) | |||||||||||||
for version in versions: | |||||||||||||
v = item["versions"][version] | |||||||||||||
v["name"] = name | |||||||||||||
v["version"] = version | |||||||||||||
page.append(self.page_entry_dict(v)) | |||||||||||||
with crate.open("rb") as current_file: | |||||||||||||
for line in current_file: | |||||||||||||
data = json.loads(line) | |||||||||||||
entry = self.page_entry_dict(data) | |||||||||||||
entry["last_update"] = last_update | |||||||||||||
page.append(entry) | |||||||||||||
yield page | yield page | ||||||||||||
def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: | ||||||||||||
"""Iterate on all crate pages and yield ListedOrigin instances.""" | """Iterate on all crate pages and yield ListedOrigin instances.""" | ||||||||||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||||||||||
url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) | url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) | ||||||||||||
anlambertUnsubmitted Done Inline Actionsurl = self.CRATE_ORIGIN_URL_PATTERN.format(crate=page[0]["name"]) anlambert: ```lang=python
url = self.CRATE_ORIGIN_URL_PATTERN.format(crate=page[0]["name"])
``` | |||||||||||||
last_update = page[0]["last_update"] | last_update = page[0]["last_update"] | ||||||||||||
artifacts = [] | artifacts = [] | ||||||||||||
crates_metadata = [] | crates_metadata = [] | ||||||||||||
anlambertUnsubmitted Done Inline ActionsUse dicts instead of lists here in order to simplify crates loader processing. artifacts = {} crates_metadata = {} anlambert: Use dicts instead of lists here in order to simplify crates loader processing.
```lang=python… | |||||||||||||
anlambertUnsubmitted Done Inline ActionsIgnore this comment, I was not aware that we should use this format anlambert: Ignore this comment, I was not aware that we should use this [format](https://docs. | |||||||||||||
franckbretAuthorUnsubmitted Done Inline Actionsswitched to lists franckbret: switched to lists | |||||||||||||
for version in page: | for entry in page: | ||||||||||||
filename = urlparse(version["crate_file"]).path.split("/")[-1] | |||||||||||||
# Build an artifact entry following original-artifacts-json specification | # Build an artifact entry following original-artifacts-json specification | ||||||||||||
# https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 | # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 | ||||||||||||
artifact = { | artifact = { | ||||||||||||
"filename": f"{filename}", | f"{entry['version']}": { | ||||||||||||
"filename": entry["filename"], | |||||||||||||
"url": entry["crate_file"], | |||||||||||||
"checksums": { | "checksums": { | ||||||||||||
"sha256": f"{version['checksum']}", | "sha256": entry["checksum"], | ||||||||||||
}, | }, | ||||||||||||
"url": version["crate_file"], | } | ||||||||||||
"version": version["version"], | |||||||||||||
} | } | ||||||||||||
artifacts.append(artifact) | artifacts.append(artifact) | ||||||||||||
data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} | data = { | ||||||||||||
f"{entry['version']}": { | |||||||||||||
"yanked": entry["yanked"], | |||||||||||||
"last_update": entry["last_update"], | |||||||||||||
} | |||||||||||||
} | |||||||||||||
crates_metadata.append(data) | crates_metadata.append(data) | ||||||||||||
anlambertUnsubmitted Done Inline Actionsartifacts[f"{entry['version']}"] = { "filename": entry["filename"], "url": entry["crate_file"], "checksums": { "sha256": entry["checksum"], }, } crates_metadata[f"{entry['version']}"] = { "yanked": entry["yanked"], "last_update": entry["last_update"], } anlambert: ```lang=python
artifacts[f"{entry['version']}"] = {
"filename": entry["filename"]… | |||||||||||||
anlambertUnsubmitted Done Inline ActionsIgnore this comment, I was not aware that we should use this format anlambert: Ignore this comment, I was not aware that we should use this [format](https://docs. | |||||||||||||
franckbretAuthorUnsubmitted Done Inline Actionsswitched to lists franckbret: switched to lists | |||||||||||||
yield ListedOrigin( | yield ListedOrigin( | ||||||||||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||||||||||
visit_type=self.VISIT_TYPE, | visit_type=self.VISIT_TYPE, | ||||||||||||
url=url, | url=url, | ||||||||||||
last_update=last_update, | last_update=iso8601.parse_date(last_update), | ||||||||||||
extra_loader_arguments={ | extra_loader_arguments={ | ||||||||||||
"artifacts": artifacts, | "artifacts": artifacts, | ||||||||||||
"crates_metadata": crates_metadata, | "crates_metadata": crates_metadata, | ||||||||||||
}, | }, | ||||||||||||
) | ) | ||||||||||||
def finalize(self) -> None: | def finalize(self) -> None: | ||||||||||||
last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) | last: datetime = iso8601.parse_date(self.index_metadata["timestamp"]) | ||||||||||||
if self.state.last_commit == last: | |||||||||||||
self.updated = False | |||||||||||||
else: | |||||||||||||
self.state.last_commit = last | |||||||||||||
self.updated = True | |||||||||||||
logger.debug("Listing crates origin completed with last commit id %s", last) | if not self.state.index_last_update: | ||||||||||||
self.state.index_last_update = last | |||||||||||||
self.updated = True | |||||||||||||
# Cleanup by removing the repository directory | # Cleanup by removing the db dump directory | ||||||||||||
if self.DESTINATION_PATH.exists(): | if self.DESTINATION_PATH_DB_DUMP.exists(): | ||||||||||||
shutil.rmtree(self.DESTINATION_PATH) | shutil.rmtree(self.DESTINATION_PATH_DB_DUMP) | ||||||||||||
logger.debug( | logger.debug( | ||||||||||||
"Successfully removed %s directory", str(self.DESTINATION_PATH) | "Successfully removed %s directory", str(self.DESTINATION_PATH_DB_DUMP) | ||||||||||||
) | ) | ||||||||||||
Done Inline ActionsThis can be removed now. anlambert: This can be removed now. |
Please add a new line between license header and imports.