Differential D8454 Diff 30588 swh/lister/crates/lister.py

Changeset View

Standalone View

swh/lister/crates/lister.py

# See the AUTHORS file at the top-level directory of this distribution # See the AUTHORS file at the top-level directory of this distribution

# License: GNU General Public License version 3, or any later version # License: GNU General Public License version 3, or any later version

# See top-level LICENSE file for more information # See top-level LICENSE file for more information

anlambertUnsubmitted

Done

Please add a new line between license header and imports.

anlambert: Please add a new line between license header and imports.

from dataclasses import asdict, dataclass import csv

import datetime from dataclasses import dataclass

import io from datetime import datetime

import json import json

import logging import logging

from pathlib import Path from pathlib import Path

import shutil import shutil

import tarfile

from typing import Any, Dict, Iterator, List, Optional from typing import Any, Dict, Iterator, List, Optional

from urllib.parse import urlparse from urllib.parse import urlparse

from dulwich import porcelain import iso8601

from dulwich.patch import write_tree_diff from packaging.version import parse as parse_version

from dulwich.repo import Repo import requests

from swh.scheduler.interface import SchedulerInterface from swh.scheduler.interface import SchedulerInterface

from swh.scheduler.model import ListedOrigin from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister. # Aliasing the page results returned by `get_pages` method from the lister.

CratesListerPage = List[Dict[str, Any]] CratesListerPage = List[Dict[str, Any]]

@dataclass @dataclass

class CratesListerState: class CratesListerState:

"""Store lister state for incremental mode operations. """Store lister state for incremental mode operations.

'last_commit' represents a git commit hash 'index_last_update' represents the UTC time the crates.io database dump was

started

""" """

last_commit: str = "" index_last_update: Optional[datetime] = None

class CratesLister(Lister[CratesListerState, CratesListerPage]): class CratesLister(Lister[CratesListerState, CratesListerPage]):

"""List origins from the "crates.io" forge. """List origins from the "crates.io" forge.

It basically fetches https://github.com/rust-lang/crates.io-index.git to a It downloads a tar.gz archive which contains crates.io database table content as

temp directory and then walks through each file to get the crate's info on csv files which is automatically generated every 24 hours.

the first run. Parsing two csv files we can list all Crates.io package names and their related

versions.

In incremental mode, it relies on the same Git repository but instead of reading

each file of the repo, it get the differences through ``git log last_commit..HEAD``. In incremental mode, it check each entry comparing their 'last_update' value

Resulting output string is parsed to build page entries. with self.state.index_last_update

""" """

# Part of the lister API, that identifies this lister

LISTER_NAME = "crates" LISTER_NAME = "crates"

# (Optional) CVS type of the origins listed by this lister, if constant

VISIT_TYPE = "crates" VISIT_TYPE = "crates"

INSTANCE = "crates" INSTANCE = "crates"

INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git"

DESTINATION_PATH = Path("/tmp/crates.io-index") BASE_URL = "https://crates.io"

DB_DUMP_URL = "https://static.crates.io/db-dump.tar.gz"

anlambertUnsubmitted

Done

You can remove that variable now.

anlambert: You can remove that variable now.

DESTINATION_PATH_DB_DUMP = Path("/tmp/crates.io-db_dump")

CRATE_FILE_URL_PATTERN = ( CRATE_FILE_URL_PATTERN = (

"https://static.crates.io/crates/{crate}/{crate}-{version}.crate" "https://static.crates.io/crates/{crate}/{crate}-{version}.crate"

) )

CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}" CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}"

anlambertUnsubmitted

Done

We should use the HTML page of a crate as origin URL:

CRATE_ORIGIN_URL_PATTERN = "https://crates.io/crates/{crate}"

anlambert: We should use the HTML page of a crate as origin URL: ```lang=python CRATE_ORIGIN_URL_PATTERN =…

def __init__( def __init__(

self, self,

scheduler: SchedulerInterface, scheduler: SchedulerInterface,

credentials: CredentialsType = None, credentials: CredentialsType = None,

): ):

super().__init__( super().__init__(

scheduler=scheduler, scheduler=scheduler,

credentials=credentials, credentials=credentials,

url=self.INDEX_REPOSITORY_URL, url=self.BASE_URL,

instance=self.INSTANCE, instance=self.INSTANCE,

) )

self.session = requests.Session()

anlambertUnsubmitted

Done

to remove

anlambert: to remove

self.index_metadata: Dict[str, str] = {}

def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState:

if "last_commit" not in d: index_last_update = d.get("index_last_update")

d["last_commit"] = "" if index_last_update is not None:

d["index_last_update"] = iso8601.parse_date(index_last_update)

return CratesListerState(**d) return CratesListerState(**d)

def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]:

return asdict(state) d: Dict[str, Optional[str]] = {"index_last_update": None}

index_last_update = state.index_last_update

if index_last_update is not None:

d["index_last_update"] = index_last_update.isoformat()

return d

def is_new(self, dt_str: str):

"""Returns True when dt_str is greater than

self.state.index_last_update

"""

dt = iso8601.parse_date(dt_str)

last = self.state.index_last_update

def get_index_repository(self) -> None: if not last or (last is not None and last < dt):

"""Get crates.io-index repository up to date running git command.""" return True

if self.DESTINATION_PATH.exists():

porcelain.pull(

self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL

)

else: else:

porcelain.clone( return False

anlambertUnsubmitted

Done

return not last or (last is not None and last < dt)

anlambert: ```lang=python return not last or (last is not None and last < dt) ```

source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH

)

def get_crates_index(self) -> List[Path]: def get_db_dump(self) -> Path:

"""Build a sorted list of file paths excluding dotted directories and """Download a tar.gz archive that is a Db dump of crates.io

dotted files.

Each file path corresponds to a crate that lists all available Returns the path where the archive has been extracted to.

versions.

""" """

crates_index = sorted( destination_path = self.DESTINATION_PATH_DB_DUMP

path destination_path.mkdir(parents=True, exist_ok=True)

for path in self.DESTINATION_PATH.rglob("*") file_name = self.DB_DUMP_URL.split("/")[-1]

if not any(part.startswith(".") for part in path.parts) archive_path = destination_path / file_name

and path.is_file()

and path != self.DESTINATION_PATH / "config.json" # Download the Db dump

) with self.session.get(url=self.DB_DUMP_URL, stream=True) as res:

res.raise_for_status()

with open(archive_path, "wb") as out_file:

for chunk in res.iter_content(chunk_size=1024):

out_file.write(chunk)

anlambertUnsubmitted

Done

Use this instead:

with self.http_request(self.DB_DUMP_URL, stream=True) as res:
    with open(archive_path, "wb") as out_file:
        for chunk in res.iter_content(chunk_size=1024):
            out_file.write(chunk)

anlambert: Use this instead: ```lang=python with self.http_request(self.DB_DUMP_URL, stream=True) as res…

vlorentzUnsubmitted

Done

You should actually stream the bytes; this causes a full copy to be allocated in memory before writing.

vlorentz: you should actually stream the bytes; this causes a full copy to be allocated in memory before…

# Extract the Db dump

extract_to = Path(str(archive_path).split(".tar.gz")[0])

vlorentzUnsubmitted

Done

# Extract the Db dump

- extract_to = Path(str(archive_path).split(".tar.gz")[0])

+ extract_to = archive_path.stem

tar = tarfile.open(archive_path)

vlorentz:

franckbretAuthorUnsubmitted

Done

Here I want to extract to this path:

PosixPath('/tmp/crates.io-db_dump/db-dump')

archive_path.stem will return "db-dump.tar"

franckbret: here i want to extract to this path: PosixPath('/tmp/crates.io-db_dump/db-dump') archive_path.

vlorentzUnsubmitted

Done

Then `extract_to = archive_path.with_suffix("")` should do it. Forget this comment if it doesn't work either.

vlorentz: Then `extract_to = archive_path.with_suffix("")` should do it. Forget this comment if it…

tar = tarfile.open(archive_path)

tar.extractall(path=extract_to)

tar.close()

return crates_index return extract_to

def get_last_commit_hash(self, repository_path: Path) -> str: def parse_db_dump(self, db_dump_path: Path) -> Dict[str, Any]:

"""Returns the last commit hash of a git repository""" """Parse csv files from db_dump_path.

assert repository_path.exists()

repo = Repo(str(repository_path)) Returns a dict where each entry corresponds to a package name with its related versions.

head = repo.head() """

last_commit = repo[head] csv.field_size_limit(1000000)

return last_commit.id.decode() crates_csv_path = list(db_dump_path.rglob("*crates.csv"))[0]

versions_csv_path = list(db_dump_path.rglob("*versions.csv"))[0]

vlorentzUnsubmitted

Done

csv.field_size_limit(1000000)

- crates_csv_path = list(db_dump_path.rglob("*crates.csv"))[0]

- versions_csv_path = list(db_dump_path.rglob("*versions.csv"))[0]

+ (crates_csv_path,) = list(db_dump_path.glob("*crates.csv"))

+ (versions_csv_path,) = next(db_dump_path.glob("*versions.csv"))

crates_csv = csv.DictReader(crates_csv_path.open())

This doesn't need to be recursive, and it doesn't hurt to assert that there is only one file matching each pattern.

(if there are more than one, it's a bug and should be addressed)

vlorentz: doesn't need to be recursive + doesn't hurt to assert there is only one file matching each…

vlorentzUnsubmitted

Done

(oops, I didn't mean to write `next(` instead of `list(`)

vlorentz: (oops, I didn't mean to write `next(` instead of `list(`)

franckbretAuthorUnsubmitted

Done

I used rglob because the top directory of the extracted tar.gz archive is date-based, so it is different each time we download a new archive.

ipdb> tar.getmembers()
[<TarInfo '.' at 0x7f144f378e58>, <TarInfo './2022-08-08-020027' at 0x7f144d379f20>, <TarInfo './2022-08-08-020027/data' at 0x7f1446695048>, <TarInfo './2022-08-08-020027/data/crates.csv' at 0x7f14466952a0>, <TarInfo './2022-08-08-020027/data/versions.csv' at 0x7f1446695368>]

I should have added a comment about that.

franckbret: I used rglob because the top directory of the tar.gz extracted archive is date based so it is…

vlorentzUnsubmitted

Done

My bad, I misunderstood rglob. Anyway, you can use this:

(crates_csv_path,) = list(db_dump_path.glob("*/data/crates.csv"))
(versions_csv_path,) = next(db_dump_path.glob("*/data/versions.csv"))

vlorentz: My bad, I misunderstood rglob. Anyway, you can use this: ``` (crates_csv_path,) = list…

index_metadata_json_path = list(db_dump_path.rglob("*metadata.json"))[0]

with index_metadata_json_path.open("rb") as index_metadata_json:

vlorentzUnsubmitted

Done

Use `with crates_csv_path.open() as fd` etc. so we don't rely on CPython-specific behavior to avoid leaking FDs. (Not a big deal since we currently use only CPython; I just want to be safe.)

vlorentz: Use `with crates_csv_path.open() as fd` etc. so we don't rely on CPython-specific behavior to…

self.index_metadata = json.load(index_metadata_json)

crates: Dict[str, Any] = {}

with crates_csv_path.open() as crates_fd:

crates_csv = csv.DictReader(crates_fd)

for item in crates_csv:

if self.is_new(item["updated_at"]):

# crate 'id' as key

crates[item["id"]] = {

"name": item["name"],

"updated_at": item["updated_at"],

"versions": {},

}

def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: data: Dict[str, Any] = {}

"""Given a file path within a Git repository, returns its last commit with versions_csv_path.open() as versions_fd:

date as iso8601 versions_csv = csv.DictReader(versions_fd)

""" for version in versions_csv:

repo = Repo(str(self.DESTINATION_PATH)) if version["crate_id"] in crates.keys():

# compute relative path otherwise it fails crate: Dict[str, Any] = crates[version["crate_id"]]

relative_path = filepath.relative_to(self.DESTINATION_PATH) crate["versions"][version["num"]] = version

walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1) # crate 'name' as key

try: data[crate["name"]] = crate

commit = next(iter(walker)).commit return data

except StopIteration:

logger.error(

"Can not find %s related commits in repository %s", relative_path, repo

)

return None

else:

last_update = datetime.datetime.fromtimestamp(

commit.author_time, datetime.timezone.utc

)

return last_update

def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]:

"""Transform package version definition dict to a suitable """Transform package version definition dict to a suitable

page entry dict page entry dict

""" """

crate_file = self.CRATE_FILE_URL_PATTERN.format(

crate=entry["name"], version=entry["version"]

)

filename = urlparse(crate_file).path.split("/")[-1]

return dict( return dict(

name=entry["name"], name=entry["name"],

version=entry["vers"], version=entry["version"],

checksum=entry["cksum"], checksum=entry["checksum"],

yanked=entry["yanked"], yanked=True if entry["yanked"] == "t" else False,

crate_file=self.CRATE_FILE_URL_PATTERN.format( crate_file=crate_file,

crate=entry["name"], version=entry["vers"] filename=filename,

), last_update=entry["updated_at"],

) )

def get_pages(self) -> Iterator[CratesListerPage]: def get_pages(self) -> Iterator[CratesListerPage]:

"""Yield an iterator sorted by name in ascending order of pages. """Each page is a list of crate versions with:

Each page is a list of crate versions with:

- name: Name of the crate - name: Name of the crate

- version: Version - version: Version

- checksum: Checksum - checksum: Checksum

- yanked: Whether the package is yanked or not

- crate_file: Url of the crate file - crate_file: Url of the crate file

- last_update: Date of the last commit of the corresponding index - filename: File name of the crate file

file - last_update: Last update for that version

""" """

# Fetch crates.io index repository # Fetch crates.io Db dump

self.get_index_repository() db_dump_path = self.get_db_dump()

if not self.state.last_commit:

# First discovery # Parse the data, each entry from dataset will correspond to a page

# List all crates files from the index repository dataset = self.parse_db_dump(db_dump_path)

crates_index = self.get_crates_index()

else: logger.debug("Found %s crates in crates_index", len(dataset))

# Incremental case

# Get new package version by parsing a range of commits from index repository for name, item in dataset.items():

repo = Repo(str(self.DESTINATION_PATH))

head = repo[repo.head()]

last = repo[self.state.last_commit.encode()]

outstream = io.BytesIO()

write_tree_diff(outstream, repo.object_store, last.tree, head.tree)

raw_diff = outstream.getvalue()

crates_index = []

for line in raw_diff.splitlines():

if line.startswith(b"+++ b/"):

filepath = line.split(b"+++ b/", 1)[1]

crates_index.append(self.DESTINATION_PATH / filepath.decode())

crates_index = sorted(crates_index)

logger.debug("Found %s crates in crates_index", len(crates_index))

# Each line of a crate file is a json entry describing released versions

# for a package

for crate in crates_index:

page = [] page = []

last_update = self.get_last_update_by_file(crate) # sort crate versions

versions: list = sorted(item["versions"].keys(), key=parse_version)

for version in versions:

v = item["versions"][version]

v["name"] = name

v["version"] = version

page.append(self.page_entry_dict(v))

with crate.open("rb") as current_file:

for line in current_file:

data = json.loads(line)

entry = self.page_entry_dict(data)

entry["last_update"] = last_update

page.append(entry)

yield page yield page

def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:

"""Iterate on all crate pages and yield ListedOrigin instances.""" """Iterate on all crate pages and yield ListedOrigin instances."""

assert self.lister_obj.id is not None assert self.lister_obj.id is not None

url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"])

anlambertUnsubmitted

Done

url = self.CRATE_ORIGIN_URL_PATTERN.format(crate=page[0]["name"])

anlambert: ```lang=python url = self.CRATE_ORIGIN_URL_PATTERN.format(crate=page[0]["name"]) ```

last_update = page[0]["last_update"] last_update = page[0]["last_update"]

artifacts = [] artifacts = []

crates_metadata = [] crates_metadata = []

anlambertUnsubmitted

Done

Use dicts instead of lists here in order to simplify crates loader processing.

artifacts = {}
crates_metadata = {}

anlambert: Use dicts instead of lists here in order to simplify crates loader processing. ```lang=python…

anlambertUnsubmitted

Done

Ignore this comment, I was not aware that we should use this format

anlambert: Ignore this comment, I was not aware that we should use this [format](https://docs.

franckbretAuthorUnsubmitted

Done

switched to lists

franckbret: switched to lists

for version in page: for entry in page:

filename = urlparse(version["crate_file"]).path.split("/")[-1]

# Build an artifact entry following original-artifacts-json specification # Build an artifact entry following original-artifacts-json specification

# https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950

artifact = { artifact = {

"filename": f"{filename}", f"{entry['version']}": {

"filename": entry["filename"],

"url": entry["crate_file"],

"checksums": { "checksums": {

"sha256": f"{version['checksum']}", "sha256": entry["checksum"],

}, },

"url": version["crate_file"], }

"version": version["version"],

} }

artifacts.append(artifact) artifacts.append(artifact)

data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} data = {

f"{entry['version']}": {

"yanked": entry["yanked"],

"last_update": entry["last_update"],

}

crates_metadata.append(data) crates_metadata.append(data)

anlambertUnsubmitted

Done

artifacts[f"{entry['version']}"] = {
    "filename": entry["filename"],
    "url": entry["crate_file"],
    "checksums": {
        "sha256": entry["checksum"],
    },
}

crates_metadata[f"{entry['version']}"] = {
    "yanked": entry["yanked"],
    "last_update": entry["last_update"],
}

anlambert: ```lang=python artifacts[f"{entry['version']}"] = { "filename": entry["filename"]…

anlambertUnsubmitted

Done

Ignore this comment, I was not aware that we should use this format

anlambert: Ignore this comment, I was not aware that we should use this [format](https://docs.

franckbretAuthorUnsubmitted

Done

switched to lists

franckbret: switched to lists

yield ListedOrigin( yield ListedOrigin(

lister_id=self.lister_obj.id, lister_id=self.lister_obj.id,

visit_type=self.VISIT_TYPE, visit_type=self.VISIT_TYPE,

url=url, url=url,

last_update=last_update, last_update=iso8601.parse_date(last_update),

extra_loader_arguments={ extra_loader_arguments={

"artifacts": artifacts, "artifacts": artifacts,

"crates_metadata": crates_metadata, "crates_metadata": crates_metadata,

}, },

) )

def finalize(self) -> None: def finalize(self) -> None:

last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) last: datetime = iso8601.parse_date(self.index_metadata["timestamp"])

if self.state.last_commit == last:

self.updated = False

else:

self.state.last_commit = last

self.updated = True

logger.debug("Listing crates origin completed with last commit id %s", last) if not self.state.index_last_update:

self.state.index_last_update = last

self.updated = True

# Cleanup by removing the repository directory # Cleanup by removing the db dump directory

if self.DESTINATION_PATH.exists(): if self.DESTINATION_PATH_DB_DUMP.exists():

shutil.rmtree(self.DESTINATION_PATH) shutil.rmtree(self.DESTINATION_PATH_DB_DUMP)

logger.debug( logger.debug(

"Successfully removed %s directory", str(self.DESTINATION_PATH) "Successfully removed %s directory", str(self.DESTINATION_PATH_DB_DUMP)

) )

anlambertUnsubmitted

Done

This can be removed now.

anlambert: This can be removed now.