Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/packagist/lister.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
ardumont: lol
(a regexp change gone rogue ;) | |||||
Done Inline Actionslol, thanks for spotting ! anlambert: lol, thanks for spotting ! | |||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | from dataclasses import dataclass | ||||
from datetime import datetime, timezone | |||||
import logging | import logging | ||||
import random | from typing import Any, Dict, Iterator, List, Optional | ||||
from typing import Any, Dict, List, Mapping | |||||
from swh.lister.core.lister_transports import ListerOnePageApiTransport | import iso8601 | ||||
from swh.lister.core.simple_lister import SimpleLister | import requests | ||||
from swh.scheduler import utils | |||||
from .models import PackagistModel | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | |||||
from .. import USER_AGENT | |||||
from ..pattern import CredentialsType, Lister | |||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
PackagistPageType = List[str] | |||||
def compute_package_url(repo_name: str) -> str: | |||||
"""Compute Packagist package url from repo name. | |||||
""" | @dataclass | ||||
return "https://repo.packagist.org/p/%s.json" % repo_name | class PackagistListerState: | ||||
"""State of Packagist lister""" | |||||
last_listing_date: Optional[datetime] = None | |||||
Not Done Inline Actionswhen* ardumont: when*
| |||||
"""Last date when packagist lister was executed""" | |||||
class PackagistLister(ListerOnePageApiTransport, SimpleLister): | |||||
"""List packages available in the Packagist package manager. | |||||
The lister sends the request to the url present in the class | class PackagistLister(Lister[PackagistListerState, PackagistPageType]): | ||||
variable `PAGE`, to receive a list of all the package names | """ | ||||
present in the Packagist package manager. Iterates over all the | List all Packagist projects and send associated origins to scheduler. | ||||
packages and constructs the metadata url of the package from | |||||
the name of the package and creates a loading task:: | |||||
Task: | The lister queries the Packagist API, whose documentation can be found at | ||||
Type: load-packagist | https://packagist.org/apidoc. | ||||
Policy: recurring | |||||
Args: | |||||
<package_name> | |||||
<package_metadata_url> | |||||
Example:: | For each package, its metadata are retrieved using Packagist API endpoints | ||||
whose responses are served from static files, which are guaranteed to be | |||||
efficient on the Packagist side (no dynamic queries). | |||||
Furthermore, subsequent listings will send the "If-Modified-Since" HTTP | |||||
header to only retrieve packages metadata updated since the previous listing | |||||
operation in order to save bandwidth and return only origins which might have | |||||
new released versions. | |||||
""" | |||||
Task: | LISTER_NAME = "Packagist" | ||||
Type: load-packagist | PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" | ||||
Policy: recurring | PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p" | ||||
Args: | |||||
'hypejunction/hypegamemechanics' | def __init__( | ||||
'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json' | self, scheduler: SchedulerInterface, credentials: CredentialsType = None, | ||||
): | |||||
super().__init__( | |||||
scheduler=scheduler, | |||||
url=self.PACKAGIST_PACKAGES_LIST_URL, | |||||
instance="packagist", | |||||
credentials=credentials, | |||||
) | |||||
""" | self.session = requests.Session() | ||||
self.session.headers.update( | |||||
{"Accept": "application/json", "User-Agent": USER_AGENT} | |||||
) | |||||
self.listing_date = datetime.now().astimezone(tz=timezone.utc) | |||||
MODEL = PackagistModel | def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: | ||||
LISTER_NAME = "packagist" | last_listing_date = d.get("last_listing_date") | ||||
PAGE = "https://packagist.org/packages/list.json" | if last_listing_date is not None: | ||||
instance = "packagist" | d["last_listing_date"] = iso8601.parse_date(last_listing_date) | ||||
return PackagistListerState(**d) | |||||
def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]: | |||||
d: Dict[str, Optional[str]] = {"last_listing_date": None} | |||||
last_listing_date = state.last_listing_date | |||||
if last_listing_date is not None: | |||||
d["last_listing_date"] = last_listing_date.isoformat() | |||||
return d | |||||
def api_request(self, url: str) -> Any: | |||||
logger.debug("Fetching URL %s", url) | |||||
response = self.session.get(url) | |||||
if response.status_code not in (200, 304): | |||||
logger.warning( | |||||
"Unexpected HTTP status code %s on %s: %s", | |||||
response.status_code, | |||||
response.url, | |||||
response.content, | |||||
) | |||||
def __init__(self, override_config=None): | response.raise_for_status() | ||||
ListerOnePageApiTransport.__init__(self) | |||||
SimpleLister.__init__(self, override_config=override_config) | |||||
def task_dict( | # response is empty when status code is 304 | ||||
self, origin_type: str, origin_url: str, **kwargs: Mapping[str, str] | return response.json() if response.status_code == 200 else {} | ||||
) -> Dict[str, Any]: | |||||
"""Return task format dict | |||||
This is overridden from the lister_base as more information is | def get_pages(self) -> Iterator[PackagistPageType]: | ||||
needed for the ingestion task creation. | """ | ||||
Yield a single page listing all Packagist projects. | |||||
""" | |||||
yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"] | |||||
def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]: | |||||
""" | """ | ||||
return utils.create_task_dict( | Iterate on all Packagist projects and yield ListedOrigin instances. | ||||
"load-%s" % origin_type, | """ | ||||
kwargs.get("policy", "recurring"), | assert self.lister_obj.id is not None | ||||
kwargs.get("name"), | |||||
origin_url, | # save some bandwidth by only getting packages metadata updated since | ||||
retries_left=3, | # last listing | ||||
if self.state.last_listing_date is not None: | |||||
if_modified_since = self.state.last_listing_date.strftime( | |||||
"%a, %d %b %Y %H:%M:%S GMT" | |||||
) | ) | ||||
self.session.headers["If-Modified-Since"] = if_modified_since | |||||
def list_packages(self, response: Any) -> List[str]: | # to ensure origins will not be listed multiple times | ||||
"""List the actual packagist origins from the response. | origin_urls = set() | ||||
""" | for package_name in page: | ||||
response = json.loads(response.text) | try: | ||||
packages = [name for name in response["packageNames"]] | metadata = self.api_request( | ||||
logger.debug("Number of packages: %s", len(packages)) | f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json" | ||||
random.shuffle(packages) | ) | ||||
return packages | if not metadata.get("packages", {}): | ||||
# package metadata not updated since last listing | |||||
continue | |||||
if package_name not in metadata["packages"]: | |||||
# missing package metadata in response | |||||
continue | |||||
versions_info = metadata["packages"][package_name].values() | |||||
except requests.exceptions.HTTPError: | |||||
# error when getting package metadata (usually 404 when a | |||||
# package has been removed), skip it and process next package | |||||
continue | |||||
origin_url = None | |||||
visit_type = None | |||||
last_update = None | |||||
# extract origin url for package, vcs type and latest release date | |||||
for version_info in versions_info: | |||||
origin_url = version_info.get("source", {}).get("url", "") | |||||
if not origin_url: | |||||
continue | |||||
# can be git, hg or svn | |||||
visit_type = version_info.get("source", {}).get("type", "") | |||||
dist_time_str = version_info.get("time", "") | |||||
if not dist_time_str: | |||||
continue | |||||
dist_time = iso8601.parse_date(dist_time_str) | |||||
if last_update is None or dist_time > last_update: | |||||
last_update = dist_time | |||||
# skip package with already seen origin url or with missing required info | |||||
if visit_type is None or origin_url is None or origin_url in origin_urls: | |||||
continue | |||||
# bitbucket closed its mercurial hosting service, those origins can not be | |||||
# loaded into the archive anymore | |||||
if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"): | |||||
continue | |||||
def get_model_from_repo(self, repo_name: str) -> Mapping[str, str]: | origin_urls.add(origin_url) | ||||
"""Transform from repository representation to model | |||||
""" | logger.debug( | ||||
url = compute_package_url(repo_name) | "Found package %s last updated on %s", package_name, last_update | ||||
return { | ) | ||||
"uid": repo_name, | |||||
"name": repo_name, | yield ListedOrigin( | ||||
"full_name": repo_name, | lister_id=self.lister_obj.id, | ||||
"html_url": url, | url=origin_url, | ||||
"origin_url": url, | visit_type=visit_type, | ||||
"origin_type": "packagist", | last_update=last_update, | ||||
} | ) | ||||
def finalize(self) -> None: | |||||
self.state.last_listing_date = self.listing_date | |||||
self.updated = True |
lol
(a regexp change gone rogue ;)