lister.py
No OneTemporary
Actions

Size

6 KB

Subscribers

None

lister.py
View Options

	# Copyright (C) 2019-2021 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	from dataclasses import dataclass
	from datetime import datetime, timezone
	import logging
	from typing import Any, Dict, Iterator, List, Optional

	import iso8601
	import requests

	from swh.scheduler.interface import SchedulerInterface
	from swh.scheduler.model import ListedOrigin

	from .. import USER_AGENT
	from ..pattern import CredentialsType, Lister

	logger = logging.getLogger(__name__)

	PackagistPageType = List[str]


	@dataclass
	class PackagistListerState:
	"""State of Packagist lister"""

	last_listing_date: Optional[datetime] = None
	"""Last date when packagist lister was executed"""


	class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
	"""
	List all Packagist projects and send associated origins to scheduler.

	The lister queries the Packagist API, whose documentation can be found at
	https://packagist.org/apidoc.

	For each package, its metadata are retrieved using Packagist API endpoints
	whose responses are served from static files, which are guaranteed to be
	efficient on the Packagist side (no dymamic queries).
	Furthermore, subsequent listing will send the "If-Modified-Since" HTTP
	header to only retrieve packages metadata updated since the previous listing
	operation in order to save bandwidth and return only origins which might have
	new released versions.
	"""

	LISTER_NAME = "Packagist"
	PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json"
	PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p"

	def __init__(
	self,
	scheduler: SchedulerInterface,
	credentials: CredentialsType = None,
	):
	super().__init__(
	scheduler=scheduler,
	url=self.PACKAGIST_PACKAGES_LIST_URL,
	instance="packagist",
	credentials=credentials,
	)

	self.session = requests.Session()
	self.session.headers.update(
	{"Accept": "application/json", "User-Agent": USER_AGENT}
	)
	self.listing_date = datetime.now().astimezone(tz=timezone.utc)

	def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
	last_listing_date = d.get("last_listing_date")
	if last_listing_date is not None:
	d["last_listing_date"] = iso8601.parse_date(last_listing_date)
	return PackagistListerState(**d)

	def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]:
	d: Dict[str, Optional[str]] = {"last_listing_date": None}
	last_listing_date = state.last_listing_date
	if last_listing_date is not None:
	d["last_listing_date"] = last_listing_date.isoformat()
	return d

	def api_request(self, url: str) -> Any:
	logger.debug("Fetching URL %s", url)

	response = self.session.get(url)

	if response.status_code not in (200, 304):
	logger.warning(
	"Unexpected HTTP status code %s on %s: %s",
	response.status_code,
	response.url,
	response.content,
	)

	response.raise_for_status()

	# response is empty when status code is 304
	return response.json() if response.status_code == 200 else {}

	def get_pages(self) -> Iterator[PackagistPageType]:
	"""
	Yield a single page listing all Packagist projects.
	"""
	yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"]

	def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]:
	"""
	Iterate on all Packagist projects and yield ListedOrigin instances.
	"""
	assert self.lister_obj.id is not None

	# save some bandwidth by only getting packages metadata updated since
	# last listing
	if self.state.last_listing_date is not None:
	if_modified_since = self.state.last_listing_date.strftime(
	"%a, %d %b %Y %H:%M:%S GMT"
	)
	self.session.headers["If-Modified-Since"] = if_modified_since

	# to ensure origins will not be listed multiple times
	origin_urls = set()

	for package_name in page:
	try:
	metadata = self.api_request(
	f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json"
	)
	if not metadata.get("packages", {}):
	# package metadata not updated since last listing
	continue
	if package_name not in metadata["packages"]:
	# missing package metadata in response
	continue
	versions_info = metadata["packages"][package_name].values()
	except requests.exceptions.HTTPError:
	# error when getting package metadata (usually 404 when a
	# package has been removed), skip it and process next package
	continue

	origin_url = None
	visit_type = None
	last_update = None

	# extract origin url for package, vcs type and latest release date
	for version_info in versions_info:
	origin_url = version_info.get("source", {}).get("url", "")
	if not origin_url:
	continue
	# can be git, hg or svn
	visit_type = version_info.get("source", {}).get("type", "")
	dist_time_str = version_info.get("time", "")
	if not dist_time_str:
	continue
	dist_time = iso8601.parse_date(dist_time_str)
	if last_update is None or dist_time > last_update:
	last_update = dist_time

	# skip package with already seen origin url or with missing required info
	if visit_type is None or origin_url is None or origin_url in origin_urls:
	continue

	# bitbucket closed its mercurial hosting service, those origins can not be
	# loaded into the archive anymore
	if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"):
	continue

	origin_urls.add(origin_url)

	logger.debug(
	"Found package %s last updated on %s", package_name, last_update
	)

	yield ListedOrigin(
	lister_id=self.lister_obj.id,
	url=origin_url,
	visit_type=visit_type,
	last_update=last_update,
	)

	def finalize(self) -> None:
	self.state.last_listing_date = self.listing_date
	self.updated = True

File Metadata

Mime Type: text/x-python
Expires: Sat, Jun 21, 7:17 PM (2 w, 3 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3315255

lister.pyNo OneTemporaryActions

lister.pyView Options

File Metadata

Event Timeline

lister.py
No OneTemporary
Actions

lister.py
View Options