# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import logging
import os

import requests

from swh.core import tarball
from swh.model import hashutil

from swh.loader.npm.utils import extract_npm_package_author


class NpmClient:
    """
    Helper class internally used by the npm loader to fetch
    metadata for a specific package hosted on the npm registry.

    Args:
        temp_dir (str): path to the temporary disk location used
            to uncompress the package tarballs
    """

    def __init__(self, temp_dir, log=None):
        self.root_temp_dir = temp_dir
        self.session = requests.Session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage npm loader'
            }
        }
        self.log = log or logging

    def fetch_package_metadata(self, package_metadata_url):
        """
        Fetch metadata for a given package and make it the focused one.
        This must be called prior to any other operation performed
        by the other methods below.

        Args:
            package_metadata_url: the package metadata url provided
                by the npm loader
        """
        self.package_metadata_url = package_metadata_url
        self.package_metadata = self._request(
            self.package_metadata_url).json()
        self.package = self.package_metadata['name']
        self.temp_dir = os.path.join(self.root_temp_dir, self.package)

    def latest_package_version(self):
        """
        Return the latest released version of the focused package.

        Returns:
            str: the latest released package version
        """
        return self.package_metadata['dist-tags']['latest']
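
    # For instance, if the focused package metadata contains
    # {'dist-tags': {'latest': '3.4.1'}}, latest_package_version()
    # returns '3.4.1' (illustrative value, not real registry data).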

    def package_versions(self, known_versions=None):
        """
        Return the available versions for the focused package.

        Args:
            known_versions (dict): may be provided by the loader to
                filter out versions already ingested in the archive.

        Returns:
            dict: a dict whose keys are Tuple[version, tarball_sha1]
            and whose values are dicts with the following entries:

                * **name**: the package name
                * **version**: the package version
                * **filename**: the package source tarball filename
                * **sha1**: the package source tarball sha1 checksum
                * **date**: the package release date
                * **url**: the package source tarball download url
        """
        versions = {}
        if 'versions' in self.package_metadata:
            for version, data in self.package_metadata['versions'].items():
                sha1 = data['dist']['shasum']
                key = (version, sha1)
                if known_versions and key in known_versions:
                    continue
                tarball_url = data['dist']['tarball']
                filename = os.path.basename(tarball_url)
                date = self.package_metadata['time'][version]
                versions[key] = {
                    'name': self.package,
                    'version': version,
                    'filename': filename,
                    'sha1': sha1,
                    'date': date,
                    'url': tarball_url
                }
        return versions
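
    # Illustrative shape of a single entry returned by package_versions();
    # all values below are made up for documentation purposes, not real
    # registry data:
    #
    #   ('1.0.0', '15d0a92dee9c9d56cc89e420eb8f8d9a5faf6251'): {
    #       'name': 'example-package',
    #       'version': '1.0.0',
    #       'filename': 'example-package-1.0.0.tgz',
    #       'sha1': '15d0a92dee9c9d56cc89e420eb8f8d9a5faf6251',
    #       'date': '2019-01-01T00:00:00.000Z',
    #       'url': ('https://registry.npmjs.org/example-package/-/'
    #               'example-package-1.0.0.tgz')
    #   }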

    def prepare_package_versions(self, known_versions=None):
        """
        Instantiate a generator that will process a specific released
        version of the package at each iteration step. The following
        operations will be performed:

            1. Create a temporary directory to download and extract the
               release tarball
            2. Download the tarball
            3. Check the downloaded tarball's integrity
            4. Uncompress the tarball
            5. Parse the ``package.json`` file associated with the
               package version
            6. Extract the author from the parsed ``package.json`` file

        Args:
            known_versions (dict): may be provided by the loader to
                filter out versions already ingested in the archive.

        Yields:
            Tuple[dict, dict, dict, str]: tuples containing the following
            members:

                * a dict holding the parsed ``package.json`` file
                * a dict holding package author information
                * a dict holding package tarball information
                * a string holding the path of the uncompressed package
                  to load into the archive
        """
        new_versions = self.package_versions(known_versions)
        for version, package_source_data in sorted(new_versions.items()):
            # filter out versions with a missing tarball (such cases exist);
            # the package visit will be marked as partial at the end of
            # the loading process
            tarball_url = package_source_data['url']
            tarball_request = self._request(tarball_url, throw_error=False)
            if tarball_request.status_code == 404:
                self.log.debug('Tarball URL %s returned a 404 error.' %
                               tarball_url)
                self.log.debug(('Version %s of package %s will be missing '
                                'and the visit will be marked as partial.') %
                               (version[0], self.package))
                continue
            version_data = self.package_metadata['versions'][version[0]]
            yield self._prepare_package_version(package_source_data,
                                                version_data)

    def _prepare_package_version(self, package_source_data, version_data):
        version = version_data['version']
        self.log.debug('Processing version %s for npm package %s' %
                       (version, self.package))

        # create a temporary directory where to download and extract the
        # package tarball
        path = os.path.join(self.temp_dir, version)
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join(path, package_source_data['filename'])
        self.log.debug('Package local path: %s' % filepath)

        # download the tarball while computing its checksums on the fly
        url = package_source_data['url']
        response = self._request(url)
        hash_names = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
        h = hashutil.MultiHash(hash_names=hash_names)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=None):
                h.update(chunk)
                f.write(chunk)

        # check the tarball integrity against the sha1 checksum
        # advertised by the npm registry
        hashes = h.hexdigest()
        expected_digest = package_source_data['sha1']
        actual_digest = hashes['sha1']
        if actual_digest != expected_digest:
            raise ValueError(
                '%s %s: Checksum mismatch: %s != %s' % (
                    self.package, version, expected_digest, actual_digest))

        # uncompress the tarball
        tarball.uncompress(filepath, path)

        # do not archive the useless tarball root directory
        package_path = os.path.join(path, 'package')
        # some old packages use their name as the root directory
        if not os.path.exists(package_path):
            ver_pos = package_source_data['filename'].rfind(version)
            package_name = package_source_data['filename'][:ver_pos - 1]
            package_path = os.path.join(path, package_name)
        # fallback: archive the tarball root directory
        if not os.path.exists(package_path):
            package_path = path
        package_source_data.update(hashes)

        # parse the package.json file to add its content to the
        # revision metadata
        package_json_path = os.path.join(package_path, 'package.json')
        package_json = {}
        with open(package_json_path, 'r') as package_json_file:
            package_json = json.load(package_json_file)

        # extract the author from the parsed package.json file
        author = extract_npm_package_author(package_json)

        return (package_json, author, package_source_data, package_path)

    def _request(self, url, throw_error=True):
        response = self.session.get(url, **self.params, stream=True)
        if response.status_code != 200 and throw_error:
            raise ValueError("Failed to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response
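
# A minimal usage sketch of NpmClient, not part of the original module:
# it shows how the npm loader is expected to drive the class (fetch the
# package metadata first, then iterate over the versions to load). The
# registry URL and temporary directory below are illustrative assumptions.
if __name__ == '__main__':
    client = NpmClient(temp_dir='/tmp/swh.loader.npm')
    client.fetch_package_metadata('https://replicate.npmjs.com/jquery/')
    print('latest version: %s' % client.latest_package_version())
    for package_json, author, tarball_info, dir_path in \
            client.prepare_package_versions(known_versions=None):
        print('version %s extracted to %s' %
              (tarball_info['version'], dir_path))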
