diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 72f80f3..a77f473 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,295 +1,303 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
import tempfile
import os
from typing import Generator, Dict, Tuple, Sequence, List
from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
from swh.model.from_disk import Directory
from swh.model.identifiers import (
revision_identifier, snapshot_identifier, identifier_to_bytes
)
from swh.storage import get_storage
from swh.loader.core.converters import content_for_storage
logger = logging.getLogger(__name__)
# Not implemented yet:
# - clean up disk routines from previous killed workers (e.g. when OOM-killed)
# -> separation of concerns suggests this should be abstracted from the
#    loader code
# -> experience tells us it's complicated to do as such (T903, T964, T982,
#    etc...)
#
# - splitting into groups when too many objects are sent to storage
# -> could be a specialized collaborator, or a storage implementation or
#    proxy, which deals with this
#
# - model: swh.model.merkle.from_disk should output swh.model.model.* objects
# to avoid this layer's conversion routine call
# -> Take this up within swh.model's current implementation
#
# - Does not trap exceptions yet within the PackageLoader.load method
class PackageLoader:
# Origin visit type (str) set by the loader
visit_type = ''
def __init__(self, url):
"""Loader's constructor. This raises exception if the minimal required
configuration is missing (cf. fn:`check` method).
Args:
url (str): Origin url to load data from
"""
# This expects to use the environment variable SWH_CONFIG_FILENAME
self.config = SWHConfig.parse_config_file()
self._check_configuration()
self.storage = get_storage(**self.config['storage'])
self.url = url
def _check_configuration(self):
"""Checks the minimal configuration required is set for the loader.
If some required configuration is missing, exception detailing the
issue is raised.
"""
if 'storage' not in self.config:
raise ValueError(
'Misconfiguration, at least the storage key should be set')
def get_versions(self) -> Sequence[str]:
"""Return the list of all published package versions.
Returns:
Sequence of published versions
"""
return []
def get_artifacts(self, version: str) -> Generator[
Tuple[str, str, Dict], None, None]:
"""Given a release version of a package, retrieve the associated
artifact information for such version.
Args:
version: Package version
Returns:
(artifact filename, artifact uri, raw artifact metadata)
"""
yield from {}
def fetch_artifact_archive(
self, artifact_archive_path: str, dest: str) -> Tuple[str, Dict]:
"""Fetch artifact archive to a temporary folder and returns its
path.
Args:
artifact_archive_path: Path to artifact archive to uncompress
dest: Directory to write the downloaded archive to
Returns:
the locally retrieved artifact path
"""
return '', {}
def build_revision(
self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
"""Build the revision dict
Returns:
SWH data dict
"""
return {}
def get_default_release(self) -> str:
"""Retrieve the latest release version
Returns:
Latest version
"""
return ''
def load(self) -> Dict:
"""Load for a specific origin the associated contents.
for each package version of the origin
1. Fetch the files for one package version By default, this can be
implemented as a simple HTTP request. Loaders with more specific
requirements can override this, e.g.: the PyPI loader checks the
integrity of the downloaded files; the Debian loader has to download
and check several files for one package version.
2. Extract the downloaded files By default, this would be a universal
archive/tarball extraction.
Loaders for specific formats can override this method (for instance,
the Debian loader uses dpkg-source -x).
3. Convert the extracted directory to a set of Software Heritage
objects Using swh.model.from_disk.
4. Extract the metadata from the unpacked directories This would only
be applicable for "smart" loaders like npm (parsing the
package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing
debian/changelog and debian/control).
On "minimal-metadata" sources such as the GNU archive, the lister
should provide the minimal set of metadata needed to populate the
revision/release objects (authors, dates) as an argument to the
task.
5. Generate the revision/release objects for the given version. From
the data generated at steps 3 and 4.
end for each
6. Generate and load the snapshot for the visit
Using the revisions/releases collected at step 5., and the branch
information from step 0., generate a snapshot and load it into the
Software Heritage archive
"""
status_load = 'uneventful' # either: eventful, uneventful, failed
- status_visit = 'partial' # either: partial, full
+ status_visit = 'full' # either: partial, full
tmp_revisions: Dict[str, List] = {}
+ snapshot = None
try:
# Prepare origin and origin_visit
origin = {'url': self.url}
self.storage.origin_add([origin])
visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
visit_id = self.storage.origin_visit_add(
origin=self.url,
date=visit_date,
type=self.visit_type)['visit']
# Retrieve the default release (the "latest" one)
default_release = self.get_default_release()
logger.debug('default release: %s', default_release)
# FIXME: Add load exceptions handling
for version in self.get_versions(): # for each
logger.debug('version: %s', version)
tmp_revisions[version] = []
# `a_` stands for `artifact_`
for a_filename, a_uri, a_metadata in self.get_artifacts(
version):
with tempfile.TemporaryDirectory() as tmpdir:
- # a_c_: archive_computed_
- a_path, a_c_metadata = self.fetch_artifact_archive(
- a_uri, dest=tmpdir)
+ try:
+ # a_c_: archive_computed_
+ a_path, a_c_metadata = self.fetch_artifact_archive(
+ a_uri, dest=tmpdir)
+ except Exception as e:
+ logger.warning('Unable to retrieve %s. Reason: %s',
+ a_uri, e)
+ status_visit = 'partial'
+ continue
logger.debug('archive_path: %s', a_path)
logger.debug('archive_computed_metadata: %s',
a_c_metadata)
uncompressed_path = os.path.join(tmpdir, 'src')
uncompress(a_path, dest=uncompressed_path)
logger.debug('uncompressed_path: %s',
uncompressed_path)
directory = Directory.from_disk(
path=uncompressed_path.encode('utf-8'), data=True)
# FIXME: Try not to load the full raw content in memory
objects = directory.collect()
contents = objects['content'].values()
logger.debug('Number of contents: %s',
len(contents))
self.storage.content_add(
map(content_for_storage, contents))
status_load = 'eventful'
directories = objects['directory'].values()
logger.debug('Number of directories: %s',
len(directories))
self.storage.directory_add(directories)
# FIXME: This should be release. cf. D409 discussion
revision = self.build_revision(
a_metadata, uncompressed_path)
revision.update({
'type': 'tar',
'synthetic': True,
'directory': directory.hash,
})
revision['metadata'].update({
'original_artifact': a_metadata,
'hashes_artifact': a_c_metadata
})
revision['id'] = identifier_to_bytes(
revision_identifier(revision))
logger.debug('Revision: %s', revision)
self.storage.revision_add([revision])
tmp_revisions[version].append({
'filename': a_filename,
'target': revision['id'],
})
# Build and load the snapshot
branches = {}
for version, v_branches in tmp_revisions.items():
if len(v_branches) == 1:
branch_name = ('releases/%s' % version).encode('utf-8')
if version == default_release:
branches[b'HEAD'] = {
'target_type': 'alias',
'target': branch_name,
}
branches[branch_name] = {
'target_type': 'revision',
'target': v_branches[0]['target'],
}
else:
for x in v_branches:
branch_name = ('releases/%s/%s' % (
version, x['filename'])).encode('utf-8')
branches[branch_name] = {
'target_type': 'revision',
'target': x['target'],
}
- snapshot = {
- 'branches': branches
- }
- snapshot['id'] = identifier_to_bytes(
- snapshot_identifier(snapshot))
- self.storage.snapshot_add([snapshot])
+ if branches:
+ snapshot = {
+ 'branches': branches
+ }
+ snapshot['id'] = identifier_to_bytes(
+ snapshot_identifier(snapshot))
- # come so far, we actually reached a full visit
- status_visit = 'full'
+ logger.debug('snapshot: %s', snapshot)
+ self.storage.snapshot_add([snapshot])
# Update the visit's state
self.storage.origin_visit_update(
origin=self.url, visit_id=visit_id, status=status_visit,
snapshot=snapshot)
except Exception as e:
logger.warning('Fail to load %s. Reason: %s' % (self.url, e))
+ status_visit = 'partial'
finally:
return {'status': status_load}
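
The load() docstring above walks through the per-version algorithm and the hooks (get_versions, get_artifacts, fetch_artifact_archive, build_revision, get_default_release) that concrete loaders override. Below is a minimal, illustrative sketch of such a subclass; it is not part of this diff, and the ExampleLoader name, the example.org index layout and the requests-based download are assumptions made for the example only.

# Illustrative sketch only -- not part of this diff. Assumes a hypothetical
# example.org package index; real loaders (PyPI, Debian, npm) differ.
import os

import requests

from swh.loader.package.loader import PackageLoader


class ExampleLoader(PackageLoader):
    visit_type = 'example'

    def get_versions(self):
        # Assumed JSON index for the origin listing its published versions
        info = requests.get('%s/json' % self.url).json()
        return list(info.get('releases', {}).keys())

    def get_default_release(self):
        # Naively pick the highest version string as the "latest" release
        versions = self.get_versions()
        return max(versions) if versions else ''

    def get_artifacts(self, version):
        # Yield (artifact filename, artifact uri, raw artifact metadata)
        filename = 'pkg-%s.tar.gz' % version
        yield (filename,
               'https://example.org/dl/%s' % filename,
               {'version': version})

    def fetch_artifact_archive(self, artifact_uri, dest):
        # Plain HTTP download; loaders with integrity requirements (e.g.
        # PyPI checksums) would verify the payload here
        local_path = os.path.join(dest, os.path.basename(artifact_uri))
        response = requests.get(artifact_uri, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return local_path, {'length': os.path.getsize(local_path)}

    def build_revision(self, a_metadata, a_uncompressed_path):
        # Skeleton revision; load() fills in type, synthetic, directory and
        # id. Real loaders derive author/dates from the parsed metadata.
        return {
            'message': b'synthetic revision message',
            'author': {'fullname': b'', 'name': b'', 'email': b''},
            'committer': {'fullname': b'', 'name': b'', 'email': b''},
            'date': None,
            'committer_date': None,
            'parents': [],
            'metadata': {'intrinsic_metadata': a_metadata},
        }
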
diff --git a/swh/loader/package/tests/conftest.py b/swh/loader/package/tests/conftest.py
index 7c359b4..aff0231 100644
--- a/swh/loader/package/tests/conftest.py
+++ b/swh/loader/package/tests/conftest.py
@@ -1,99 +1,101 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import os
import re
import pytest
from functools import partial
from os import path
from urllib.parse import urlparse
from .common import DATADIR
logger = logging.getLogger(__name__)
@pytest.fixture
def swh_config(monkeypatch):
conffile = os.path.join(DATADIR, 'loader.yml')
monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile)
return conffile
def get_response_cb(request, context, ignore_urls=[]):
"""Mount point callback to fetch on disk the content of a request
This is meant to be used as 'body' argument of the requests_mock.get()
method.
It will look for files on the local filesystem based on the requested URL,
using the following rules:
- files are searched in the DATADIR/<hostname> directory
- the local file name is the path part of the URL with path hierarchy
markers (aka '/') replaced by '_'
Eg. if you use the requests_mock fixture in your test file as:
requests_mock.get('https://nowhere.com', body=get_response_cb)
# or even
requests_mock.get(re.compile('https://'), body=get_response_cb)
then a call requests.get like:
requests.get('https://nowhere.com/path/to/resource')
will look the content of the response in:
DATADIR/resources/nowhere.com/path_to_resource
Args:
request (requests.Request): Object requests
context (requests.Context): Object holding response metadata
information (status_code, headers, etc...)
ignore_urls (List): urls whose status response should be 404 even if
the local file exists
Returns:
Optional[FileDescriptor] on the on disk file to read from the test
context
"""
logger.debug('get_response_cb(%s, %s)', request, context)
- url = urlparse(request.url)
- if url in ignore_urls:
+ logger.debug('url: %s', request.url)
+ logger.debug('ignore_urls: %s', ignore_urls)
+ if request.url in ignore_urls:
context.status_code = 404
return None
+ url = urlparse(request.url)
dirname = url.hostname # pypi.org | files.pythonhosted.org
# url.path: pypi/<project>/json -> local file: pypi_<project>_json
filename = url.path[1:]
if filename.endswith('/'):
filename = filename[:-1]
filename = filename.replace('/', '_')
filepath = path.join(DATADIR, dirname, filename)
if not path.isfile(filepath):
context.status_code = 404
return None
fd = open(filepath, 'rb')
context.headers['content-length'] = str(path.getsize(filepath))
return fd
def local_get_factory(ignore_urls=[]):
@pytest.fixture
def local_get(requests_mock):
cb = partial(get_response_cb, ignore_urls=ignore_urls)
requests_mock.get(re.compile('https://'), body=cb)
return requests_mock
return local_get
local_get = local_get_factory([])
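
The get_response_cb docstring above explains how requested URLs map onto files under DATADIR/<hostname>. The following is a short usage sketch, not part of this diff, of how a test module could rely on local_get and local_get_factory; the example.org URL and the corresponding test data file name are assumptions.

# Usage sketch only -- not part of this diff. The example.org URL and the
# DATADIR/example.org/api_project_json file it maps to are assumptions.
import requests

from swh.loader.package.tests.conftest import local_get_factory

# Force a 404 for this URL even if a matching file exists under DATADIR
local_get_missing_index = local_get_factory(ignore_urls=[
    'https://example.org/api/project/json',
])


def test_project_index_served_from_disk(local_get):
    # Answered from DATADIR/example.org/api_project_json (assuming that test
    # data file exists); otherwise get_response_cb returns a 404
    response = requests.get('https://example.org/api/project/json')
    assert response.status_code == 200


def test_project_index_forced_missing(local_get_missing_index):
    # The ignore_urls entry above forces a 404 for this URL
    response = requests.get('https://example.org/api/project/json')
    assert response.status_code == 404
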
diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py
index 8dd6bf6..6adbae2 100644
--- a/swh/loader/package/tests/test_pypi.py
+++ b/swh/loader/package/tests/test_pypi.py
@@ -1,304 +1,378 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from os import path
import pytest
from swh.core.tarball import uncompress
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.pypi import (
PyPILoader, pypi_api_url, pypi_info, author, sdist_parse
)
from swh.loader.package.tests.common import DATADIR, check_snapshot
+from swh.loader.package.tests.conftest import local_get_factory
+
def test_author_basic():
data = {
'author': "i-am-groot",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot <iam@groot.org>',
'name': b'i-am-groot',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_empty_email():
data = {
'author': 'i-am-groot',
'author_email': '',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot',
'name': b'i-am-groot',
'email': b'',
}
assert actual_author == expected_author
def test_author_empty_name():
data = {
'author': "",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b' <iam@groot.org>',
'name': b'',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_malformed():
data = {
'author': "['pierre', 'paul', 'jacques']",
'author_email': None,
}
actual_author = author(data)
expected_author = {
'fullname': b"['pierre', 'paul', 'jacques']",
'name': b"['pierre', 'paul', 'jacques']",
'email': None,
}
assert actual_author == expected_author
def test_author_malformed_2():
data = {
'author': '[marie, jeanne]',
'author_email': '[marie@some, jeanne@thing]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
'name': b'[marie, jeanne]',
'email': b'[marie@some, jeanne@thing]',
}
assert actual_author == expected_author
def test_author_malformed_3():
data = {
'author': '[marie, jeanne, pierre]',
'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa
'name': b'[marie, jeanne, pierre]',
'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
}
assert actual_author == expected_author
# configuration error #
def test_badly_configured_loader_raise(monkeypatch):
"""Badly configured loader should raise"""
monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False)
with pytest.raises(ValueError) as e:
PyPILoader(url='some-url')
assert 'Misconfiguration' in e.value.args[0]
def test_pypi_api_url():
"""Compute pypi api url from the pypi project url should be ok"""
url = pypi_api_url('https://pypi.org/project/requests')
assert url == 'https://pypi.org/pypi/requests/json'
def test_pypi_info_failure(requests_mock):
"""Failure to fetch info/release information should raise"""
project_url = 'https://pypi.org/project/requests'
info_url = 'https://pypi.org/pypi/requests/json'
status_code = 400
requests_mock.get(info_url, status_code=status_code)
with pytest.raises(ValueError) as e0:
pypi_info(project_url)
assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
info_url, status_code
)
def test_pypi_info(requests_mock):
"""Fetching json info from pypi project should be ok"""
url = 'https://pypi.org/project/requests'
info_url = 'https://pypi.org/pypi/requests/json'
requests_mock.get(info_url,
text='{"version": "0.0.1"}')
actual_info = pypi_info(url)
assert actual_info == {
'version': '0.0.1',
}
@pytest.mark.fs
def test_sdist_parse(tmp_path):
"""Parsing existing archive's PKG-INFO should yield results"""
uncompressed_archive_path = str(tmp_path)
archive_path = path.join(
DATADIR, 'files.pythonhosted.org', '0805nexter-1.1.0.zip')
uncompress(archive_path, dest=uncompressed_archive_path)
actual_sdist = sdist_parse(uncompressed_archive_path)
expected_sdist = {
'metadata_version': '1.0',
'name': '0805nexter',
'version': '1.1.0',
'summary': 'a simple printer of nested lest',
'home_page': 'http://www.hp.com',
'author': 'hgtkpython',
'author_email': '2868989685@qq.com',
'platforms': ['UNKNOWN'],
}
assert actual_sdist == expected_sdist
@pytest.mark.fs
def test_sdist_parse_failures(tmp_path):
"""Parsing inexistant path/archive/PKG-INFO yield None"""
# inexistant first level path
assert sdist_parse('/something-inexistant') == {}
# inexistant second level path (as expected by pypi archives)
assert sdist_parse(tmp_path) == {}
# inexistant PKG-INFO within second level path
existing_path_no_pkginfo = str(tmp_path / 'something')
os.mkdir(existing_path_no_pkginfo)
assert sdist_parse(tmp_path) == {}
# LOADER SCENARIO #
# "edge" cases (for the same origin) #
+# no release artifact:
+# {visit full, status: uneventful, no contents, etc...}
def test_no_release_artifact(requests_mock):
pass
-# no release artifact:
-# {visit full, status: uneventful, no contents, etc...}
-
# problem during loading:
# {visit: partial, status: uneventful, no snapshot}
+
+
+
# problem during loading: failure early enough in between swh contents...
# some contents (contents, directories, etc...) have been written in storage
# {visit: partial, status: eventful, no snapshot}
# problem during loading: failure late enough we can have snapshots (some
# revisions are written in storage already)
# {visit: partial, status: eventful, snapshot}
# "normal" cases (for the same origin) #
+
+local_get_missing = local_get_factory(ignore_urls=[
+ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa
+])
+
+# some missing release artifacts:
+# {visit partial, status: eventful, 1 snapshot}
+
+def test_release_with_missing_artifact(swh_config, local_get_missing):
+ """Load a pypi project with some missing artifacts ends up with 1 snapshot
+
+ """
+ loader = PyPILoader('https://pypi.org/project/0805nexter')
+
+ actual_load_status = loader.load()
+
+ assert actual_load_status == {'status': 'eventful'}
+
+ stats = loader.storage.stat_counters()
+ assert {
+ 'content': 3,
+ 'directory': 2,
+ 'origin': 1,
+ 'origin_visit': 1,
+ 'person': 1,
+ 'release': 0,
+ 'revision': 1,
+ 'skipped_content': 0,
+ 'snapshot': 1
+ } == stats
+
+ expected_contents = map(hash_to_bytes, [
+ '405859113963cb7a797642b45f171d6360425d16',
+ 'e5686aa568fdb1d19d7f1329267082fe40482d31',
+ '83ecf6ec1114fd260ca7a833a2d165e71258c338',
+ ])
+
+ assert list(loader.storage.content_missing_per_sha1(expected_contents))\
+ == []
+
+ expected_dirs = map(hash_to_bytes, [
+ 'b178b66bd22383d5f16f4f5c923d39ca798861b4',
+ 'c3a58f8b57433a4b56caaa5033ae2e0931405338',
+ ])
+
+ assert list(loader.storage.directory_missing(expected_dirs)) == []
+
+ # {revision hash: directory hash}
+ expected_revs = {
+ hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
+ }
+ assert list(loader.storage.revision_missing(expected_revs)) == []
+
+ expected_branches = {
+ 'releases/1.2.0': {
+ 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
+ 'target_type': 'revision',
+ },
+ 'HEAD': {
+ 'target': 'releases/1.2.0',
+ 'target_type': 'alias',
+ },
+ }
+
+ check_snapshot(
+ 'dd0e4201a232b1c104433741dbf45895b8ac9355',
+ expected_branches,
+ storage=loader.storage)
+
+
def test_release_artifact_no_prior_visit(swh_config, local_get):
"""With no prior visit, load a pypi project ends up with 1 snapshot
"""
loader = PyPILoader('https://pypi.org/project/0805nexter')
actual_load_status = loader.load()
assert actual_load_status == {'status': 'eventful'}
stats = loader.storage.stat_counters()
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, [
'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
'938c33483285fd8ad57f15497f538320df82aeb8',
'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'05219ba38bc542d4345d5638af1ed56c7d43ca7d',
'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
check_snapshot(
'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
expected_branches,
storage=loader.storage)
# release artifact, no new artifact
# {visit full, status uneventful, same snapshot as before}
# release artifact, new artifact
# {visit full, status full, new snapshot with shared history as prior snapshot}
# release artifact, old artifact with different checksums
# {visit full, status full, new snapshot with shared history and some new
# different history}
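
The tests above call check_snapshot, which is imported from swh.loader.package.tests.common but whose body is not part of this diff. A plausible sketch of such a helper follows, under the assumption that the storage API's snapshot_get() returns a dict whose 'branches' key maps bytes branch names to {'target', 'target_type'} entries.

# Plausible sketch only -- check_snapshot itself is defined in
# swh.loader.package.tests.common and is not shown in this diff. Assumes
# storage.snapshot_get() returns {'branches': {bytes name: {'target': ...,
# 'target_type': ...}, ...}, ...}.
from swh.model.hashutil import hash_to_bytes, hash_to_hex


def check_snapshot(expected_snapshot_id, expected_branches, storage):
    """Check that the snapshot exists in storage and that its branches match
    the expected {name: {'target': ..., 'target_type': ...}} mapping."""
    snapshot = storage.snapshot_get(hash_to_bytes(expected_snapshot_id))
    assert snapshot is not None
    actual_branches = {}
    for name, branch in snapshot['branches'].items():
        if branch['target_type'] == 'alias':
            # alias targets are branch names (bytes)
            target = branch['target'].decode('utf-8')
        else:
            # revision/release/... targets are raw sha1_git bytes
            target = hash_to_hex(branch['target'])
        actual_branches[name.decode('utf-8')] = {
            'target': target,
            'target_type': branch['target_type'],
        }
    assert actual_branches == expected_branches
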