diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 72f80f3..a77f473 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,295 +1,303 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
import tempfile
import os
from typing import Generator, Dict, Tuple, Sequence, List
from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
from swh.model.from_disk import Directory
from swh.model.identifiers import (
revision_identifier, snapshot_identifier, identifier_to_bytes
)
from swh.storage import get_storage
from swh.loader.core.converters import content_for_storage
logger = logging.getLogger(__name__)
# Not implemented yet:
# - clean up disk routines from previous killed workers (e.g. when OOM-killed)
# -> separation of concerns suggests this should be abstracted from the
#    loader code
# -> experience tells us it's complicated to do as such (T903, T964, T982,
#    etc...)
#
# - splitting into groups when too many objects are sent to storage
# -> could be a specialized collaborator, or a storage implementation or
#    proxy, which deals with this
#
# - model: swh.model.merkle.from_disk should output swh.model.model.* objects
# to avoid this layer's conversion routine call
# -> Take this up within swh.model's current implementation
#
# - Does not trap exceptions yet within the PackageLoader.load method
class PackageLoader:
# Origin visit type (str) set by the loader
visit_type = ''
def __init__(self, url):
"""Loader's constructor. This raises exception if the minimal required
configuration is missing (cf. fn:`check` method).
Args:
url (str): Origin url to load data from
"""
# This expects to use the environment variable SWH_CONFIG_FILENAME
self.config = SWHConfig.parse_config_file()
self._check_configuration()
self.storage = get_storage(**self.config['storage'])
self.url = url
def _check_configuration(self):
"""Checks the minimal configuration required is set for the loader.
If some required configuration is missing, exception detailing the
issue is raised.
"""
if 'storage' not in self.config:
raise ValueError(
'Misconfiguration, at least the storage key should be set')
def get_versions(self) -> Sequence[str]:
"""Return the list of all published package versions.
Returns:
Sequence of published versions
"""
return []
def get_artifacts(self, version: str) -> Generator[
Tuple[str, str, Dict], None, None]:
"""Given a release version of a package, retrieve the associated
artifact information for such version.
Args:
version: Package version
Returns:
(artifact filename, artifact uri, raw artifact metadata)
"""
yield from {}
def fetch_artifact_archive(
self, artifact_archive_path: str, dest: str) -> Tuple[str, Dict]:
"""Fetch artifact archive to a temporary folder and returns its
path.
Args:
artifact_archive_path: Path to artifact archive to uncompress
dest: Directory to write the downloaded archive to
Returns:
the locally retrieved artifact path
"""
return '', {}
def build_revision(
self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
"""Build the revision dict
Returns:
SWH data dict
"""
return {}
def get_default_release(self) -> str:
"""Retrieve the latest release version
Returns:
Latest version
"""
return ''
def load(self) -> Dict:
"""Load for a specific origin the associated contents.
for each package version of the origin
1. Fetch the files for one package version By default, this can be
implemented as a simple HTTP request. Loaders with more specific
requirements can override this, e.g.: the PyPI loader checks the
integrity of the downloaded files; the Debian loader has to download
and check several files for one package version.
2. Extract the downloaded files By default, this would be a universal
archive/tarball extraction.
Loaders for specific formats can override this method (for instance,
the Debian loader uses dpkg-source -x).
3. Convert the extracted directory to a set of Software Heritage
objects Using swh.model.from_disk.
4. Extract the metadata from the unpacked directories This would only
be applicable for "smart" loaders like npm (parsing the
package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing
debian/changelog and debian/control).
On "minimal-metadata" sources such as the GNU archive, the lister
should provide the minimal set of metadata needed to populate the
revision/release objects (authors, dates) as an argument to the
task.
5. Generate the revision/release objects for the given version. From
the data generated at steps 3 and 4.
end for each
6. Generate and load the snapshot for the visit
Using the revisions/releases collected at step 5., and the branch
information from step 0., generate a snapshot and load it into the
Software Heritage archive
"""
status_load = 'uneventful' # either: eventful, uneventful, failed
- status_visit = 'partial' # either: partial, full
+ status_visit = 'full' # either: partial, full
tmp_revisions: Dict[str, List] = {}
+ snapshot = None
try:
# Prepare origin and origin_visit
origin = {'url': self.url}
self.storage.origin_add([origin])
visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
visit_id = self.storage.origin_visit_add(
origin=self.url,
date=visit_date,
type=self.visit_type)['visit']
# Retrieve the default release (the "latest" one)
default_release = self.get_default_release()
logger.debug('default release: %s', default_release)
# FIXME: Add load exceptions handling
for version in self.get_versions(): # for each
logger.debug('version: %s', version)
tmp_revisions[version] = []
# `a_` stands for `artifact_`
for a_filename, a_uri, a_metadata in self.get_artifacts(
version):
with tempfile.TemporaryDirectory() as tmpdir:
- # a_c_: archive_computed_
- a_path, a_c_metadata = self.fetch_artifact_archive(
- a_uri, dest=tmpdir)
+ try:
+ # a_c_: archive_computed_
+ a_path, a_c_metadata = self.fetch_artifact_archive(
+ a_uri, dest=tmpdir)
+ except Exception as e:
+ logger.warning('Unable to retrieve %s. Reason: %s',
+ a_uri, e)
+ status_visit = 'partial'
+ continue
logger.debug('archive_path: %s', a_path)
logger.debug('archive_computed_metadata: %s',
a_c_metadata)
uncompressed_path = os.path.join(tmpdir, 'src')
uncompress(a_path, dest=uncompressed_path)
logger.debug('uncompressed_path: %s',
uncompressed_path)
directory = Directory.from_disk(
path=uncompressed_path.encode('utf-8'), data=True)
# FIXME: Try not to load the full raw content in memory
objects = directory.collect()
contents = objects['content'].values()
logger.debug('Number of contents: %s',
len(contents))
self.storage.content_add(
map(content_for_storage, contents))
status_load = 'eventful'
directories = objects['directory'].values()
logger.debug('Number of directories: %s',
len(directories))
self.storage.directory_add(directories)
# FIXME: This should be release. cf. D409 discussion
revision = self.build_revision(
a_metadata, uncompressed_path)
revision.update({
'type': 'tar',
'synthetic': True,
'directory': directory.hash,
})
revision['metadata'].update({
'original_artifact': a_metadata,
'hashes_artifact': a_c_metadata
})
revision['id'] = identifier_to_bytes(
revision_identifier(revision))
logger.debug('Revision: %s', revision)
self.storage.revision_add([revision])
tmp_revisions[version].append({
'filename': a_filename,
'target': revision['id'],
})
# Build and load the snapshot
branches = {}
for version, v_branches in tmp_revisions.items():
if len(v_branches) == 1:
branch_name = ('releases/%s' % version).encode('utf-8')
if version == default_release:
branches[b'HEAD'] = {
'target_type': 'alias',
'target': branch_name,
}
branches[branch_name] = {
'target_type': 'revision',
'target': v_branches[0]['target'],
}
else:
for x in v_branches:
branch_name = ('releases/%s/%s' % (
version, x['filename'])).encode('utf-8')
branches[branch_name] = {
'target_type': 'revision',
'target': x['target'],
}
- snapshot = {
- 'branches': branches
- }
- snapshot['id'] = identifier_to_bytes(
- snapshot_identifier(snapshot))
- self.storage.snapshot_add([snapshot])
+ if branches:
+ snapshot = {
+ 'branches': branches
+ }
+ snapshot['id'] = identifier_to_bytes(
+ snapshot_identifier(snapshot))
- # come so far, we actually reached a full visit
- status_visit = 'full'
+ logger.debug('snapshot: %s', snapshot)
+ self.storage.snapshot_add([snapshot])
# Update the visit's state
self.storage.origin_visit_update(
origin=self.url, visit_id=visit_id, status=status_visit,
snapshot=snapshot)
except Exception as e:
logger.warning('Fail to load %s. Reason: %s' % (self.url, e))
+ status_visit = 'partial'
finally:
return {'status': status_load}
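
The load() docstring above walks through the per-version algorithm and the hooks (get_versions, get_artifacts, fetch_artifact_archive, build_revision, get_default_release) that concrete loaders override. Below is a minimal, illustrative sketch of such a subclass; it is not part of this diff, and the ExampleLoader name, the example.org index layout and the requests-based download are assumptions made for the example only.

# Illustrative sketch only -- not part of this diff. Assumes a hypothetical
# example.org package index; real loaders (PyPI, Debian, npm) differ.
import os

import requests

from swh.loader.package.loader import PackageLoader


class ExampleLoader(PackageLoader):
    visit_type = 'example'

    def get_versions(self):
        # Assumed JSON index for the origin listing its published versions
        info = requests.get('%s/json' % self.url).json()
        return list(info.get('releases', {}).keys())

    def get_default_release(self):
        # Naively pick the highest version string as the "latest" release
        versions = self.get_versions()
        return max(versions) if versions else ''

    def get_artifacts(self, version):
        # Yield (artifact filename, artifact uri, raw artifact metadata)
        filename = 'pkg-%s.tar.gz' % version
        yield (filename,
               'https://example.org/dl/%s' % filename,
               {'version': version})

    def fetch_artifact_archive(self, artifact_uri, dest):
        # Plain HTTP download; loaders with integrity requirements (e.g.
        # PyPI checksums) would verify the payload here
        local_path = os.path.join(dest, os.path.basename(artifact_uri))
        response = requests.get(artifact_uri, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return local_path, {'length': os.path.getsize(local_path)}

    def build_revision(self, a_metadata, a_uncompressed_path):
        # Skeleton revision; load() fills in type, synthetic, directory and
        # id. Real loaders derive author/dates from the parsed metadata.
        return {
            'message': b'synthetic revision message',
            'author': {'fullname': b'', 'name': b'', 'email': b''},
            'committer': {'fullname': b'', 'name': b'', 'email': b''},
            'date': None,
            'committer_date': None,
            'parents': [],
            'metadata': {'intrinsic_metadata': a_metadata},
        }
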
diff --git a/swh/loader/package/tests/conftest.py b/swh/loader/package/tests/conftest.py
index 7c359b4..aff0231 100644
--- a/swh/loader/package/tests/conftest.py
+++ b/swh/loader/package/tests/conftest.py
@@ -1,99 +1,101 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import os
import re
import pytest
from functools import partial
from os import path
from urllib.parse import urlparse
from .common import DATADIR
logger = logging.getLogger(__name__)
@pytest.fixture
def swh_config(monkeypatch):
conffile = os.path.join(DATADIR, 'loader.yml')
monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile)
return conffile
def get_response_cb(request, context, ignore_urls=[]):
"""Mount point callback to fetch on disk the content of a request
This is meant to be used as 'body' argument of the requests_mock.get()
method.
It will look for files on the local filesystem based on the requested URL,
using the following rules:
- files are searched in the DATADIR/<hostname> directory
- the local file name is the path part of the URL with path hierarchy
markers (aka '/') replaced by '_'
Eg. if you use the requests_mock fixture in your test file as:
requests_mock.get('https://nowhere.com', body=get_response_cb)
# or even
requests_mock.get(re.compile('https://'), body=get_response_cb)
then a call requests.get like:
requests.get('https://nowhere.com/path/to/resource')
will look the content of the response in:
DATADIR/resources/nowhere.com/path_to_resource
Args:
request (requests.Request): Object requests
context (requests.Context): Object holding response metadata
information (status_code, headers, etc...)
ignore_urls (List): urls whose status response should be 404 even if
the local file exists
Returns:
Optional[FileDescriptor] on the on disk file to read from the test
context
"""
logger.debug('get_response_cb(%s, %s)', request, context)
- url = urlparse(request.url)
- if url in ignore_urls:
+ logger.debug('url: %s', request.url)
+ logger.debug('ignore_urls: %s', ignore_urls)
+ if request.url in ignore_urls:
context.status_code = 404
return None
+ url = urlparse(request.url)
dirname = url.hostname # pypi.org | files.pythonhosted.org
# url.path: pypi/<project>/json -> local file: pypi_<project>_json
filename = url.path[1:]
if filename.endswith('/'):
filename = filename[:-1]
filename = filename.replace('/', '_')
filepath = path.join(DATADIR, dirname, filename)
if not path.isfile(filepath):
context.status_code = 404
return None
fd = open(filepath, 'rb')
context.headers['content-length'] = str(path.getsize(filepath))
return fd
def local_get_factory(ignore_urls=[]):
@pytest.fixture
def local_get(requests_mock):
cb = partial(get_response_cb, ignore_urls=ignore_urls)
requests_mock.get(re.compile('https://'), body=cb)
return requests_mock
return local_get
local_get = local_get_factory([])
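
The get_response_cb docstring above explains how requested URLs map onto files under DATADIR/<hostname>. The following is a short usage sketch, not part of this diff, of how a test module could rely on local_get and local_get_factory; the example.org URL and the corresponding test data file name are assumptions.

# Usage sketch only -- not part of this diff. The example.org URL and the
# DATADIR/example.org/api_project_json file it maps to are assumptions.
import requests

from swh.loader.package.tests.conftest import local_get_factory

# Force a 404 for this URL even if a matching file exists under DATADIR
local_get_missing_index = local_get_factory(ignore_urls=[
    'https://example.org/api/project/json',
])


def test_project_index_served_from_disk(local_get):
    # Answered from DATADIR/example.org/api_project_json (assuming that test
    # data file exists); otherwise get_response_cb returns a 404
    response = requests.get('https://example.org/api/project/json')
    assert response.status_code == 200


def test_project_index_forced_missing(local_get_missing_index):
    # The ignore_urls entry above forces a 404 for this URL
    response = requests.get('https://example.org/api/project/json')
    assert response.status_code == 404
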
diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py
index 8dd6bf6..6adbae2 100644
--- a/swh/loader/package/tests/test_pypi.py
+++ b/swh/loader/package/tests/test_pypi.py
@@ -1,304 +1,378 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from os import path
import pytest
from swh.core.tarball import uncompress
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.pypi import (
PyPILoader, pypi_api_url, pypi_info, author, sdist_parse
)
from swh.loader.package.tests.common import DATADIR, check_snapshot
+from swh.loader.package.tests.conftest import local_get_factory
+
def test_author_basic():
data = {
'author': "i-am-groot",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot <iam@groot.org>',
'name': b'i-am-groot',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_empty_email():
data = {
'author': 'i-am-groot',
'author_email': '',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot',
'name': b'i-am-groot',
'email': b'',
}
assert actual_author == expected_author
def test_author_empty_name():
data = {
'author': "",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b' <iam@groot.org>',
'name': b'',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_malformed():
data = {
'author': "['pierre', 'paul', 'jacques']",
'author_email': None,
}
actual_author = author(data)
expected_author = {
'fullname': b"['pierre', 'paul', 'jacques']",
'name': b"['pierre', 'paul', 'jacques']",
'email': None,
}
assert actual_author == expected_author
def test_author_malformed_2():
data = {
'author': '[marie, jeanne]',
'author_email': '[marie@some, jeanne@thing]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
'name': b'[marie, jeanne]',
'email': b'[marie@some, jeanne@thing]',
}
assert actual_author == expected_author
def test_author_malformed_3():
data = {
'author': '[marie, jeanne, pierre]',
'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa
'name': b'[marie, jeanne, pierre]',
'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
}
assert actual_author == expected_author
# configuration error #
def test_badly_configured_loader_raise(monkeypatch):
"""Badly configured loader should raise"""
monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False)
with pytest.raises(ValueError) as e:
PyPILoader(url='some-url')
assert 'Misconfiguration' in e.value.args[0]
def test_pypi_api_url():
"""Compute pypi api url from the pypi project url should be ok"""
url = pypi_api_url('https://pypi.org/project/requests')
assert url == 'https://pypi.org/pypi/requests/json'
def test_pypi_info_failure(requests_mock):
"""Failure to fetch info/release information should raise"""
project_url = 'https://pypi.org/project/requests'
info_url = 'https://pypi.org/pypi/requests/json'
status_code = 400
requests_mock.get(info_url, status_code=status_code)
with pytest.raises(ValueError) as e0:
pypi_info(project_url)
assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
info_url, status_code
)
def test_pypi_info(requests_mock):
"""Fetching json info from pypi project should be ok"""
url = 'https://pypi.org/project/requests'
info_url = 'https://pypi.org/pypi/requests/json'
requests_mock.get(info_url,
text='{"version": "0.0.1"}')
actual_info = pypi_info(url)
assert actual_info == {
'version': '0.0.1',
}
@pytest.mark.fs
def test_sdist_parse(tmp_path):
"""Parsing existing archive's PKG-INFO should yield results"""
uncompressed_archive_path = str(tmp_path)
archive_path = path.join(
DATADIR, 'files.pythonhosted.org', '0805nexter-1.1.0.zip')
uncompress(archive_path, dest=uncompressed_archive_path)
actual_sdist = sdist_parse(uncompressed_archive_path)
expected_sdist = {
'metadata_version': '1.0',
'name': '0805nexter',
'version': '1.1.0',
'summary': 'a simple printer of nested lest',
'home_page': 'http://www.hp.com',
'author': 'hgtkpython',
'author_email': '2868989685@qq.com',
'platforms': ['UNKNOWN'],
}
assert actual_sdist == expected_sdist
@pytest.mark.fs
def test_sdist_parse_failures(tmp_path):
"""Parsing inexistant path/archive/PKG-INFO yield None"""
# inexistant first level path
assert sdist_parse('/something-inexistant') == {}
# inexistant second level path (as expected by pypi archives)
assert sdist_parse(tmp_path) == {}
# inexistant PKG-INFO within second level path
existing_path_no_pkginfo = str(tmp_path / 'something')
os.mkdir(existing_path_no_pkginfo)
assert sdist_parse(tmp_path) == {}
# LOADER SCENARIO #
# "edge" cases (for the same origin) #
+# no release artifact:
+# {visit full, status: uneventful, no contents, etc...}
def test_no_release_artifact(requests_mock):
pass
-# no release artifact:
-# {visit full, status: uneventful, no contents, etc...}
-
# problem during loading:
# {visit: partial, status: uneventful, no snapshot}
+
+
+
# problem during loading: failure early enough in between swh contents...
# some contents (contents, directories, etc...) have been written in storage
# {visit: partial, status: eventful, no snapshot}
# problem during loading: failure late enough we can have snapshots (some
# revisions are written in storage already)
# {visit: partial, status: eventful, snapshot}
# "normal" cases (for the same origin) #
+
+local_get_missing = local_get_factory(ignore_urls=[
+ 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa
+])
+
+# some missing release artifacts:
+# {visit partial, status: eventful, 1 snapshot}
+
+def test_release_with_missing_artifact(swh_config, local_get_missing):
+ """Load a pypi project with some missing artifacts ends up with 1 snapshot
+
+ """
+ loader = PyPILoader('https://pypi.org/project/0805nexter')
+
+ actual_load_status = loader.load()
+
+ assert actual_load_status == {'status': 'eventful'}
+
+ stats = loader.storage.stat_counters()
+ assert {
+ 'content': 3,
+ 'directory': 2,
+ 'origin': 1,
+ 'origin_visit': 1,
+ 'person': 1,
+ 'release': 0,
+ 'revision': 1,
+ 'skipped_content': 0,
+ 'snapshot': 1
+ } == stats
+
+ expected_contents = map(hash_to_bytes, [
+ '405859113963cb7a797642b45f171d6360425d16',
+ 'e5686aa568fdb1d19d7f1329267082fe40482d31',
+ '83ecf6ec1114fd260ca7a833a2d165e71258c338',
+ ])
+
+ assert list(loader.storage.content_missing_per_sha1(expected_contents))\
+ == []
+
+ expected_dirs = map(hash_to_bytes, [
+ 'b178b66bd22383d5f16f4f5c923d39ca798861b4',
+ 'c3a58f8b57433a4b56caaa5033ae2e0931405338',
+ ])
+
+ assert list(loader.storage.directory_missing(expected_dirs)) == []
+
+ # {revision hash: directory hash}
+ expected_revs = {
+ hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
+ }
+ assert list(loader.storage.revision_missing(expected_revs)) == []
+
+ expected_branches = {
+ 'releases/1.2.0': {
+ 'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
+ 'target_type': 'revision',
+ },
+ 'HEAD': {
+ 'target': 'releases/1.2.0',
+ 'target_type': 'alias',
+ },
+ }
+
+ check_snapshot(
+ 'dd0e4201a232b1c104433741dbf45895b8ac9355',
+ expected_branches,
+ storage=loader.storage)
+
+
def test_release_artifact_no_prior_visit(swh_config, local_get):
"""With no prior visit, load a pypi project ends up with 1 snapshot
"""
loader = PyPILoader('https://pypi.org/project/0805nexter')
actual_load_status = loader.load()
assert actual_load_status == {'status': 'eventful'}
stats = loader.storage.stat_counters()
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, [
'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
'938c33483285fd8ad57f15497f538320df82aeb8',
'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'05219ba38bc542d4345d5638af1ed56c7d43ca7d',
'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
check_snapshot(
'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
expected_branches,
storage=loader.storage)
# release artifact, no new artifact
# {visit full, status uneventful, same snapshot as before}
# release artifact, new artifact
# {visit full, status full, new snapshot with shared history as prior snapshot}
# release artifact, old artifact with different checksums
# {visit full, status full, new snapshot with shared history and some new
# different history}
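
The tests above call check_snapshot, which is imported from swh.loader.package.tests.common but whose body is not part of this diff. A plausible sketch of such a helper follows, under the assumption that the storage API's snapshot_get() returns a dict whose 'branches' key maps bytes branch names to {'target', 'target_type'} entries.

# Plausible sketch only -- check_snapshot itself is defined in
# swh.loader.package.tests.common and is not shown in this diff. Assumes
# storage.snapshot_get() returns {'branches': {bytes name: {'target': ...,
# 'target_type': ...}, ...}, ...}.
from swh.model.hashutil import hash_to_bytes, hash_to_hex


def check_snapshot(expected_snapshot_id, expected_branches, storage):
    """Check that the snapshot exists in storage and that its branches match
    the expected {name: {'target': ..., 'target_type': ...}} mapping."""
    snapshot = storage.snapshot_get(hash_to_bytes(expected_snapshot_id))
    assert snapshot is not None
    actual_branches = {}
    for name, branch in snapshot['branches'].items():
        if branch['target_type'] == 'alias':
            # alias targets are branch names (bytes)
            target = branch['target'].decode('utf-8')
        else:
            # revision/release/... targets are raw sha1_git bytes
            target = hash_to_hex(branch['target'])
        actual_branches[name.decode('utf-8')] = {
            'target': target,
            'target_type': branch['target_type'],
        }
    assert actual_branches == expected_branches
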